Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue #6 : Should be able to fetch the next page #11

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 39 additions & 43 deletions lib/readability.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*jslint undef: true, nomen: true, eqeqeq: true, plusplus: true, newcap: true, immed: true, browser: true, devel: true, passfail: false */
/*global window: false, readConvertLinksToFootnotes: false, readStyle: false, readSize: false, readMargin: false, Typekit: false, ActiveXObject: false */

var Buffer = require('buffer').Buffer;
var Iconv = require('iconv').Iconv;
var dbg = (typeof console !== 'undefined') ? function(s) {
if (readability.debugging) {
console.log("Readability: " + s);
Expand Down Expand Up @@ -41,7 +42,9 @@ var readability = {
maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */

/**
 * Completion hook, called with document.body.innerHTML once there is no
 * further next-page link to append. This default is a no-op placeholder;
 * start() overwrites it with the caller-supplied callback before init().
 *
 * @param {string} html The processed body markup.
 */
success: function (html) {

},
/**
* All of the regular expressions in use within readability.
* Defined up here so we don't instantiate them repeatedly in loops.
Expand Down Expand Up @@ -91,7 +94,6 @@ var readability = {

/* Pull out any possible next page link first */
var nextPageLink = readability.findNextPageLink(document.body);

readability.prepDocument();

/* Build readability's DOM tree */
Expand Down Expand Up @@ -188,6 +190,8 @@ var readability = {
window.setTimeout(function() {
readability.appendNextPage(nextPageLink);
}, 500);
} else {
readability.success(document.body.innerHTML);
}

/** Smooth scrolling **/
Expand Down Expand Up @@ -1408,49 +1412,26 @@ timed(function(){
* TODO: This could likely be simplified beyond what we have here right now. There's still a bit of excess junk.
**/
xhr: function () {
if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
/*if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
return new XMLHttpRequest();
}
else {
try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { }
try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { }
try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { }
}
}*/
var request = require('request');

return false;
return request;
},

successfulRequest: function (request) {
return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText);
},

ajax: function (url, options) {
ajax: function (url, callback) {
var request = readability.xhr();

function respondToReadyState(readyState) {
if (request.readyState === 4) {
if (readability.successfulRequest(request)) {
if (options.success) { options.success(request); }
}
else {
if (options.error) { options.error(request); }
}
}
}

if (typeof options === 'undefined') { options = {}; }

request.onreadystatechange = respondToReadyState;

request.open('get', url, true);
request.setRequestHeader('Accept', 'text/html');

try {
request.send(options.postBody);
}
catch (e) {
if (options.error) { options.error(); }
}
request({url:url, 'encoding':'binary'}, callback);

return request;
},
Expand Down Expand Up @@ -1482,11 +1463,20 @@ timed(function(){
* asynchronously and load the cleaned content into the div we created for it.
**/
(function(pageUrl, thisPage) {
readability.ajax(pageUrl, {
success: function(r) {

readability.ajax(pageUrl, function(error, r, html) {
var encoding = undefined;
if(r['headers']['content-type']) {
var content_type = r['headers']['content-type'].split('=');
if(content_type.length == 2) encoding = content_type[1].toUpperCase();
}
if(encoding) {
body = new Buffer(html, 'binary');
iconv = new Iconv(encoding, 'utf8');
html = iconv.convert(body).toString('utf8');
}
r.responseText = html;
/* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
var eTag = r.getResponseHeader('ETag');
var eTag = r['headers']['ETag'];
if(eTag) {
if(eTag in readability.pageETags) {
dbg("Exact duplicate page found via ETag. Aborting.");
Expand Down Expand Up @@ -1562,8 +1552,9 @@ timed(function(){

if(nextPageLink) {
readability.appendNextPage(nextPageLink);
} else {
readability.success(document.body.innerHTML);
}
}
});
}(nextPageLink, articlePage));
},
Expand Down Expand Up @@ -1961,7 +1952,7 @@ var jsdom = require('jsdom'),
var R = readability;
var patch = {
reComma: /[\uff0c,]/, // chinese comma, too
findNextPageLink: function() {return null;},
/*findNextPageLink: function() {return null;},*/
getArticleTools: function() {return document.createElement('div');},
getArticleTitle: (function() {
var old = R.getArticleTitle;
Expand Down Expand Up @@ -2195,16 +2186,17 @@ function start(w, options, cb) {
if (options.profile) {
MyProfiler.reset();
}
readability.success = cb;

readability.init();

if (options.profile) MyProfiler.report();

if (options.removeReadabilityArtifacts) removeReadabilityArtifacts();
if (options.removeClassNames) removeClassNames();

document.body.innerHTML = '<div id="readability-content">' + document.body.innerHTML + '</div>';
//dbg('[Readability] done');
cb(document.body.innerHTML);
//cb(document.body.innerHTML);
}

var HTML5;
Expand All @@ -2225,7 +2217,11 @@ exports.parse = function parse(theHtml, url, options, callback) {
removeClassNames: true
};
options = Utils.extend({}, defaultOptions, options);

if(options.encoding && options.encoding != 'utf8') {
body = new Buffer(theHtml, 'binary');
iconv = new Iconv(options.encoding, 'utf8');
theHtml = iconv.convert(body).toString('utf8');
}
var startTime = new Date().getTime();
//dbg(html);
var html = theHtml.replace(/<script[^>]*>([\s\S]*?)<\/script>/gi, '');
Expand All @@ -2239,7 +2235,7 @@ exports.parse = function parse(theHtml, url, options, callback) {
features : {
FetchExternalResources : [],
ProcessExternalResources : false
}
},
};

function createDocWithHTMLParser() {
Expand Down Expand Up @@ -2279,7 +2275,7 @@ exports.parse = function parse(theHtml, url, options, callback) {
return callback({title: '', content: '', error: true});
}

dbg('---DOM created');
//dbg('---DOM created');

var win = doc.parentWindow;
win = win || doc.createWindow(); //for backward compatibility with jsdom <= 0.1.20
Expand Down
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@
"dependencies": {
"mjsunit.runner": ">=0.1.0",
"jsdom": ">=0.1.21",
"htmlparser": ">=1.7.3"
"htmlparser": ">=1.7.3",
"html5":">0.1",
"iconv":">=1.1.3"
},
"engines" : { "node" : ">=0.2.5" },
"directories": {
Expand Down
17 changes: 17 additions & 0 deletions test/multi-page.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/**
 * Manual smoke test for multi-page extraction.
 *
 * Downloads a multi-page article, works out the character set the server
 * declared, and passes the raw markup to readability.parse so that it can
 * follow the "next page" links. Prints the extracted title and content.
 */
var readability = require('../lib/readability'),
    request = require('request'),
    encoding = 'utf8';
var url = "http://www.washingtonpost.com/world/national-security/manhunt-details-us-mission-to-find-osama-bin-laden/2012/04/27/gIQAz5pLoT_story.html";

/* The body is requested as 'binary' so the bytes arrive untouched and
   readability.parse can re-decode them from the declared charset. */
request({url:url, 'encoding':'binary'}, function (error, response, html) {
    /* On a transport failure `response` is undefined, so it must not be
       dereferenced before the error has been checked. */
    if (error || !response) {
        console.error('Request failed: ' + (error && error.message ? error.message : error));
        return;
    }
    if (response.statusCode != 200) {
        console.error('Unexpected HTTP status: ' + response.statusCode);
        return;
    }

    /* Pick out the charset parameter specifically: splitting on '=' would
       mistake any other parameter (e.g. boundary=...) for the encoding and
       would miss the charset whenever a second parameter is present. */
    var contentType = response['headers']['content-type'];
    if (contentType) {
        var charsetMatch = /charset\s*=\s*"?([^";\s]+)/i.exec(contentType);
        if (charsetMatch) encoding = charsetMatch[1].toUpperCase();
    }

    readability.parse(html, url, {encoding:encoding}, function(result) {
        console.log(result.title, result.content);
    });
});