This repository has been archived by the owner on Nov 10, 2017. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 31
/
phantom-scrape.js
101 lines (90 loc) · 2.82 KB
/
phantom-scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
var system = require("system");
var page = require("webpage").create();
// Command-line arguments, read positionally: url, readabilityPath, userAgent.
var url = system.args[1];
var readabilityPath = system.args[2];
var userAgent = system.args[3];
// Diagnostic messages gathered while the script runs; they are attached to
// the JSON output once the page has been processed.
var consoleLogs = [];
// Prevent page js errors to break JSON output: nothing is printed here, the
// error message is recorded in consoleLogs instead of being silently dropped.
phantom.onError = page.onError = function(msg) {
  consoleLogs.push("Error: " + msg);
};
/**
 * Reports a failure as a JSON error document and asks PhantomJS to exit.
 *
 * @param {string} message - Human readable description of the failure.
 */
function exitWithError(message) {
  var failure = {
    error: {
      message: message
    }
  };
  outputJSON(failure);
  phantom.exit();
}
/**
 * Prints a value to the console as JSON, pretty-printed with a two-space
 * indent.
 *
 * @param {*} object - Value to serialize.
 */
function outputJSON(object) {
  var serialized = JSON.stringify(object, null, 2);
  console.log(serialized);
}
/**
 * Runs Readability against the current document and returns either the parsed
 * article or an error description.
 *
 * Note: This function runs within page environment (it is handed to
 * page.evaluate), so it must not rely on variables defined outside its body.
 *
 * @param {string} url - Address of the page; not used by the body.
 * @param {string} userAgent - Value copied onto a successful result.
 * @param {string} pageContent - Page markup, echoed back as `sourceHTML` when
 *   an error is reported.
 * @returns {Object} The Readability result extended with `userAgent` and
 *   `isProbablyReaderable`, or an object holding a single `error` property.
 */
function runReadability(url, userAgent, pageContent) {
  var loc = document.location;
  var origin = loc.protocol + "//" + loc.host;
  var directory = loc.pathname.substr(0, loc.pathname.lastIndexOf("/") + 1);
  var uri = {
    spec: loc.href,
    host: loc.host,
    // TODO This is incomplete, needs username/password.
    // NOTE(review): location.host normally already carries the port - confirm.
    prePath: origin,
    scheme: loc.protocol.substr(0, loc.protocol.indexOf(":")),
    pathBase: origin + directory
  };
  // Markup reported next to any error so the failure can be inspected later.
  var reportedHTML = pageContent || "Empty page content.";
  try {
    var reader = new Readability(uri, document);
    // Queried before parse(), and kept inside the try so a failure here is
    // reported like any other Readability error.
    var readerable = reader.isProbablyReaderable();
    var article = reader.parse();
    if (!article) {
      return {
        error: {
          message: "Empty result from Readability.js.",
          sourceHTML: reportedHTML
        }
      };
    }
    article.userAgent = userAgent;
    article.isProbablyReaderable = readerable;
    return article;
  } catch (err) {
    return {
      error: {
        message: err.message,
        line: err.line,
        stack: err.stack,
        sourceHTML: reportedHTML
      }
    };
  }
}
// Both positional arguments are required; a missing one is reported as a JSON
// error document.
// NOTE(review): phantom.exit() may not halt the script synchronously, so the
// statements below could still run after a missing-argument error - confirm.
if (!url) {
  exitWithError("Missing url arg.");
} else if (!readabilityPath) {
  exitWithError("Missing readabilityPath arg.");
}
// Only override the page's user agent when one was supplied.
if (userAgent) {
  page.settings.userAgent = userAgent;
}
// disable loading images as we don't use them
page.settings.loadImages = false;
// ensure we don't waste time trying to load slow/missing resources
page.settings.resourceTimeout = 5000;
// if we do timeout a slow resource, say something useful - but record it with
// the other diagnostics instead of printing it, because a stray console line
// would break the JSON output of this script
page.onResourceTimeout = function(request) {
  consoleLogs.push('Resource timeout (#' + request.id + '): ' + JSON.stringify(request));
};
// Collect console messages emitted by the page; they are attached to the
// result once the page has been processed.
page.onConsoleMessage = function(msg) {
  consoleLogs.push(msg);
};
// Load the requested page, run Readability inside it and print exactly one
// JSON document describing the outcome before exiting.
page.open(url, function(status) {
  if (status !== "success") {
    return exitWithError("Unable to access " + url);
  }
  if (!page.injectJs(readabilityPath)) {
    // Stop here: falling through would still evaluate the page and print a
    // second JSON document after the error one.
    return exitWithError("Couldn't inject " + readabilityPath);
  }
  var result = page.evaluate(runReadability, url, page.settings.userAgent, page.content);
  if (!result) {
    // Without a result object there is nothing to attach logs to; report the
    // situation through the regular error channel instead of printing "null".
    return exitWithError("Empty result from page evaluation.");
  }
  // Attach the collected diagnostics next to whichever payload was produced.
  if (result.error) {
    result.error.consoleLogs = consoleLogs;
  } else if (result.content) {
    result.consoleLogs = consoleLogs;
  }
  outputJSON(result);
  phantom.exit();
});