-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.js
110 lines (95 loc) · 2.78 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
/* eslint-disable comma-dangle */
const request = require('request');
const cheerio = require('cheerio');
const URL = require('url-parse');
/**
 * Record describing one fetched page of the crawl.
 *
 * Instances carry four own properties, created in this order:
 * `parent`, `href`, `links`, `assets`. The two collections start out empty
 * and are filled in by whoever fetched the page.
 */
class Page {
  /**
   * @param {string} href URL of this page
   * @param {string} [parent] URL of the page that linked here; left
   * undefined when the caller does not supply one
   */
  constructor(href, parent) {
    // Fresh array literals per instance so pages never share a collection.
    Object.assign(this, {
      parent,
      href,
      links: [],
      assets: [],
    });
  }
}
// URLs that crawl has already started fetching; crawl consults this list so
// the same page is not requested twice.
const visitedPages = [];
// Page objects collected so far. crawl pushes into this array and returns
// this same array to its caller.
const pageList = [];
// Origin of the root URL. crawl assigns it when called without a parent, and
// getPage uses it to resolve root-relative links and to filter links/assets.
let host;
// private methods
/**
 * @private
 * @function getPage fetches a URL and builds a Page listing the same-origin
 * links plus the image and script URLs that contain the crawl host.
 *
 * The page is only populated when the request succeeds with status 200; in
 * every other case `next` still receives a Page, with empty collections.
 * Note that `page.links` and `page.assets` each receive ONE nested array
 * (the caller reads `page.links[0]`).
 *
 * @param {string} href URL to fetch
 * @param {string} [parent] URL of the page that linked to href
 * @param {function(?Error, Page): void} next called exactly once with the
 * request error (if any) and the Page built for href
 */
function getPage(href, parent, next) {
  request(href, (err, response, body) => {
    const page = new Page(href, parent);
    if (!err && response.statusCode === 200) {
      const $ = cheerio.load(body);
      // Sets de-duplicate while keeping first-seen order.
      const links = new Set();
      const assets = new Set();

      $('a').each((i, link) => {
        let followLink = $(link).attr('href');
        // Anchors without an href (e.g. <a name="top">) have nothing to follow.
        if (!followLink) return;
        // Root-relative paths are resolved against the crawl host.
        // Protocol-relative URLs ("//other.host/x") name their own host, so
        // prefixing them would fabricate a bogus same-origin link.
        if (followLink.startsWith('/') && !followLink.startsWith('//')) {
          followLink = host + followLink;
        }
        const followUrl = new URL(followLink);
        if (followUrl.origin === host) links.add(followLink);
      });

      $('img').each((i, img) => {
        const imgUrl = $(img).attr('src');
        // <img> without a src attribute yields undefined; skip it.
        if (imgUrl && imgUrl.includes(host)) assets.add(imgUrl);
      });

      $('script').each((i, script) => {
        const scriptUrl = $(script).attr('src');
        // Inline scripts have no src attribute.
        if (scriptUrl && scriptUrl.includes(host)) assets.add(scriptUrl);
      });

      // Kept as a single nested array: crawl reads page.links[0].
      page.links.push([...links]);
      page.assets.push([...assets]);
    }
    next(err, page);
  });
}
// public methods
/**
 * @public
 * @function crawl recursively crawls a site, staying on the origin of the
 * root URL.
 *
 * Fetching is asynchronous: the array is returned immediately and pages are
 * appended to it as the requests complete. The same module-level array is
 * returned by every call and is emptied whenever a new root crawl starts
 * (a call without `parent`).
 *
 * @param {string} href the next page to crawl
 * @param {number} depth how many more link levels to follow below href;
 * with 0 only href itself is fetched
 * @param {string} [parent] URL of the page that linked to href; omit it to
 * start a new crawl rooted at href
 * @returns {Page[]} the shared list of pages collected so far
 */
function crawl(href, depth, parent) {
  const url = new URL(href);
  if (parent === undefined) {
    // New root crawl: reset all module state, not just the host, so results
    // and visited URLs from an earlier crawl do not leak into this one.
    host = url.origin;
    visitedPages.length = 0;
    pageList.length = 0;
  }

  // Visited URLs are stored without a trailing slash so ".../a" and ".../a/"
  // count as the same page, matching the check made before recursing below.
  const visitKey = href.replace(/\/$/, '');

  // Compare origins instead of substring-matching the host, so a URL that
  // merely contains the host text elsewhere (e.g. in a query string) is
  // not treated as part of the site.
  if (visitedPages.includes(visitKey) || url.origin !== host) {
    return pageList;
  }
  visitedPages.push(visitKey);

  getPage(href, parent, (error, page) => {
    if (error) {
      // Report the failure; the page is still recorded, with empty collections.
      console.error('crawl error', href, error.message);
    }
    pageList.push(page);
    // getPage stores the link list as a single nested array.
    const links = page.links[0];
    console.log('depth', depth, 'href', href, 'parent', parent);
    if (links !== undefined && depth > 0) {
      links.forEach((link) => {
        if (!visitedPages.includes(link.replace(/\/$/, ''))) {
          crawl(link, depth - 1, href);
        }
      });
    }
  });

  return pageList;
}
// The crawl function is the module's only export.
module.exports = crawl;