From 916885a099ae29becc18aa2d7c0e6198fefd4fda Mon Sep 17 00:00:00 2001
From: Forrest
Date: Mon, 12 Nov 2018 14:51:47 -0800
Subject: [PATCH 1/3] Added sitemap.xml gen

Added sitemap.xml gen
---
 crawlsite.js | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/crawlsite.js b/crawlsite.js
index 04ac178..3d2ea2b 100644
--- a/crawlsite.js
+++ b/crawlsite.js
@@ -45,6 +45,7 @@ const URL = process.env.URL || 'https://news.polymer-project.org/';
 const SCREENSHOTS = process.argv.includes('--screenshots');
 const DEPTH = parseInt(process.env.DEPTH) || 2;
 const VIEWPORT = SCREENSHOTS ? {width: 1028, height: 800, deviceScaleFactor: 2} : null;
+const SITEMAP = process.argv.includes('--sitemap');
 const OUT_DIR = process.env.OUTDIR || `output/${slugify(URL)}`;
 
 const crawledPages = new Map();
@@ -161,6 +162,28 @@ async function crawl(browser, page, depth = 0) {
   }
 }
 
+function buildSitemap() {
+  if (SITEMAP && crawledPages) {
+    var p = "";
+    crawledPages.forEach(element => {
+      var n = "\t\t<url>\n";
+      n = n + "\t\t\t<loc>\n";
+      n = n + `\t\t\t\t${element.url}\n`;
+      n = n + "\t\t\t</loc>\n";
+      n = n + "\t\t</url>\n";
+
+      p = p + n;
+    });
+    var sm = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+    sm = sm + "\t<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n";
+    sm = sm + p + "\t</urlset>\n";
+    const path = `./${OUT_DIR}/sitemap.xml`;
+    fs.writeFile(path, sm, function (err) {
+      if (err) throw err;
+    });
+  }
+}
+
 (async() => {
 
 mkdirSync(OUT_DIR); // create output dir if it doesn't exist.
@@ -175,6 +198,10 @@ if (VIEWPORT) {
 const root = {url: URL};
 await crawl(browser, root);
 
+if (SITEMAP) {
+  buildSitemap();
+}
+
 await util.promisify(fs.writeFile)(`./${OUT_DIR}/crawl.json`, JSON.stringify(root, null, ' '));
 
 await browser.close();

From 3be003006eb81a1055036cb668c0b7129e601c72 Mon Sep 17 00:00:00 2001
From: forrest321
Date: Sat, 21 Sep 2019 12:19:53 -0500
Subject: [PATCH 2/3] added sitemap npm package

---
 crawlsite.js | 31 +++++++++++++------------------
 package.json |  1 +
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/crawlsite.js b/crawlsite.js
index 3d2ea2b..311d983 100644
--- a/crawlsite.js
+++ b/crawlsite.js
@@ -19,12 +19,14 @@
 /**
  * Discovers all the pages in site or single page app (SPA) and creates
  * a tree of the result in ./output/
...
-function buildSitemap() {
-  if (SITEMAP && crawledPages) {
-    var p = "";
-    crawledPages.forEach(element => {
-      var n = "\t\t<url>\n";
-      n = n + "\t\t\t<loc>\n";
-      n = n + `\t\t\t\t${element.url}\n`;
-      n = n + "\t\t\t</loc>\n";
-      n = n + "\t\t</url>\n";
-
-      p = p + n;
-    });
-    var sm = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-    sm = sm + "\t<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n";
-    sm = sm + p + "\t</urlset>\n";
-    const path = `./${OUT_DIR}/sitemap.xml`;
-    fs.writeFile(path, sm, function (err) {
+function buildSitemap(rootURL = "") {
+  if (SITEMAP && crawledPages) {
+    let siteMap = createSitemap({hostname: rootURL});
+    crawledPages.forEach(pg => {
+      siteMap.add({url: pg.url, title: pg.title})
+    });
+
+    let path = `./${OUT_DIR}/sitemap.xml`;
+    fs.writeFile(path, siteMap.toString(true), function (err) {
       if (err) throw err;
     });
   }

diff --git a/package.json b/package.json
index 8ad3da0..7eec7c0 100644
--- a/package.json
+++ b/package.json
@@ -29,6 +29,7 @@
     "request-promise-native": "^1.0.5",
     "resize-img": "^1.1.2",
     "sharp": "^0.20.1",
+    "sitemap": "^4.1.1",
     "ws": "^5.1.1",
     "yargs": "^11.0.0"
   }

From f42359142c814e9259e890ccf017ae3733093e09 Mon Sep 17 00:00:00 2001
From: forrest321
Date: Sat, 21 Sep 2019 12:57:08 -0500
Subject: [PATCH 3/3] error handling

---
 crawlsite.js | 102 +++++++++++++++++++++++++++------------------------
 1 file changed, 55 insertions(+), 47 deletions(-)

diff --git a/crawlsite.js b/crawlsite.js
index 311d983..5a3218d 100644
--- a/crawlsite.js
+++ b/crawlsite.js
@@ -16,26 +16,26 @@
  * @author ebidel@ (Eric Bidelman)
  */
 
- /**
- * Discovers all the pages in site or single page app (SPA) and creates
- * a tree of the result in ./output/
...
      a !== URL) // link doesn't point to start url of crawl.
     page.title = await newPage.evaluate('document.title');
-    page.children = anchors.map(url => ({url}));
+    page.children = anchors.map(url => ({ url }));
 
     if (SCREENSHOTS) {
-      const path = `./${OUT_DIR}/${slugify(page.url)}.png`;
-      let imgBuff = await newPage.screenshot({fullPage: false});
+      let path = `./${OUT_DIR}/${slugify(page.url)}.png`;
+      let imgBuff = await newPage.screenshot({ fullPage: false });
       imgBuff = await sharp(imgBuff).resize(null, 150).toBuffer(); // resize image to 150 x auto.
       util.promisify(fs.writeFile)(path, imgBuff); // async
       page.img = `data:img/png;base64,${imgBuff.toString('base64')}`;
@@ -166,40 +166,48 @@ async function crawl(browser, page, depth = 0) {
 }
 
 function buildSitemap(rootURL = "") {
-  if (SITEMAP && crawledPages) {
-    let siteMap = createSitemap({hostname: rootURL});
+  if (SITEMAP && crawledPages) {
+    let siteMap = createSitemap({ hostname: rootURL });
     crawledPages.forEach(pg => {
-      siteMap.add({url: pg.url, title: pg.title})
+      try {
+        siteMap.add({ url: pg.url, title: pg.title })
+      } catch (err) {
+        // bad url, don't add.
+      }
     });
 
     let path = `./${OUT_DIR}/sitemap.xml`;
-    fs.writeFile(path, siteMap.toString(true), function (err) {
-      if (err) throw err;
-    });
+    try {
+      fs.writeFile(path, siteMap.toString(true), function (err) {
+        if (err) throw err;
+      });
+    } catch (err) {
+      throw err;
+    }
   }
 }
 
-(async() => {
+(async () => {
 
-mkdirSync(OUT_DIR); // create output dir if it doesn't exist.
-await del([`${OUT_DIR}/*`]); // cleanup after last run.
+  mkdirSync(OUT_DIR); // create output dir if it doesn't exist.
+  await del([`${OUT_DIR}/*`]); // cleanup after last run.
 
-const browser = await puppeteer.launch();
-const page = await browser.newPage();
-if (VIEWPORT) {
-  await page.setViewport(VIEWPORT);
-}
+  const browser = await puppeteer.launch();
+  const page = await browser.newPage();
+  if (VIEWPORT) {
+    await page.setViewport(VIEWPORT);
+  }
 
-const root = {url: URL};
-await crawl(browser, root);
+  const root = { url: URL };
+  await crawl(browser, root);
 
-if (SITEMAP) {
-  buildSitemap();
-}
+  if (SITEMAP) {
+    buildSitemap(URL);
+  }
 
-await util.promisify(fs.writeFile)(`./${OUT_DIR}/crawl.json`, JSON.stringify(root, null, ' '));
+  await util.promisify(fs.writeFile)(`./${OUT_DIR}/crawl.json`, JSON.stringify(root, null, ' '));
 
-await browser.close();
+  await browser.close();
 })();
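For reference, a minimal standalone sketch of the sitemap API this series ends up using. It is not part of the patches; it assumes sitemap@^4 (where createSitemap() is still exported, as the diff above shows) and a hypothetical hard-coded page list in place of the crawler's crawledPages map:

// Sketch only: builds a sitemap.xml the way the final buildSitemap() does,
// but from a hypothetical hard-coded URL list instead of crawledPages.
const fs = require('fs');
const { createSitemap } = require('sitemap'); // sitemap@^4 export used in PATCH 2/3

const pages = ['https://example.com/', 'https://example.com/about']; // hypothetical URLs

const siteMap = createSitemap({ hostname: 'https://example.com' });
for (const url of pages) {
  try {
    siteMap.add({ url }); // mirror the try/catch PATCH 3/3 adds around add()
  } catch (err) {
    // bad url, don't add (same policy as buildSitemap()).
  }
}

fs.writeFile('./sitemap.xml', siteMap.toString(), (err) => {
  if (err) throw err;
});

With the series applied, the crawler itself writes the same file to ./output/, e.g. URL=https://example.com node crawlsite.js --sitemap (the --sitemap flag and URL env var come from PATCH 1/3).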