diff --git a/crawlsite.js b/crawlsite.js index 04ac178..5a3218d 100644 --- a/crawlsite.js +++ b/crawlsite.js @@ -16,24 +16,26 @@ * @author ebidel@ (Eric Bidelman) */ - /** - * Discovers all the pages in site or single page app (SPA) and creates - * a tree of the result in ./output/ a !== URL) // link doesn't point to start url of crawl. page.title = await newPage.evaluate('document.title'); - page.children = anchors.map(url => ({url})); + page.children = anchors.map(url => ({ url })); if (SCREENSHOTS) { - const path = `./${OUT_DIR}/${slugify(page.url)}.png`; - let imgBuff = await newPage.screenshot({fullPage: false}); + let path = `./${OUT_DIR}/${slugify(page.url)}.png`; + let imgBuff = await newPage.screenshot({ fullPage: false }); imgBuff = await sharp(imgBuff).resize(null, 150).toBuffer(); // resize image to 150 x auto. util.promisify(fs.writeFile)(path, imgBuff); // async page.img = `data:img/png;base64,${imgBuff.toString('base64')}`; @@ -161,23 +165,49 @@ async function crawl(browser, page, depth = 0) { } } -(async() => { - -mkdirSync(OUT_DIR); // create output dir if it doesn't exist. -await del([`${OUT_DIR}/*`]); // cleanup after last run. +function buildSitemap(rootURL = "") { + if (SITEMAP && crawledPages) { + let siteMap = createSitemap({ hostname: rootURL }); + crawledPages.forEach(pg => { + try { + siteMap.add({ url: pg.url, title: pg.title }); + } catch (err) { + // bad url, don't add. + } + }); -const browser = await puppeteer.launch(); -const page = await browser.newPage(); -if (VIEWPORT) { - await page.setViewport(VIEWPORT); + let path = `./${OUT_DIR}/sitemap.xml`; + try { + fs.writeFile(path, siteMap.toString(true), function (err) { + if (err) throw err; + }); + } catch (err) { + throw err; + } + } } -const root = {url: URL}; -await crawl(browser, root); +(async () => { + + mkdirSync(OUT_DIR); // create output dir if it doesn't exist. + await del([`${OUT_DIR}/*`]); // cleanup after last run.
+ + const browser = await puppeteer.launch(); + const page = await browser.newPage(); + if (VIEWPORT) { + await page.setViewport(VIEWPORT); + } + + const root = { url: URL }; + await crawl(browser, root); + + if (SITEMAP) { + buildSitemap(URL); + } -await util.promisify(fs.writeFile)(`./${OUT_DIR}/crawl.json`, JSON.stringify(root, null, ' ')); + await util.promisify(fs.writeFile)(`./${OUT_DIR}/crawl.json`, JSON.stringify(root, null, ' ')); -await browser.close(); + await browser.close(); })(); diff --git a/package.json b/package.json index bde8c05..bac07d7 100644 --- a/package.json +++ b/package.json @@ -29,6 +29,7 @@ "request-promise-native": "^1.0.5", "resize-img": "^1.1.2", "sharp": "^0.21.1", + "sitemap": "^4.1.1", "ws": "^6.1.2", "yargs": "^12.0.5" }