From b90ad2d473bb21b400bb7afc3b940737b10a48df Mon Sep 17 00:00:00 2001
From: Dean Srebnik <49134864+load1n9@users.noreply.github.com>
Date: Sun, 29 Sep 2024 23:36:20 -0400
Subject: [PATCH] feat: crawler

---
 deno.jsonc          |   6 +-
 examples/crawler.ts |  30 ++++++
 ext/crawler.ts      | 239 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 274 insertions(+), 1 deletion(-)
 create mode 100644 examples/crawler.ts
 create mode 100644 ext/crawler.ts

diff --git a/deno.jsonc b/deno.jsonc
index 0eb9c6d..ea9a14f 100644
--- a/deno.jsonc
+++ b/deno.jsonc
@@ -1,7 +1,11 @@
 {
   "name": "@astral/astral",
   "version": "0.4.7",
-  "exports": "./mod.ts",
+  "exports": {
+    ".": "./mod.ts",
+    "./ext/crawler": "./ext/crawler.ts"
+  },
+
   "tasks": {
     // The task to automatically generate `./src/celestial.ts`
     "bind": "deno run -A ./bindings/_tools/generate/mod.ts && deno fmt",
diff --git a/examples/crawler.ts b/examples/crawler.ts
new file mode 100644
index 0000000..f21804f
--- /dev/null
+++ b/examples/crawler.ts
@@ -0,0 +1,30 @@
+import { Crawler } from "../ext/crawler.ts";
+
+// Create an instance of the crawler
+const crawler = new Crawler({
+  maxDepth: 2,
+  userAgent:
+    "Mozilla/5.0 (compatible; Crawler4AI/1.0; +https://yourdomain.com/bot)",
+  customHeaders: {
+    "Accept-Language": "en-US,en;q=0.9",
+  },
+  concurrency: 5,
+  outputFormat: "json",
+  mediaTags: true,
+  extractLinks: true,
+  metadata: true,
+  screenshots: true,
+  customScripts: [
+    // Custom JavaScript code as strings
+    'console.log("Custom script executed");',
+  ],
+  sessionManagement: false,
+});
+
+// URLs to crawl
+const urls = [
+  "https://deno.church",
+];
+
+// Start crawling
+await crawler.crawlUrls(urls);
diff --git a/ext/crawler.ts b/ext/crawler.ts
new file mode 100644
index 0000000..930a118
--- /dev/null
+++ b/ext/crawler.ts
@@ -0,0 +1,239 @@
+// deno-lint-ignore-file no-explicit-any
+
+import { type Browser, launch } from "../mod.ts";
+
+/**
+ * Crawls websites and extracts data such as images, videos, links, and metadata into an easy-to-use format for AI models.
+ */
+export class Crawler {
+  #browser?: Browser;
+  #maxDepth: number;
+  #visited: Set<string>;
+  #userAgent: string;
+  #customHeaders: Record<string, string>;
+  #concurrency: number;
+  #outputFormat: "json" | "html" | "markdown";
+  #mediaTags: boolean;
+  #extractLinks: boolean;
+  #metadata: boolean;
+  #screenshots: boolean;
+  #customScripts: string[];
+  #sessionManagement: boolean;
+
+  constructor(options: {
+    maxDepth?: number;
+    userAgent?: string;
+    customHeaders?: Record<string, string>;
+    concurrency?: number;
+    outputFormat?: "json" | "html" | "markdown";
+    mediaTags?: boolean;
+    extractLinks?: boolean;
+    metadata?: boolean;
+    screenshots?: boolean;
+    customScripts?: string[];
+    sessionManagement?: boolean;
+  } = {}) {
+    this.#maxDepth = options.maxDepth || 2;
+    this.#visited = new Set();
+    this.#userAgent = options.userAgent || "Crawler4AI";
+    this.#customHeaders = options.customHeaders || {};
+    this.#concurrency = options.concurrency || 5;
+    this.#outputFormat = options.outputFormat || "json";
+    this.#mediaTags = options.mediaTags !== undefined
+      ? options.mediaTags
+      : true;
+    this.#extractLinks = options.extractLinks !== undefined
+      ? options.extractLinks
+      : true;
+    this.#metadata = options.metadata !== undefined
+      ? options.metadata
+      : true;
+    this.#screenshots = options.screenshots || false;
+    this.#customScripts = options.customScripts || [];
+    this.#sessionManagement = options.sessionManagement || false;
+  }
+
+  /**
+   * Initialize the browser instance
+   */
+  async init() {
+    this.#browser = await launch({
+      headless: true,
+      args: this.#sessionManagement ? [] : ["--incognito"],
+    });
+  }
+
+  async close() {
+    if (this.#browser) {
+      await this.#browser.close();
+    }
+  }
+
+  /**
+   * Crawl multiple URLs
+   */
+  async crawlUrls(urls: string[]) {
+    await this.init();
+
+    const semaphore = new Semaphore(this.#concurrency);
+
+    const crawlPromises = urls.map((url) =>
+      semaphore.acquire()
+        .then(() => this.crawl(url))
+        .finally(() => semaphore.release())
+    );
+
+    await Promise.all(crawlPromises);
+    await this.close();
+  }
+
+  /**
+   * Crawl a single URL
+   */
+  async crawl(url: string, depth: number = 0) {
+    if (depth > this.#maxDepth || this.#visited.has(url)) {
+      return;
+    }
+    this.#visited.add(url);
+
+    const page = await this.#browser!.newPage(url);
+
+    // await page.setExtraHTTPHeaders(this.#customHeaders);
+    // await page.setUserAgent(this.#userAgent);
+
+    await page.evaluate((customScripts: string[]) => {
+      for (const script of customScripts) {
+        const scriptElement = document.createElement("script");
+        scriptElement.text = script;
+        document.body.appendChild(scriptElement);
+      }
+    }, { args: [this.#customScripts] });
+
+    const data = await this.extractData(page, url);
+
+    await this.saveData(url, data);
+
+    if (this.#screenshots) {
+      try {
+        await Deno.stat("screenshots");
+      } catch (error) {
+        if (error instanceof Deno.errors.NotFound) {
+          await Deno.mkdir("screenshots", { recursive: true });
+        }
+      }
+      const img = await page.screenshot({ format: "png" });
+      await Deno.writeFile(
+        `screenshots/${this.#sanitizeFilename(url)}.png`,
+        img,
+      );
+    }
+
+    if (this.#extractLinks && data.links) {
+      const crawlPromises = data.links.map((link: string) =>
+        this.crawl(link, depth + 1)
+      );
+      await Promise.all(crawlPromises);
+    }
+
+    await page.close();
+  }
+
+  /**
+   * Extract data from the page
+   */
+  async extractData(page: any, _url: string) {
+    const data = await page.evaluate(
+      (mediaTagsEnabled: boolean, metadataEnabled: boolean) => {
+        const data: any = {};
+
+        if (mediaTagsEnabled) {
+          data.images = Array.from(document.querySelectorAll("img"))
+            .map((img) => img.src);
+          data.videos = Array.from(document.querySelectorAll("video"))
+            .map((video) => video.src);
+          data.audios = Array.from(document.querySelectorAll("audio"))
+            .map((audio) => audio.src);
+        }
+
+        if (metadataEnabled) {
+          data.title = document.title;
+          data.description =
+            document.querySelector('meta[name="description"]')
+              ?.getAttribute("content") || "";
+          data.keywords =
+            document.querySelector('meta[name="keywords"]')
+              ?.getAttribute("content") || "";
+        }
+
+        data.links = Array.from(document.querySelectorAll("a[href]"))
+          .map((a) => (a as HTMLAnchorElement).href);
+
+        data.html = document.documentElement.outerHTML;
+
+        return data;
+      },
+      // Pass the flags through the { args } option, as in the evaluate call above
+      { args: [this.#mediaTags, this.#metadata] },
+    );
+
+    return data;
+  }
+
+  /**
+   * Save the data to a file
+   * The filename is sanitized and saved in the output folder
+   * The data is saved in JSON format
+   */
+  async saveData(url: string, data: any) {
+    const filename = `output/${this.#sanitizeFilename(url)}.json`;
+    try {
+      await Deno.stat("output");
+    } catch (error) {
+      if (error instanceof Deno.errors.NotFound) {
+        await Deno.mkdir("output", { recursive: true });
+      }
+    }
+    await Deno.writeTextFile(filename, JSON.stringify(data, null, 2));
+  }
+
+  #sanitizeFilename(url: string): string {
+    return url.replace(/[^a-z0-9]/gi, "_").toLowerCase();
+  }
+}
+
+/**
+ * Simple semaphore for concurrency control
+ */
+class Semaphore {
+  #tasks: Array<() => void> = [];
+  #count: number;
+
+  constructor(count: number) {
+    this.#count = count;
+  }
+
+  acquire(): Promise<void> {
+    return new Promise((resolve) => {
+      const task = () => {
+        this.#count--;
+        resolve();
+      };
+
+      if (this.#count > 0) {
+        task();
+      } else {
+        this.#tasks.push(task);
+      }
+    });
+  }
+
+  release() {
+    this.#count++;
+    if (this.#tasks.length > 0) {
+      const task = this.#tasks.shift();
+      if (task) {
+        task();
+      }
+    }
+  }
+}
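
Usage note: with the "./ext/crawler" entry added to the "exports" map in deno.jsonc, downstream projects could import the crawler from the package itself rather than by relative path. A minimal sketch, assuming the package is consumed from JSR under the name declared in deno.jsonc ("@astral/astral") and that a release containing this export has been published; the exact specifier depends on how the release ships:

// Assumes a published release that includes the "./ext/crawler" export.
import { Crawler } from "jsr:@astral/astral/ext/crawler";

const crawler = new Crawler({ maxDepth: 1, outputFormat: "json" });
await crawler.crawlUrls(["https://example.com"]);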