diff --git a/.gitignore b/.gitignore
index 81d1daaf0..05b8496cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,6 @@ pnpm-debug.log*
 
 # jetbrains setting folder
 .idea/
+
+# Local Netlify folder
+.netlify
diff --git a/astro.config.mjs b/astro.config.mjs
index c55d9dbce..71d98fe04 100644
--- a/astro.config.mjs
+++ b/astro.config.mjs
@@ -14,6 +14,8 @@ import { rehypeCopyCodePlugin } from './src/util/rehypeCopyCodePlugin.mjs';
 import { autolinkConfig } from './src/util/rehypeHeadingsConfig';
 import { rehypeNotesPlugin } from './src/util/rehypeNotesPlugin.mjs';
 
+import sitemap from '@astrojs/sitemap';
+
 const deployDomain = process.env.DEPLOY_DOMAIN ?? 'surrealdb.com';
 const site = `https://${deployDomain}`;
 
@@ -23,18 +25,11 @@ export default defineConfig({
     base: '/docs',
     outDir: './dist/docs',
     trailingSlash: 'never',
-    integrations: [
-        mdx(),
-        solidJs({ devtools: true }),
-        icon(),
-        tailwind({
-            nesting: true,
-        }),
-        partytown(),
-        compress({
-            Image: false,
-        }),
-    ],
+    integrations: [mdx(), solidJs({ devtools: true }), icon(), tailwind({
+        nesting: true,
+    }), partytown(), compress({
+        Image: false,
+    }), sitemap()],
     markdown: {
         remarkPlugins: [remarkCustomHeadingId],
         rehypePlugins: [
@@ -46,4 +41,4 @@ export default defineConfig({
         ],
         syntaxHighlight: false,
     },
-});
+});
\ No newline at end of file
diff --git a/bun.lockb b/bun.lockb
index 5bc5fc2b0..f548a1e12 100755
Binary files a/bun.lockb and b/bun.lockb differ
diff --git a/netlify.toml b/netlify.toml
new file mode 100644
index 000000000..7eefe0c85
--- /dev/null
+++ b/netlify.toml
@@ -0,0 +1,9 @@
+[build]
+command = "bun run build"
+publish = "dist"
+
+[dev]
+command = "bun run dev"
+
+[[plugins]]
+package = "/plugins/crawler"
diff --git a/package.json b/package.json
index f5e4cdf2a..194bed40a 100644
--- a/package.json
+++ b/package.json
@@ -20,6 +20,7 @@
         "@ark-ui/solid": "^3.12.0",
         "@astrojs/check": "^0.9.3",
         "@astrojs/mdx": "^3.1.5",
+        "@astrojs/sitemap": "^3.2.0",
         "@astrojs/solid-js": "^4.4.1",
         "@astrojs/tailwind": "^5.1.0",
         "@codemirror/lang-sql": "^6.7.1",
@@ -73,5 +74,6 @@
         "remark-custom-heading-id": "^2.0.0",
         "sass-embedded": "^1.78.0",
         "tiny-glob": "^0.2.9"
-    }
+    },
+    "workspaces": ["plugins/*"]
 }
diff --git a/plugins/crawler/index.mjs b/plugins/crawler/index.mjs
new file mode 100644
index 000000000..aacc6a57c
--- /dev/null
+++ b/plugins/crawler/index.mjs
@@ -0,0 +1,153 @@
+import fs from 'node:fs';
+import path from 'node:path';
+import { parseStringPromise } from 'xml2js';
+import { parse as parseHTML } from 'node-html-parser';
+import Surreal, { RecordId, surql } from 'surrealdb';
+import { cwd } from 'node:process';
+
+export async function onSuccess() {
+    // const isLocalBuild = process.env.DEPLOY_URL == 'https://0--surrealdb-docs.netlify.app';
+    const applyIndexes =
+        process.env.DEPLOY_PRIME_URL ===
+        'https://main--surrealdb-docs.netlify.app';
+    const jobDate = new Date();
+    const db = new Surreal();
+
+    db.emitter.subscribe('connected', () =>
+        console.log('[DB] Connected to SurrealDB')
+    );
+    db.emitter.subscribe('disconnected', () =>
+        console.log('[DB] Disconnected from SurrealDB')
+    );
+    db.emitter.subscribe('error', (e) => console.log('[DB] Error occurred', e));
+
+    if (applyIndexes)
+        await db.connect(process.env.SURREAL_ENDPOINT, {
+            namespace: process.env.SURREAL_NAMESPACE,
+            database: process.env.SURREAL_DATABASE,
+            auth: {
+                namespace: process.env.SURREAL_NAMESPACE,
+                database: process.env.SURREAL_DATABASE,
+                username: process.env.SURREAL_USERNAME,
+                password: process.env.SURREAL_PASSWORD,
+            },
+        });
+
+    const buildDir = `${cwd()}/dist`;
+    const deployUrl = new URL(process.env.DEPLOY_PRIME_URL);
+    const hostname = deployUrl.hostname;
+    const sitemapPath = `${buildDir}/docs/sitemap-0.xml`;
+    console.log(`[CW] Build dir is: "${buildDir}"`);
+    console.log(`[CW] Deploy URL is: "${deployUrl}"`);
+    console.log(`[CW] Sitemap path is: "${sitemapPath}"`);
+
+    const sitemapXml = fs.readFileSync(sitemapPath, 'utf8');
+    const sitemap = await parseStringPromise(sitemapXml);
+    const urls = sitemap.urlset.url;
+    console.log(`[CW] The sitemap contains ${urls.length} url(s)`);
+
+    const pathnames = urls.map((url) => {
+        let pathname = decodeURI(new URL(url.loc[0]).pathname);
+        if (pathname.endsWith('/')) pathname = pathname.slice(0, -1);
+        return pathname;
+    });
+    const chunkSize = 1;
+
+    for (let i = 0; i < pathnames.length; i += chunkSize) {
+        const chunk = pathnames.slice(i, i + chunkSize);
+        await Promise.all(
+            chunk.map(async (pathname, index) => {
+                console.log(
+                    `[CW] Crawling page ${index + i + 1}/${pathnames.length}: ${pathname}`
+                );
+
+                const filePath = path.join(buildDir, pathname, 'index.html');
+                const fileContent = fs
+                    .readFileSync(filePath, 'utf-8')
+                    .replace(/\0/g, '');
+
+                if (!fileContent)
+                    throw new Error(`[CW] Failed to read file "${filePath}"`);
+
+                const document = parseHTML(fileContent, {
+                    blockTextElements: {
+                        script: true,
+                        style: true,
+                        noscript: true,
+                    },
+                });
+
+                const scrapByQuerySelector = (query, blockContent) =>
+                    document
+                        .querySelectorAll(query)
+                        .map((el) => {
+                            const block = blockContent?.(el) ?? el.textContent;
+                            if (!block) return;
+
+                            const parts = block.split(/\s+/);
+                            const trimmedParts = parts.filter(Boolean); // This removes any empty strings
+                            const trimmedBlock = trimmedParts.join(' ');
+                            if (trimmedBlock.length > 0) return trimmedBlock;
+                        })
+                        .filter((a) => a);
+
+                const title = document.querySelector('title').textContent;
+                const h1 = scrapByQuerySelector('h1');
+                const h2 = scrapByQuerySelector('h2');
+                const h3 = scrapByQuerySelector('h3');
+                const h4 = scrapByQuerySelector('h4');
+                const code = scrapByQuerySelector('code', (el) =>
+                    [...el.childNodes].map((el) => el.textContent).join('\n')
+                );
+                const content = [
+                    ...scrapByQuerySelector('p,h1,h2,h3,h4,h5,h6,tr,th,td'),
+                    ...code,
+                ];
+
+                if (applyIndexes && content.length > 0) {
+                    const start = Date.now();
+                    const subject = new RecordId('page', [hostname, pathname]);
+
+                    console.log(`[IX] Indexing "${subject}"`);
+                    await db.upsert(subject, {
+                        title,
+                        path: pathname,
+                        hostname,
+                        h1,
+                        h2,
+                        h3,
+                        h4,
+                        content,
+                        code,
+                        date: jobDate,
+                    });
+
+                    const elapsed = Date.now() - start;
+                    console.log(`[IX] Took ${elapsed}ms to index "${subject}"`);
+                } else {
+                    console.log('[IX] Skipping indexing, not on prod');
+                }
+            })
+        );
+    }
+
+    if (applyIndexes) {
+        console.log('[CW] Removing stale pages');
+        await db.query(
+            /* surql */ `
+                DELETE page WHERE
+                    hostname = $hostname AND
+                    (date IS NONE OR date < $jobDate)
+            `,
+            {
+                jobDate,
+                hostname: hostname,
+            }
+        );
+    } else {
+        console.log('[CW] Skipping stale page removal, not on prod');
+    }
+
+    console.log('[CW] Closing connection to SurrealDB');
+    await db.close();
+}
diff --git a/plugins/crawler/manifest.yml b/plugins/crawler/manifest.yml
new file mode 100644
index 000000000..ee9fa5f01
--- /dev/null
+++ b/plugins/crawler/manifest.yml
@@ -0,0 +1,3 @@
+# manifest.yml
+
+name: crawler
diff --git a/plugins/crawler/package.json b/plugins/crawler/package.json
new file mode 100644
index 000000000..0f38a4676
--- /dev/null
+++ b/plugins/crawler/package.json
@@ -0,0 +1,9 @@
+{
+    "name": "crawler-plugin",
+    "main": "index.mjs",
+    "dependencies": {
+        "node-html-parser": "^6.1.11",
+        "surrealdb": "^1.0.6",
+        "xml2js": "^0.6.2"
+    }
+}
diff --git a/plugins/crawler/schema.surql b/plugins/crawler/schema.surql
new file mode 100644
index 000000000..248eca7ee
--- /dev/null
+++ b/plugins/crawler/schema.surql
@@ -0,0 +1,13 @@
+DEFINE TABLE OVERWRITE page SCHEMALESS PERMISSIONS FOR select FULL;
+DEFINE ANALYZER OVERWRITE simple TOKENIZERS blank,class,camel,punct FILTERS snowball(english);
+DEFINE INDEX OVERWRITE page_hostname ON page FIELDS hostname;
+DEFINE INDEX OVERWRITE page_date_indexed ON page FIELDS date;
+DEFINE INDEX OVERWRITE unique_page ON page FIELDS hostname, path UNIQUE;
+DEFINE INDEX OVERWRITE page_title ON page FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75);
+DEFINE INDEX OVERWRITE page_path ON page FIELDS path SEARCH ANALYZER simple BM25(1.2,0.75);
+DEFINE INDEX OVERWRITE page_h1 ON page FIELDS h1 SEARCH ANALYZER simple BM25(1.2,0.75);
+DEFINE INDEX OVERWRITE page_h2 ON page FIELDS h2 SEARCH ANALYZER simple BM25(1.2,0.75);
+DEFINE INDEX OVERWRITE page_h3 ON page FIELDS h3 SEARCH ANALYZER simple BM25(1.2,0.75);
+DEFINE INDEX OVERWRITE page_h4 ON page FIELDS h4 SEARCH ANALYZER simple BM25(1.2,0.75);
+DEFINE INDEX OVERWRITE page_content ON page FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
+DEFINE INDEX OVERWRITE page_code ON page FIELDS code SEARCH ANALYZER simple BM25(1.2,0.75);