Reintroduce crawler
kearfy committed Oct 8, 2024
1 parent 22d4f3d commit 527c12e
Showing 9 changed files with 201 additions and 14 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -22,3 +22,6 @@ pnpm-debug.log*

# jetbrains setting folder
.idea/

# Local Netlify folder
.netlify
21 changes: 8 additions & 13 deletions astro.config.mjs
@@ -14,6 +14,8 @@ import { rehypeCopyCodePlugin } from './src/util/rehypeCopyCodePlugin.mjs';
import { autolinkConfig } from './src/util/rehypeHeadingsConfig';
import { rehypeNotesPlugin } from './src/util/rehypeNotesPlugin.mjs';

import sitemap from '@astrojs/sitemap';

const deployDomain = process.env.DEPLOY_DOMAIN ?? 'surrealdb.com';
const site = `https://${deployDomain}`;

@@ -23,18 +25,11 @@ export default defineConfig({
    base: '/docs',
    outDir: './dist/docs',
    trailingSlash: 'never',
    integrations: [
        mdx(),
        solidJs({ devtools: true }),
        icon(),
        tailwind({
            nesting: true,
        }),
        partytown(),
        compress({
            Image: false,
        }),
    ],
    integrations: [mdx(), solidJs({ devtools: true }), icon(), tailwind({
        nesting: true,
    }), partytown(), compress({
        Image: false,
    }), sitemap()],
    markdown: {
        remarkPlugins: [remarkCustomHeadingId],
        rehypePlugins: [
@@ -46,4 +41,4 @@
        ],
        syntaxHighlight: false,
    },
});
});
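(A note on how this change connects to the plugin below: with sitemap() registered, @astrojs/sitemap emits sitemap-index.xml and sitemap-0.xml into the configured outDir, ./dist/docs, at build time. The crawler added in this commit reads sitemap-0.xml to discover every built page.)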
Binary file modified bun.lockb
9 changes: 9 additions & 0 deletions netlify.toml
@@ -0,0 +1,9 @@
[build]
command = "bun run build"
publish = "dist"

[dev]
command = "bun run dev"

[[plugins]]
package = "/plugins/crawler"
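For readers unfamiliar with Netlify build plugins: a [[plugins]] entry whose package value is a local path (here /plugins/crawler, resolved from the repository root) points Netlify at a directory containing a manifest.yml plus the module named by main in its package.json, which exports lifecycle hooks. A minimal sketch of that shape follows; onSuccess is one of Netlify's documented hook names, but the body here is illustrative only (the real hook is in plugins/crawler/index.mjs below):

// plugins/example/index.mjs — minimal sketch of a local build plugin.
// onSuccess runs once the build has succeeded, so the publish
// directory is fully populated by the time it is called.
export async function onSuccess() {
    console.log('Build succeeded; safe to post-process dist/');
}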
4 changes: 3 additions & 1 deletion package.json
@@ -20,6 +20,7 @@
    "@ark-ui/solid": "^3.12.0",
    "@astrojs/check": "^0.9.3",
    "@astrojs/mdx": "^3.1.5",
    "@astrojs/sitemap": "^3.2.0",
    "@astrojs/solid-js": "^4.4.1",
    "@astrojs/tailwind": "^5.1.0",
    "@codemirror/lang-sql": "^6.7.1",
@@ -73,5 +74,6 @@
    "remark-custom-heading-id": "^2.0.0",
    "sass-embedded": "^1.78.0",
    "tiny-glob": "^0.2.9"
  }
  },
  "workspaces": ["plugins/*"]
}
153 changes: 153 additions & 0 deletions plugins/crawler/index.mjs
@@ -0,0 +1,153 @@
import fs from 'node:fs';
import path from 'node:path';
import { parseStringPromise } from 'xml2js';
import { parse as parseHTML } from 'node-html-parser';
import Surreal, { RecordId, surql } from 'surrealdb';
import { cwd } from 'node:process';

export async function onSuccess() {
    // const isLocalBuild = process.env.DEPLOY_URL == 'https://0--surrealdb-docs.netlify.app';
    const applyIndexes =
        process.env.DEPLOY_PRIME_URL ===
        'https://main--surrealdb-docs.netlify.app';
    const jobDate = new Date();
    const db = new Surreal();

    db.emitter.subscribe('connected', () =>
        console.log('[DB] Connected to SurrealDB')
    );
    db.emitter.subscribe('disconnected', () =>
        console.log('[DB] Disconnected from SurrealDB')
    );
    db.emitter.subscribe('error', (e) => console.log('[DB] Error occurred', e));

    if (applyIndexes)
        await db.connect(process.env.SURREAL_ENDPOINT, {
            namespace: process.env.SURREAL_NAMESPACE,
            database: process.env.SURREAL_DATABASE,
            auth: {
                namespace: process.env.SURREAL_NAMESPACE,
                database: process.env.SURREAL_DATABASE,
                username: process.env.SURREAL_USERNAME,
                password: process.env.SURREAL_PASSWORD,
            },
        });

    const buildDir = `${cwd()}/dist`;
    const deployUrl = new URL(process.env.DEPLOY_PRIME_URL);
    const hostname = deployUrl.hostname;
    const sitemapPath = `${buildDir}/docs/sitemap-0.xml`;
    console.log(`[CW] Build dir is: "${buildDir}"`);
    console.log(`[CW] Deploy URL is: "${deployUrl}"`);
    console.log(`[CW] Sitemap path is: "${sitemapPath}"`);

    const sitemapXml = fs.readFileSync(sitemapPath, 'utf8');
    const sitemap = await parseStringPromise(sitemapXml);
    const urls = sitemap.urlset.url;
    console.log(`[CW] The sitemap contains ${urls.length} url(s)`);

    const pathnames = urls.map((url) => {
        let pathname = decodeURI(new URL(url.loc[0]).pathname);
        if (pathname.endsWith('/')) pathname = pathname.slice(0, -1);
        return pathname;
    });
    const chunkSize = 1;

    for (let i = 0; i < pathnames.length; i += chunkSize) {
        const chunk = pathnames.slice(i, i + chunkSize);
        await Promise.all(
            chunk.map(async (pathname, index) => {
                console.log(
                    `[CW] Crawling page ${index + i + 1}/${pathnames.length}: ${pathname}`
                );

                const filePath = path.join(buildDir, pathname, 'index.html');
                const fileContent = fs
                    .readFileSync(filePath, 'utf-8')
                    .replace(/\0/g, '');

                if (!fileContent)
                    throw new Error(`[CW] Failed to read file "${filePath}"`);

                const document = parseHTML(fileContent, {
                    blockTextElements: {
                        script: true,
                        style: true,
                        noscript: true,
                    },
                });

                const scrapByQuerySelector = (query, blockContent) =>
                    document
                        .querySelectorAll(query)
                        .map((el) => {
                            const block = blockContent?.(el) ?? el.textContent;
                            if (!block) return;

                            const parts = block.split(/\s+/);
                            const trimmedParts = parts.filter(Boolean); // This removes any empty strings
                            const trimmedBlock = trimmedParts.join(' ');
                            if (trimmedBlock.length > 0) return trimmedBlock;
                        })
                        .filter((a) => a);

                const title = document.querySelector('title').textContent;
                const h1 = scrapByQuerySelector('h1');
                const h2 = scrapByQuerySelector('h2');
                const h3 = scrapByQuerySelector('h3');
                const h4 = scrapByQuerySelector('h4');
                const code = scrapByQuerySelector('code', (el) =>
                    [...el.childNodes].map((el) => el.textContent).join('\n')
                );
                const content = [
                    ...scrapByQuerySelector('p,h1,h2,h3,h4,h5,h6,tr,th,td'),
                    ...code,
                ];

                if (applyIndexes && content.length > 0) {
                    const start = Date.now();
                    const subject = new RecordId('page', [hostname, pathname]);

                    console.log(`[IX] Indexing "${subject}"`);
                    await db.upsert(subject, {
                        title,
                        path: pathname,
                        hostname,
                        h1,
                        h2,
                        h3,
                        h4,
                        content,
                        code,
                        date: jobDate,
                    });

                    const elapsed = Date.now() - start;
                    console.log(`[IX] Took ${elapsed}ms to index "${subject}"`);
                } else {
                    console.log('[IX] Skipping indexing, not on prod');
                }
            })
        );
    }

    if (applyIndexes) {
        console.log('[CW] Removing stale pages');
        await db.query(
            /* surql */ `
                DELETE page WHERE
                    hostname = $hostname AND
                    (date IS NONE OR date < $jobDate)
            `,
            {
                jobDate,
                hostname: hostname,
            }
        );
    } else {
        console.log('[CW] Skipping stale page removal, not on prod');
    }

    console.log('[CW] Closing connection to SurrealDB');
    await db.close();
}
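For a sense of what the upsert above produces: each page is stored under a composite record id of the form page:[hostname, path]. A hedged sketch of reading one back with the same SDK — the endpoint variables match the plugin above, the hostname/path pair is invented for illustration, and the anonymous connection works because the schema below grants FOR select FULL:

import Surreal, { RecordId } from 'surrealdb';

const db = new Surreal();
await db.connect(process.env.SURREAL_ENDPOINT, {
    namespace: process.env.SURREAL_NAMESPACE,
    database: process.env.SURREAL_DATABASE,
});

// ['surrealdb.com', '/docs/surrealdb'] is a hypothetical hostname/path pair.
const page = await db.select(new RecordId('page', ['surrealdb.com', '/docs/surrealdb']));
console.log(page.title, page.content.length);

await db.close();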
3 changes: 3 additions & 0 deletions plugins/crawler/manifest.yml
@@ -0,0 +1,3 @@
# manifest.yml

name: crawler
9 changes: 9 additions & 0 deletions plugins/crawler/package.json
@@ -0,0 +1,9 @@
{
  "name": "crawler-plugin",
  "main": "index.mjs",
  "dependencies": {
    "node-html-parser": "^6.1.11",
    "surrealdb": "^1.0.6",
    "xml2js": "^0.6.2"
  }
}
13 changes: 13 additions & 0 deletions plugins/crawler/schema.surql
@@ -0,0 +1,13 @@
DEFINE TABLE OVERWRITE page SCHEMALESS PERMISSIONS FOR select FULL;
DEFINE ANALYZER OVERWRITE simple TOKENIZERS blank,class,camel,punct FILTERS snowball(english);
DEFINE INDEX OVERWRITE page_hostname ON page FIELDS hostname;
DEFINE INDEX OVERWRITE page_date_indexed ON page FIELDS date;
DEFINE INDEX OVERWRITE unique_page ON page FIELDS hostname, path UNIQUE;
DEFINE INDEX OVERWRITE page_title ON page FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_path ON page FIELDS path SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_h1 ON page FIELDS h1 SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_h2 ON page FIELDS h2 SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_h3 ON page FIELDS h3 SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_h4 ON page FIELDS h4 SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_content ON page FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
DEFINE INDEX OVERWRITE page_code ON page FIELDS code SEARCH ANALYZER simple BM25(1.2,0.75);
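To illustrate what the BM25 search indexes above enable, here is a hedged sketch reusing the db connection from the previous example. The @1@ match operator, search::score, and search::highlight are SurrealDB's documented full-text primitives; the search term and hostname are invented:

// Full-text search over indexed pages. content @1@ $term runs through the
// page_content index; search::score(1) and search::highlight(..., 1) refer
// back to that numbered match. The highlight call is possible because
// page_content is defined with HIGHLIGHTS.
const [results] = await db.query(
    /* surql */ `
        SELECT
            path,
            title,
            search::score(1) AS score,
            search::highlight('<em>', '</em>', 1) AS snippet
        FROM page
        WHERE hostname = $hostname AND content @1@ $term
        ORDER BY score DESC
        LIMIT 10
    `,
    { hostname: 'surrealdb.com', term: 'define table' }
);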
