Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reintroduce crawler #921

Merged
merged 5 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ pnpm-debug.log*

# jetbrains setting folder
.idea/

# Local Netlify folder
.netlify
3 changes: 3 additions & 0 deletions astro.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import { rehypeCopyCodePlugin } from './src/util/rehypeCopyCodePlugin.mjs';
import { autolinkConfig } from './src/util/rehypeHeadingsConfig';
import { rehypeNotesPlugin } from './src/util/rehypeNotesPlugin.mjs';

import sitemap from '@astrojs/sitemap';

const deployDomain = process.env.DEPLOY_DOMAIN ?? 'surrealdb.com';
const site = `https://${deployDomain}`;

Expand All @@ -34,6 +36,7 @@ export default defineConfig({
compress({
Image: false,
}),
sitemap(),
],
markdown: {
remarkPlugins: [remarkCustomHeadingId],
Expand Down
Binary file modified bun.lockb
Binary file not shown.
9 changes: 9 additions & 0 deletions netlify.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Production build: Netlify runs this command and publishes the output dir.
[build]
command = "bun run build"
publish = "dist"

# Command used by `netlify dev` for local development.
[dev]
command = "bun run dev"

# Local build plugin that crawls the built site and indexes it for search
# (see plugins/crawler/index.mjs).
[[plugins]]
package = "/plugins/crawler"
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"@ark-ui/solid": "^3.12.0",
"@astrojs/check": "^0.9.3",
"@astrojs/mdx": "^3.1.5",
"@astrojs/sitemap": "^3.2.0",
"@astrojs/solid-js": "^4.4.1",
"@astrojs/tailwind": "^5.1.0",
"@codemirror/lang-sql": "^6.7.1",
Expand Down Expand Up @@ -73,5 +74,6 @@
"remark-custom-heading-id": "^2.0.0",
"sass-embedded": "^1.78.0",
"tiny-glob": "^0.2.9"
}
},
"workspaces": ["plugins/*"]
}
147 changes: 147 additions & 0 deletions plugins/crawler/index.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import fs from 'node:fs';
import path from 'node:path';
import { cwd } from 'node:process';
import { parse as parseHTML } from 'node-html-parser';
import Surreal, { RecordId, surql } from 'surrealdb';
import { parseStringPromise } from 'xml2js';

/**
 * Netlify `onSuccess` build hook: crawls the freshly built site via the
 * generated sitemap, extracts searchable text from every page, and upserts
 * each page into a SurrealDB `page` table (schema in plugins/crawler/schema.surql).
 * Records from previous runs that were not re-indexed are deleted at the end.
 *
 * Environment:
 * - DEPLOY_URL / DEPLOY_PRIME_URL — provided by Netlify (required).
 * - SURREAL_ENDPOINT / SURREAL_NAMESPACE / SURREAL_DATABASE /
 *   SURREAL_USERNAME / SURREAL_PASSWORD — required when indexing is applied.
 */
export async function onSuccess() {
  // Do not write to the index from the "0--" preview deploy;
  // every other deploy gets indexed.
  const applyIndexes =
    process.env.DEPLOY_URL !== 'https://0--surrealdb-docs.netlify.app';
  // One timestamp for the whole job: rows not (re)written with this date
  // are considered stale and removed at the end.
  const jobDate = new Date();
  const db = new Surreal();

  db.emitter.subscribe('connected', () =>
    console.log('[DB] Connected to SurrealDB')
  );
  db.emitter.subscribe('disconnected', () =>
    console.log('[DB] Disconnected from SurrealDB')
  );
  db.emitter.subscribe('error', (e) => console.log('[DB] Error occurred', e));

  if (applyIndexes)
    await db.connect(process.env.SURREAL_ENDPOINT, {
      namespace: process.env.SURREAL_NAMESPACE,
      database: process.env.SURREAL_DATABASE,
      auth: {
        namespace: process.env.SURREAL_NAMESPACE,
        database: process.env.SURREAL_DATABASE,
        username: process.env.SURREAL_USERNAME,
        password: process.env.SURREAL_PASSWORD,
      },
    });

  try {
    // Fail fast with a clear message instead of the opaque TypeError that
    // `new URL(undefined)` would throw when the env var is missing.
    if (!process.env.DEPLOY_PRIME_URL)
      throw new Error('[CW] DEPLOY_PRIME_URL is not set');

    const buildDir = `${cwd()}/dist`;
    const deployUrl = new URL(process.env.DEPLOY_PRIME_URL);
    const hostname = deployUrl.hostname;
    const sitemapPath = `${buildDir}/docs/sitemap-0.xml`;
    console.log(`[CW] Build dir is: "${buildDir}"`);
    console.log(`[CW] Deploy URL is: "${deployUrl}"`);
    console.log(`[CW] Sitemap path is: "${sitemapPath}"`);

    // The sitemap (emitted by @astrojs/sitemap) is the authoritative list
    // of pages to crawl.
    const sitemapXml = fs.readFileSync(sitemapPath, 'utf8');
    const sitemap = await parseStringPromise(sitemapXml);
    const urls = sitemap.urlset.url;
    console.log(`[CW] The sitemap contains ${urls.length} url(s)`);

    // Normalise sitemap entries to decoded pathnames without a trailing slash.
    const pathnames = urls.map((url) => {
      let pathname = decodeURI(new URL(url.loc[0]).pathname);
      if (pathname.endsWith('/')) pathname = pathname.slice(0, -1);
      return pathname;
    });
    // chunkSize of 1 crawls pages sequentially; raise it to crawl in parallel.
    const chunkSize = 1;

    for (let i = 0; i < pathnames.length; i += chunkSize) {
      const chunk = pathnames.slice(i, i + chunkSize);
      await Promise.all(
        chunk.map(async (pathname, index) => {
          console.log(
            `[CW] Crawling page ${index + i + 1}/${pathnames.length}: ${pathname}`
          );

          // Each sitemap entry maps to a statically generated index.html.
          const filePath = path.join(buildDir, pathname, 'index.html');
          const fileContent = fs
            .readFileSync(filePath, 'utf-8')
            .replace(/\0/g, ''); // strip NUL bytes before parsing/storing

          // readFileSync throws when the file is missing, so this only
          // guards against an empty (zero-byte) page.
          if (!fileContent)
            throw new Error(`[CW] Failed to read file "${filePath}"`);

          const document = parseHTML(fileContent, {
            blockTextElements: {
              script: true,
              style: true,
              noscript: true,
            },
          });

          // Collect whitespace-normalised text blocks for a CSS selector.
          // `blockContent` may override how an element's text is derived.
          const scrapByQuerySelector = (query, blockContent) =>
            document
              .querySelectorAll(query)
              .map((el) => {
                const block = blockContent?.(el) ?? el.textContent;
                if (!block) return;

                // Collapse every run of whitespace to a single space and
                // drop blocks that end up empty.
                const parts = block.split(/\s+/);
                const trimmedParts = parts.filter(Boolean);
                const trimmedBlock = trimmedParts.join(' ');
                if (trimmedBlock.length > 0) return trimmedBlock;
              })
              .filter((a) => a);

          // A page without a <title> must not crash the whole crawl.
          const title = document.querySelector('title')?.textContent ?? '';
          const h1 = scrapByQuerySelector('h1');
          const h2 = scrapByQuerySelector('h2');
          const h3 = scrapByQuerySelector('h3');
          const h4 = scrapByQuerySelector('h4');
          // For code blocks, join child nodes with newlines to keep the
          // original line structure.
          const code = scrapByQuerySelector('code', (el) =>
            [...el.childNodes].map((el) => el.textContent).join('\n')
          );
          const content = [
            ...scrapByQuerySelector('p,h1,h2,h3,h4,h5,h6,tr,th,td'),
            ...code,
          ];

          if (applyIndexes && content.length > 0) {
            const start = Date.now();
            // Composite record id keyed on (hostname, path) so deploys to
            // different hosts do not overwrite each other's records.
            const subject = new RecordId('page', [hostname, pathname]);

            console.log(`[IX] Indexing "${subject}"`);
            await db.upsert(subject, {
              title,
              path: pathname,
              hostname,
              h1,
              h2,
              h3,
              h4,
              content,
              code,
              date: jobDate,
            });

            const elapsed = Date.now() - start;
            console.log(`[IX] Took ${elapsed}ms to index "${subject}"`);
          } else {
            console.log('[IX] Skipping indexing, not on prod');
          }
        })
      );
    }

    if (applyIndexes) {
      console.log('[CW] Removing stale pages');
      // Anything on this hostname that was not re-indexed during this run
      // still carries an older date (or none) and is deleted.
      await db.query(
        surql`
          DELETE page WHERE
          hostname = ${hostname} AND
          (date IS NONE OR date < ${jobDate})
        `
      );
    } else {
      console.log('[CW] Skipping stale page removal, not on prod');
    }
  } finally {
    // Always close — even when the crawl throws — so the build process
    // does not hang on an open connection.
    console.log('[CW] Closing connection to SurrealDB');
    await db.close();
  }
}
3 changes: 3 additions & 0 deletions plugins/crawler/manifest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Netlify build-plugin manifest: registers this local plugin under the name
# "crawler" (referenced from netlify.toml as package "/plugins/crawler").

name: crawler
9 changes: 9 additions & 0 deletions plugins/crawler/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"name": "crawler-plugin",
"main": "index.mjs",
"dependencies": {
"node-html-parser": "^6.1.11",
"surrealdb": "^1.0.6",
"xml2js": "^0.6.2"
}
}
13 changes: 13 additions & 0 deletions plugins/crawler/schema.surql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Search-index schema for the crawler (populated by plugins/crawler/index.mjs).
-- SCHEMALESS: the crawler controls record shape; SELECT is public so the site
-- can query the index directly.
DEFINE TABLE OVERWRITE page SCHEMALESS PERMISSIONS FOR select FULL;
-- English full-text analyzer shared by every SEARCH index below.
DEFINE ANALYZER OVERWRITE simple TOKENIZERS blank,class,camel,punct FILTERS snowball(english);
-- Plain indexes used for filtering and for the stale-page cleanup query.
DEFINE INDEX OVERWRITE page_hostname ON page FIELDS hostname;
DEFINE INDEX OVERWRITE page_date_indexed ON page FIELDS date;
-- Exactly one record per (hostname, path) pair.
DEFINE INDEX OVERWRITE unique_page ON page FIELDS hostname, path UNIQUE;
-- BM25 full-text indexes, one per scraped field; `content` also stores
-- highlight data for search-result snippets.
DEFINE INDEX OVERWRITE page_title ON page FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_path ON page FIELDS path SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_h1 ON page FIELDS h1 SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_h2 ON page FIELDS h2 SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_h3 ON page FIELDS h3 SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_h4 ON page FIELDS h4 SEARCH ANALYZER simple BM25(1.2,0.75);
DEFINE INDEX OVERWRITE page_content ON page FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
DEFINE INDEX OVERWRITE page_code ON page FIELDS code SEARCH ANALYZER simple BM25(1.2,0.75);
2 changes: 1 addition & 1 deletion src/util/rehypeHeadingsConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@ const AnchorLinkIcon = h(
export const autolinkConfig: Options = {
properties: { class: 'anchor-link' },
behavior: 'append',
content: (heading) => [AnchorLinkIcon],
content: () => [AnchorLinkIcon],
};