Skip to content

Commit

Permalink
switch to new index, perform full package installation in JavaScript
Browse files Browse the repository at this point in the history
  • Loading branch information
garrettgu10 committed Apr 16, 2024
1 parent a7783b1 commit 1400739
Show file tree
Hide file tree
Showing 10 changed files with 213 additions and 113 deletions.
10 changes: 5 additions & 5 deletions build/pyodide_bucket.bzl
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Do not edit this file by hand. See docs/pyodide.md for info on how to generate it.
# These variables are factored out here because they are being shared by the WORKSPACE files in
# both edgeworker and workerd, as well as src/pyodide/BUILD.bazel
PYODIDE_PACKAGE_BUCKET_URL = "https://pub-45d734c4145d4285b343833ee450ef38.r2.dev/20240412-experimental/"
PYODIDE_GITHUB_RELEASE_URL = "https://github.com/cloudflare/pyodide-build-scripts/releases/download/20240412-experimental/"
PYODIDE_LOCK_SHA256 = "db29ebb43fcd05cbc6fcba051ec7eb61a9a1bc4210353e29fdad57c6f9be1a5a"
PYODIDE_PACKAGES_TAR_ZIP_SHA256 = "6579f114f007ac307c55c221f1b5018e30c95a3cc45b86a334bbbfa442c1bf1b"
PYODIDE_ALL_WHEELS_ZIP_SHA256 = "f8a34a284a7bc2ffc44ae86a160423a8aaf8cbb88eca268e1ea9300a187cf3af"
PYODIDE_PACKAGE_BUCKET_URL = "https://pub-45d734c4145d4285b343833ee450ef38.r2.dev/20240415-experimental/"
PYODIDE_GITHUB_RELEASE_URL = "https://github.com/cloudflare/pyodide-build-scripts/releases/download/20240415-experimental/"
PYODIDE_LOCK_SHA256 = "67d1a24edf4f3ab2cf85c736391c04763ff722bf3aebf9ea3469d96e5f51e1da"
PYODIDE_PACKAGES_TAR_ZIP_SHA256 = "749967941204154e7ae866fe08f1216a3e5ee58ba6a3757231a5be0d9d4430f8"
PYODIDE_ALL_WHEELS_ZIP_SHA256 = "9e7c330ee93d81d0356cc2d585f217dfee58b623ad4535282baa6e82bd063eee"
3 changes: 2 additions & 1 deletion docs/pyodide.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ workerd is linked against a Pyodide lock file, which is located within an R2 buc
Once you know where the R2 bucket is (see build/pyodide_bucket.bzl), you can find the `pyodide-lock.json` file in the root of the R2 directory for the Pyodide package bundle release.

This lock file contains some information used by workerd to pull in package requirements, including but not limited to:

- The versions of each package included in the package bundle
- The file names and SHA hashes of each package available for download in the bucket
- What the dependencies are for each package

## Generating pyodide_bucket.bzl
We have scripts and GitHub actions set up for building and uploading Pyodide package bundles onto R2. These are available [here](https://github.com/cloudflare/pyodide-build-scripts). Simply follow the instructions on that repo to build a new version of Pyodide or a new package bundle release.

We have scripts and GitHub Actions set up for building and uploading Pyodide package bundles to R2. These are available in the [pyodide-build-scripts repository](https://github.com/cloudflare/pyodide-build-scripts). Follow the instructions in that repo to build a new version of Pyodide or a new package bundle release.
25 changes: 0 additions & 25 deletions src/pyodide/internal/builtin_wrappers.js
Original file line number Diff line number Diff line change
Expand Up @@ -130,28 +130,3 @@ export async function wasmInstantiate(module, imports) {
const instance = new WebAssembly.Instance(module, imports);
return { module, instance };
}

/**
 * Replaces `globalThis.fetch` with a wrapper that serves files from the disk cache.
 *
 * Only requests whose `url.origin` equals `origin` are intercepted; every other request is
 * forwarded to the original fetch unchanged. Intercepted requests are looked up in the disk
 * cache under the last path segment of the URL. On a cache miss the file is fetched, gunzipped
 * with a DecompressionStream, stored in the disk cache and returned as a new Response.
 *
 * @param {string} origin The origin whose requests should go through the disk cache.
 */
export function patchFetch(origin) {
  const origFetch = globalThis.fetch;
  globalThis.fetch = async function (url, options) {
    // Values without an `origin` property (e.g. plain strings) also take this path.
    if (url.origin !== origin) {
      return origFetch(url, options);
    }

    const fileName = url.pathname.substring(url.pathname.lastIndexOf("/") + 1);
    const cached = DiskCache.get(fileName);
    if (cached) {
      return new Response(cached);
    }

    // we didn't find it in the disk cache, continue with original fetch
    const response = await origFetch(url, options);
    if (!response.ok) {
      // Hand error responses back untouched: gunzipping an error body would fail, and
      // re-wrapping it would hide the real status from the caller and could cache garbage.
      return response;
    }

    const decompressed = response.body.pipeThrough(new DecompressionStream("gzip"));
    const arrayBuffer = await new Response(decompressed).arrayBuffer();

    console.log("decompressed", fileName, arrayBuffer.byteLength, "bytes");
    DiskCache.put(fileName, arrayBuffer);
    return new Response(arrayBuffer);
  };
}
87 changes: 87 additions & 0 deletions src/pyodide/internal/loadPackage.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/**
* This file contains code that roughly replaces pyodide.loadPackage, with workerd-specific
* optimizations:
* - Wheels are decompressed with a DecompressionStream instead of in Python
* - Wheels are overlaid onto the site-packages dir instead of actually being copied
* - Wheels are fetched from a disk cache if available.
*
* Note that loadPackages is only used in local dev for now, internally we use the full big bundle
* that contains all the packages ready to go.
*/

import { default as LOCKFILE } from "pyodide-internal:generated/pyodide-lock.json";
import { WORKERD_INDEX_URL } from "pyodide-internal:metadata";
import { SITE_PACKAGES, LOAD_WHEELS_FROM_R2, getSitePackagesPath } from "pyodide-internal:setupPackages";
import { parseTarInfo } from "pyodide-internal:tar";
import { default as DiskCache } from "pyodide-internal:disk_cache";
import { createTarFS } from "pyodide-internal:tarfs";

/**
 * Obtains the decompressed bundle for a single requirement.
 *
 * The file name is looked up in the lock file. If the disk cache already holds that file it is
 * returned directly; otherwise the file is fetched from WORKERD_INDEX_URL, gunzipped with a
 * DecompressionStream and written to the disk cache before being returned.
 *
 * @param {string} requirement Key of the package in the lock file's "packages" table.
 * @returns {Promise<[string, ArrayBuffer]>} The requirement paired with its decompressed bytes
 *   (on a cache hit, whatever value the disk cache returned).
 * @throws {Error} If the requirement is not in the lock file or the download is not successful.
 */
async function loadBundle(requirement) {
  const packageInfo = LOCKFILE["packages"][requirement];
  if (packageInfo === undefined) {
    // Fail with the package name instead of an opaque "cannot read properties of undefined".
    throw new Error(`Requirement ${requirement} not found in pyodide-lock.json`);
  }

  // first check if the disk cache has what we want
  const filename = packageInfo["file_name"];
  const cached = DiskCache.get(filename);
  if (cached) {
    return [requirement, cached];
  }

  // we didn't find it in the disk cache, continue with original fetch
  const url = new URL(WORKERD_INDEX_URL + filename);
  const response = await fetch(url);
  if (!response.ok) {
    // Never gunzip or cache an error page; surface the HTTP failure instead.
    throw new Error(
      `Failed to fetch ${url} for requirement ${requirement}: HTTP ${response.status} ${response.statusText}`,
    );
  }

  const decompressed = response.body.pipeThrough(new DecompressionStream("gzip"));
  const arrayBuffer = await new Response(decompressed).arrayBuffer();

  DiskCache.put(filename, arrayBuffer);
  return [requirement, arrayBuffer];
}

/**
 * Adapter that exposes an ArrayBuffer through the `read(offset, buf)` interface
 * so that tar.js is able to read from it.
 */
class ArrayBufferReader {
  constructor(arrayBuffer) {
    this.arrayBuffer = arrayBuffer;
  }

  /**
   * Copies bytes of the wrapped buffer, starting at `offset`, into the beginning of `buf`.
   * At most `buf.length` bytes are copied, fewer when the wrapped buffer ends first.
   *
   * @param {number} offset Byte position in the wrapped buffer to start reading from.
   * @param {Uint8Array} buf Destination for the bytes.
   * @returns {number} Number of bytes copied; 0 when `offset` lies outside the buffer.
   */
  read(offset, buf) {
    const total = this.arrayBuffer.byteLength;
    if (offset < 0 || offset >= total) {
      return 0;
    }

    // Clamp the request to what is left after `offset`.
    const remaining = total - offset;
    const count = remaining < buf.length ? remaining : buf.length;

    const source = new Uint8Array(this.arrayBuffer, offset, count);
    buf.set(source);
    return count;
  }
}

/**
 * Loads every requirement that is not yet part of SITE_PACKAGES and then mounts the
 * site-packages directory.
 *
 * Does nothing when LOAD_WHEELS_FROM_R2 is false. Otherwise all missing bundles are requested
 * concurrently via loadBundle, each one is parsed with parseTarInfo and added to SITE_PACKAGES
 * as a small bundle, and finally a tar FS backed by SITE_PACKAGES.rootInfo is mounted at the
 * site-packages path.
 *
 * @param {object} Module The Emscripten module whose FS receives the mount.
 * @param {Iterable<string>} requirements Names of the packages to make available.
 */
export async function loadPackages(Module, requirements) {
  if (!LOAD_WHEELS_FROM_R2) {
    return;
  }

  // Skip anything that is already present in the site-packages view.
  const loading = [...requirements].filter(
    (req) => !SITE_PACKAGES.loadedRequirements.has(req),
  );
  // Start every download before waiting on any of them.
  const pending = loading.map((req) => loadBundle(req));

  console.log("Loading " + loading.join(", "));

  const bundles = await Promise.all(pending);
  for (const [requirement, buffer] of bundles) {
    const [tarInfo, soFiles] = parseTarInfo(new ArrayBufferReader(buffer));
    SITE_PACKAGES.addSmallBundle(tarInfo, soFiles, requirement);
  }

  console.log("Loaded " + loading.join(", "));

  const fs = createTarFS(Module);
  const mountPoint = getSitePackagesPath(Module);
  const mountOptions = { info: SITE_PACKAGES.rootInfo };
  Module.FS.mount(fs, mountOptions, mountPoint);
}
6 changes: 4 additions & 2 deletions src/pyodide/internal/python.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { enterJaegerSpan } from "pyodide-internal:jaeger";
import {
SITE_PACKAGES_INFO,
TRANSITIVE_REQUIREMENTS,
SITE_PACKAGES,
adjustSysPath,
mountLib,
} from "pyodide-internal:setupPackages";
Expand Down Expand Up @@ -182,7 +183,7 @@ async function instantiateEmscriptenModule(emscriptenSettings) {
*/
async function prepareWasmLinearMemory(Module) {
// Note: if we are restoring from a snapshot, runtime is not initialized yet.
mountLib(Module, SITE_PACKAGES_INFO);
mountLib(Module, SITE_PACKAGES.rootInfo);
if (SHOULD_RESTORE_SNAPSHOT) {
restoreSnapshot(Module);
// Don't call adjustSysPath here: it was called in the other branch when we
Expand All @@ -194,6 +195,7 @@ async function prepareWasmLinearMemory(Module) {
}

export async function loadPyodide(lockfile, indexURL) {
console.log("loading pyodide");
const emscriptenSettings = getEmscriptenSettings(lockfile, indexURL);
const Module = await enterJaegerSpan("instantiate_emscripten", () =>
instantiateEmscriptenModule(emscriptenSettings),
Expand Down
163 changes: 101 additions & 62 deletions src/pyodide/internal/setupPackages.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import { createTarFS } from "pyodide-internal:tarfs";
import { createMetadataFS } from "pyodide-internal:metadatafs";
import { default as LOCKFILE } from "pyodide-internal:generated/pyodide-lock.json";
import { REQUIREMENTS, WORKERD_INDEX_URL } from "pyodide-internal:metadata";
import { patchFetch } from "pyodide-internal:builtin_wrappers";
import { simpleRunPython } from "pyodide-internal:util";

const canonicalizeNameRegex = /[-_.]+/g;
Expand All @@ -23,6 +22,86 @@ const STDLIB_PACKAGES = Object.values(LOCKFILE.packages)
.filter(({ install_dir }) => install_dir === "stdlib")
.map(({ name }) => canonicalizePackageName(name));

/**
 * SitePackagesDir keeps track of the virtualized view of the site-packages
 * directory generated for each worker.
 *
 * State:
 * - rootInfo: the TarInfo-shaped root node whose `children` Map holds the overlaid entries
 * - soFiles: paths of .so files to preload, each stored as an array of path segments
 * - loadedRequirements: names of the packages that have been overlaid so far
 *
 * Every add/mount method validates its whole input before touching any state, so a thrown
 * error leaves the directory exactly as it was.
 */
class SitePackagesDir {
  constructor() {
    this.rootInfo = {
      children: new Map(),
      mode: 0o777,
      type: 5, // NOTE(review): presumably the tar type flag for a directory; confirm in tar.js
      modtime: 0,
      size: 0,
      path: "",
      name: "",
      parts: [],
    };
    this.soFiles = [];
    this.loadedRequirements = new Set();
  }

  /**
   * mountOverlay "overlays" a directory onto the site-packages root directory.
   * All files and subdirectories in the overlay will be accessible at site-packages by the worker.
   * If a file or directory already exists, an error is thrown and nothing is overlaid.
   * @param {TarInfo} overlayInfo The directory that is to be "copied" into site-packages
   * @throws {Error} If any top-level entry of the overlay already exists in site-packages
   */
  mountOverlay(overlayInfo) {
    // Check every key first so a collision cannot leave a half-applied overlay behind.
    for (const key of overlayInfo.children.keys()) {
      if (this.rootInfo.children.has(key)) {
        throw new Error(
          `File/folder ${key} being written by multiple packages`,
        );
      }
    }
    for (const [key, val] of overlayInfo.children) {
      this.rootInfo.children.set(key, val);
    }
  }

  /**
   * A small bundle contains just a single package. The entire bundle will be overlaid onto site-packages.
   * A small bundle can basically be thought of as a wheel.
   * @param {TarInfo} tarInfo The root tarInfo for the small bundle (See tar.js)
   * @param {List<String>} soFiles A list of .so files contained in the small bundle
   * @param {String} requirement The canonicalized package name this small bundle corresponds to
   * @throws {Error} If the bundle collides with an entry that is already in site-packages
   */
  addSmallBundle(tarInfo, soFiles, requirement) {
    // Mount first: it is the only step that can fail, and it fails before mutating anything.
    this.mountOverlay(tarInfo);
    for (const soFile of soFiles) {
      this.soFiles.push(soFile.split("/"));
    }
    this.loadedRequirements.add(requirement);
  }

  /**
   * A big bundle contains multiple packages, each package contained in a folder whose name is the canonicalized package name.
   * This function overlays the requested packages onto the site-packages directory.
   * @param {TarInfo} tarInfo The root tarInfo for the big bundle (See tar.js)
   * @param {List<String>} soFiles A list of .so files contained in the big bundle
   * @param {Set<String>} requirements canonicalized set of packages to pick from the big bundle
   * @throws {Error} If a requirement is missing from the bundle, or two packages (or a package
   *   and the existing site-packages contents) provide the same top-level entry
   */
  addBigBundle(tarInfo, soFiles, requirements) {
    // Stage all entries first; nothing is committed until every requirement has been
    // found and checked for collisions.
    const staged = new Map();
    for (const req of requirements) {
      const child = tarInfo.children.get(req);
      if (!child) {
        throw new Error(`Requirement ${req} not found in pyodide packages tar`);
      }
      for (const [key, val] of child.children) {
        if (this.rootInfo.children.has(key) || staged.has(key)) {
          throw new Error(
            `File/folder ${key} being written by multiple packages`,
          );
        }
        staged.set(key, val);
      }
    }

    for (const [key, val] of staged) {
      this.rootInfo.children.set(key, val);
    }
    for (const req of requirements) {
      this.loadedRequirements.add(req);
    }

    // add all the .so files we will need to preload from the big bundle
    for (const soFile of soFiles) {
      // If folder is in list of requirements include .so file in list to preload.
      const [pkg, ...rest] = soFile.split("/");
      if (requirements.has(pkg)) {
        this.soFiles.push(rest);
      }
    }
  }
}

/**
* This stitches together the view of the site packages directory. Each
* requirement corresponds to a folder in the original tar file. For each
Expand All @@ -33,52 +112,19 @@ const STDLIB_PACKAGES = Object.values(LOCKFILE.packages)
* directory so we can preload them.
*/
export function buildSitePackages(requirements) {
const [origTarInfo, origSoFiles] = parseTarInfo();
// We'd like to set USE_LOAD_PACKAGE = IS_WORKERD but we also build a funny
// workerd with the downstream package set. We can distinguish between them by
// looking at the contents. This uses the fact that the downstream set is
// larger, but there are a lot of differences...
const USE_LOAD_PACKAGE = origTarInfo.children.size < 10;
if (USE_LOAD_PACKAGE) {
requirements = new Set([...STDLIB_PACKAGES]);
} else {
requirements = new Set([...STDLIB_PACKAGES, ...requirements]);
}
const soFiles = [];
for (const soFile of origSoFiles) {
// If folder is in list of requirements include .so file in list to preload.
const [pkg, ...rest] = soFile.split("/");
if (requirements.has(pkg)) {
soFiles.push(rest);
}
}
const newTarInfo = {
children: new Map(),
mode: 0o777,
type: 5,
modtime: 0,
size: 0,
path: "",
name: "",
parts: [],
};

for (const req of requirements) {
const child = origTarInfo.children.get(req);
if (!child) {
throw new Error(`Requirement ${req} not found in pyodide packages tar`);
}
child.children.forEach((val, key) => {
if (newTarInfo.children.has(key)) {
throw new Error(
`File/folder ${key} being written by multiple packages`,
);
}
newTarInfo.children.set(key, val);
});
const [bigTarInfo, bigTarSoFiles] = parseTarInfo();

let LOAD_WHEELS_FROM_R2 = true;
let requirementsInBigBundle = new Set([...STDLIB_PACKAGES]);
if(bigTarInfo.children.size > 10) {
LOAD_WHEELS_FROM_R2 = false;
requirements.forEach(r => requirementsInBigBundle.add(r));
}

return [newTarInfo, soFiles, USE_LOAD_PACKAGE];
const res = new SitePackagesDir();
res.addBigBundle(bigTarInfo, bigTarSoFiles, requirementsInBigBundle);

return [res, LOAD_WHEELS_FROM_R2];
}

/**
Expand All @@ -89,23 +135,12 @@ export function buildSitePackages(requirements) {
* TODO: stop using loadPackage in workerd.
*/
export function patchLoadPackage(pyodide) {
if (!USE_LOAD_PACKAGE) {
pyodide.loadPackage = disabledLoadPackage;
return;
}
patchFetch(new URL(WORKERD_INDEX_URL).origin);
const origLoadPackage = pyodide.loadPackage;
function loadPackage(packages, options) {
return origLoadPackage(packages, {
checkIntegrity: false,
...options,
});
}
pyodide.loadPackage = loadPackage;
pyodide.loadPackage = disabledLoadPackage;
return;
}

function disabledLoadPackage() {
throw new Error("We only use loadPackage in workerd");
throw new Error("pyodide.loadPackage is disabled");
}

/**
Expand Down Expand Up @@ -138,7 +173,12 @@ export function mountLib(Module, info) {
const site_packages = getSitePackagesPath(Module);
Module.FS.mkdirTree(site_packages);
Module.FS.mkdirTree("/session/metadata");
Module.FS.mount(tarFS, { info }, site_packages);
if (!LOAD_WHEELS_FROM_R2) {
// if we are not loading additional wheels from R2, then we're done
// with site-packages and we can mount it here. Otherwise, we must mount it in
// loadPackages().
Module.FS.mount(tarFS, { info }, site_packages);
}
Module.FS.mount(mdFS, {}, "/session/metadata");
}

Expand Down Expand Up @@ -191,5 +231,4 @@ function addPackageToLoad(lockfile, name, toLoad) {

export { REQUIREMENTS };
export const TRANSITIVE_REQUIREMENTS = getTransitiveRequirements();
export const [SITE_PACKAGES_INFO, SITE_PACKAGES_SO_FILES, USE_LOAD_PACKAGE] =
buildSitePackages(TRANSITIVE_REQUIREMENTS);
export const [SITE_PACKAGES, LOAD_WHEELS_FROM_R2] = buildSitePackages(TRANSITIVE_REQUIREMENTS);
Loading

0 comments on commit 1400739

Please sign in to comment.