switch to new package index, decompress into .tar in javascript #2024

Merged: 4 commits (May 7, 2024)
9 changes: 5 additions & 4 deletions build/pyodide_bucket.bzl
```diff
@@ -1,7 +1,8 @@
 # Do not edit this file by hand. See docs/pyodide.md for info on how to generate it.
 # These variables are factored out here because they are being shared by the WORKSPACE files in
 # both edgeworker and workerd, as well as src/pyodide/BUILD.bazel
-PYODIDE_PACKAGE_BUCKET_URL = "https://pub-45d734c4145d4285b343833ee450ef38.r2.dev/20240320/"
-PYODIDE_GITHUB_RELEASE_URL = "https://github.com/cloudflare/pyodide-build-scripts/releases/download/20240320/"
-PYODIDE_LOCK_SHA256 = "a176311d4c449aac4ef7a333977af8b6e08224c115a9a6d05c04592c841b8a58"
-PYODIDE_PACKAGES_TAR_ZIP_SHA256 = "e191bae60aad75f6a9c33fac5c0ff1ad2b0e564bdd7a07fbdc848df4b62c60a1"
+PYODIDE_PACKAGE_BUCKET_URL = "https://pub-45d734c4145d4285b343833ee450ef38.r2.dev/20240415-experimental/"
+PYODIDE_GITHUB_RELEASE_URL = "https://github.com/cloudflare/pyodide-build-scripts/releases/download/20240415-experimental/"
+PYODIDE_LOCK_SHA256 = "67d1a24edf4f3ab2cf85c736391c04763ff722bf3aebf9ea3469d96e5f51e1da"
+PYODIDE_PACKAGES_TAR_ZIP_SHA256 = "749967941204154e7ae866fe08f1216a3e5ee58ba6a3757231a5be0d9d4430f8"
+PYODIDE_ALL_WHEELS_ZIP_SHA256 = "9e7c330ee93d81d0356cc2d585f217dfee58b623ad4535282baa6e82bd063eee"
```
3 changes: 2 additions & 1 deletion docs/pyodide.md
```diff
@@ -5,10 +5,11 @@ workerd is linked against a Pyodide lock file, which is located within an R2 bucket
 If you know where the R2 bucket is (See build/pyodide_bucket.bzl) then the `pyodide-lock.json` file is located inside the root of the R2 directory for the Pyodide package bundle release.
 
 This lock file contains some information used by workerd to pull in package requirements, including but not limited to:
 
 - The versions of each package included in the package bundle
 - The file names and SHA hashes of each package available for download in the bucket
 - What the dependencies are for each package
 
 ## Generating pyodide_bucket.bzl
-We have scripts and GitHub actions set up for building and uploading Pyodide package bundles onto R2. These are available [here](https://github.com/cloudflare/pyodide-build-scripts). Simply follow the instructions on that repo to build a new version of Pyodide or a new package bundle release.
+
+We have scripts and GitHub actions set up for building and uploading Pyodide package bundles onto R2. These are available [here](https://github.com/cloudflare/pyodide-build-scripts). Simply follow the instructions on that repo to build a new version of Pyodide or a new package bundle release.
```
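For orientation, each entry in the lock file's `packages` map carries the fields the new loader reads (notably `file_name`). A sketch with illustrative values, not a real entry; the authoritative schema is defined by pyodide-lock:

```js
// Illustrative pyodide-lock.json entry (values made up; schema per pyodide-lock).
const lockfileSketch = {
  packages: {
    numpy: {
      name: "numpy",
      version: "1.26.4",            // version included in the package bundle
      file_name: "numpy-1.26.4-cp312-emscripten_wasm32.whl", // appended to the index URL when fetching
      sha256: "abc123...",          // hash of the downloadable file
      install_dir: "site",          // "stdlib" entries feed STDLIB_PACKAGES below
      depends: [],                  // canonicalized names of dependencies
    },
  },
};
```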
22 changes: 0 additions & 22 deletions src/pyodide/internal/builtin_wrappers.js
```diff
@@ -114,25 +114,3 @@ export async function wasmInstantiate(module, imports) {
   const instance = new WebAssembly.Instance(module, imports);
   return { module, instance };
 }
-
-export function patchFetch(origin) {
-  // Patch fetch to first go through disk cache, but only when url points to origin
-  const origFetch = globalThis.fetch;
-  globalThis.fetch = async function (url, options) {
-    if (url.origin !== origin) {
-      return origFetch(url, options);
-    }
-
-    const fileName = url.pathname.substring(url.pathname.lastIndexOf("/") + 1);
-    const cached = DiskCache.get(fileName);
-    if (cached) {
-      return new Response(cached);
-    }
-
-    // we didn't find it in the disk cache, continue with original fetch
-    const response = await origFetch(url, options);
-    const arrayBuffer = await response.arrayBuffer();
-    DiskCache.put(fileName, arrayBuffer);
-    return new Response(arrayBuffer);
-  };
-}
```
86 changes: 86 additions & 0 deletions src/pyodide/internal/loadPackage.js
```js
/**
 * This file contains code that roughly replaces pyodide.loadPackage, with workerd-specific
 * optimizations:
 * - Wheels are decompressed with a DecompressionStream instead of in Python
 * - Wheels are overlaid onto the site-packages dir instead of actually being copied
 * - Wheels are fetched from a disk cache if available.
 *
 * Note that loadPackages is only used in local dev for now; internally we use the full big bundle
 * that contains all the packages ready to go.
 */

import { default as LOCKFILE } from "pyodide-internal:generated/pyodide-lock.json";
import { WORKERD_INDEX_URL } from "pyodide-internal:metadata";
import {
  SITE_PACKAGES,
  LOAD_WHEELS_FROM_R2,
  getSitePackagesPath,
} from "pyodide-internal:setupPackages";
import { parseTarInfo } from "pyodide-internal:tar";
import { default as DiskCache } from "pyodide-internal:disk_cache";
import { createTarFS } from "pyodide-internal:tarfs";

async function loadBundle(requirement) {
  // first check if the disk cache has what we want
  const filename = LOCKFILE["packages"][requirement]["file_name"];
  const cached = DiskCache.get(filename);
  if (cached) {
    return [requirement, cached];
  }

  // we didn't find it in the disk cache, continue with original fetch
  const url = new URL(WORKERD_INDEX_URL + filename);
  const response = await fetch(url);

  // Stream the gzipped wheel through DecompressionStream so we end up with raw .tar bytes.
  const arrayBuffer = await new Response(
    response.body.pipeThrough(new DecompressionStream("gzip")),
  ).arrayBuffer();

  DiskCache.put(filename, arrayBuffer);
  return [requirement, arrayBuffer];
}

/**
 * ArrayBufferReader wraps around an arrayBuffer in a way that tar.js is able to read from
 */
class ArrayBufferReader {
  constructor(arrayBuffer) {
    this.arrayBuffer = arrayBuffer;
  }

  read(offset, buf) {
    // buf is a Uint8Array
    const size = this.arrayBuffer.byteLength;
    if (offset >= size || offset < 0) {
      return 0;
    }
    let toCopy = buf.length;
    if (size - offset < toCopy) {
      toCopy = size - offset;
    }
    buf.set(new Uint8Array(this.arrayBuffer, offset, toCopy));
    return toCopy;
  }
}
```
Review thread on lines +40 to +58 (the `ArrayBufferReader` class):

Collaborator: Should tar.js instead be changed to expect an ArrayBuffer?

Collaborator (author): hmm, then we would have to expose an ArrayBuffer instead of a PackagesTarReader, not sure if we want that big of a refactor. This interface is meant to match exactly what PackagesTarReader does in JSG.
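As an aside, the `read(offset, buf)` contract this reader implements (and which `PackagesTarReader` matches on the JSG side) behaves like the following hedged sketch; the bytes are arbitrary illustration, not a real tar archive:

```js
// Sketch: exercising ArrayBufferReader's read(offset, buf) contract.
const data = new Uint8Array([10, 20, 30, 40, 50]).buffer;
const reader = new ArrayBufferReader(data);

const buf = new Uint8Array(4);
console.log(reader.read(3, buf), buf); // 2, Uint8Array [40, 50, 0, 0] (copy clamped at end of buffer)
console.log(reader.read(10, buf));     // 0 (offset past the end reads nothing)
```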


```js
export async function loadPackages(Module, requirements) {
  if (!LOAD_WHEELS_FROM_R2) return;

  let loadPromises = [];
  let loading = [];
  for (const req of requirements) {
    if (SITE_PACKAGES.loadedRequirements.has(req)) continue;
    loadPromises.push(loadBundle(req));
    loading.push(req);
  }

  console.log("Loading " + loading.join(", "));
```
Review thread on the `console.log` calls:

Collaborator: Should these console.logs be removed?

Collaborator (author): I kept them there to match the original prints from the pyodide package loader. I'd prefer to keep them for now so we have more visibility on performance in testing.

Collaborator, suggested change (delete the line):

```diff
-  console.log("Loading " + loading.join(", "));
```
```js
  const buffers = await Promise.all(loadPromises);
  for (const [requirement, buffer] of buffers) {
    const reader = new ArrayBufferReader(buffer);
    const [tarInfo, soFiles] = parseTarInfo(reader);
    SITE_PACKAGES.addSmallBundle(tarInfo, soFiles, requirement);
  }

  console.log("Loaded " + loading.join(", "));
```
Collaborator, suggested change (delete the line):

```diff
-  console.log("Loaded " + loading.join(", "));
```

```js
  const tarFS = createTarFS(Module);
  const path = getSitePackagesPath(Module);
  const info = SITE_PACKAGES.rootInfo;
  Module.FS.mount(tarFS, { info }, path);
}
```
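One detail worth calling out from `loadBundle` above: what gets cached is the already-decompressed `.tar` bytes, so a warm start skips both the network fetch and the gzip work. A hedged sketch of that flow ("numpy" is just an illustrative requirement name):

```js
// First call: fetch + DecompressionStream, then DiskCache.put(filename, tarBytes).
const [, tarBytes] = await loadBundle("numpy");

// Second call: DiskCache.get(filename) hits, so no fetch and no decompression.
const [, cachedBytes] = await loadBundle("numpy");

// Either way the result is a raw .tar ArrayBuffer that parseTarInfo() can read:
const [tarInfo, soFiles] = parseTarInfo(new ArrayBufferReader(cachedBytes));
```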
5 changes: 3 additions & 2 deletions src/pyodide/internal/python.js
```diff
@@ -1,7 +1,8 @@
 Error.stackTraceLimit = Infinity;
 import { enterJaegerSpan } from "pyodide-internal:jaeger";
 import {
-  SITE_PACKAGES_INFO,
+  TRANSITIVE_REQUIREMENTS,
+  SITE_PACKAGES,
   adjustSysPath,
   mountLib,
 } from "pyodide-internal:setupPackages";
@@ -187,7 +188,7 @@ async function instantiateEmscriptenModule(emscriptenSettings) {
  */
 async function prepareWasmLinearMemory(Module) {
   // Note: if we are restoring from a snapshot, runtime is not initialized yet.
-  mountLib(Module, SITE_PACKAGES_INFO);
+  mountLib(Module, SITE_PACKAGES.rootInfo);
   entropyMountFiles(Module);
   if (SHOULD_RESTORE_SNAPSHOT) {
     restoreSnapshot(Module);
```
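The hunk where `TRANSITIVE_REQUIREMENTS` is actually used is collapsed in this view, so the call site below is an inference, not code from the PR; presumably startup now drives the new loader roughly like this (the module specifier is also an assumption):

```js
// Hedged sketch of the new startup flow; the actual call site is collapsed in this diff.
import { loadPackages } from "pyodide-internal:loadPackage"; // assumed specifier

async function setUpPackages(Module) {
  // Mounts the big bundle directly when LOAD_WHEELS_FROM_R2 is false...
  mountLib(Module, SITE_PACKAGES.rootInfo);
  // ...and otherwise fetches the missing wheels from R2, which mounts them itself.
  await loadPackages(Module, TRANSITIVE_REQUIREMENTS);
}
```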
163 changes: 101 additions & 62 deletions src/pyodide/internal/setupPackages.js
```diff
@@ -3,7 +3,6 @@ import { createTarFS } from "pyodide-internal:tarfs";
 import { createMetadataFS } from "pyodide-internal:metadatafs";
 import { default as LOCKFILE } from "pyodide-internal:generated/pyodide-lock.json";
 import { REQUIREMENTS, WORKERD_INDEX_URL } from "pyodide-internal:metadata";
-import { patchFetch } from "pyodide-internal:builtin_wrappers";
 import { simpleRunPython } from "pyodide-internal:util";
 
 const canonicalizeNameRegex = /[-_.]+/g;
@@ -23,6 +22,86 @@ const STDLIB_PACKAGES = Object.values(LOCKFILE.packages)
   .filter(({ install_dir }) => install_dir === "stdlib")
   .map(({ name }) => canonicalizePackageName(name));
 
+/**
+ * SitePackagesDir keeps track of the virtualized view of the site-packages
+ * directory generated for each worker.
+ */
+class SitePackagesDir {
+  constructor() {
+    this.rootInfo = {
+      children: new Map(),
+      mode: 0o777,
+      type: 5,
+      modtime: 0,
+      size: 0,
+      path: "",
+      name: "",
+      parts: [],
+    };
+    this.soFiles = [];
+    this.loadedRequirements = new Set();
+  }
+
+  /**
+   * mountOverlay "overlays" a directory onto the site-packages root directory.
+   * All files and subdirectories in the overlay will be accessible at site-packages by the worker.
+   * If a file or directory already exists, an error is thrown.
+   * @param {TarInfo} overlayInfo The directory that is to be "copied" into site-packages
+   */
+  mountOverlay(overlayInfo) {
+    overlayInfo.children.forEach((val, key) => {
+      if (this.rootInfo.children.has(key)) {
+        throw new Error(
+          `File/folder ${key} being written by multiple packages`,
+        );
+      }
+      this.rootInfo.children.set(key, val);
+    });
+  }
+
+  /**
+   * A small bundle contains just a single package. The entire bundle will be overlaid onto site-packages.
+   * A small bundle can basically be thought of as a wheel.
```
Review comment on `addSmallBundle`:

Collaborator: Should it just be called addSingleWheel then?
```diff
+   * @param {TarInfo} tarInfo The root tarInfo for the small bundle (See tar.js)
+   * @param {List<String>} soFiles A list of .so files contained in the small bundle
+   * @param {String} requirement The canonicalized package name this small bundle corresponds to
+   */
+  addSmallBundle(tarInfo, soFiles, requirement) {
+    for (const soFile of soFiles) {
+      this.soFiles.push(soFile.split("/"));
+    }
+    this.mountOverlay(tarInfo);
+    this.loadedRequirements.add(requirement);
+  }
+
+  /**
+   * A big bundle contains multiple packages, each package contained in a folder whose name is the canonicalized package name.
+   * This function overlays the requested packages onto the site-packages directory.
+   * @param {TarInfo} tarInfo The root tarInfo for the big bundle (See tar.js)
+   * @param {List<String>} soFiles A list of .so files contained in the big bundle
+   * @param {List<String>} requirements canonicalized list of packages to pick from the big bundle
+   */
+  addBigBundle(tarInfo, soFiles, requirements) {
```
Review thread on `addBigBundle`:

Collaborator: I wonder why we have this split. Isn't addSmallBundle just the same as addBigBundle with a single requirement? Why separate them?

Collaborator (author): The structure of a big bundle is different from a small bundle. A big bundle contains one folder for each package, with each folder containing the structure of a wheel. A small bundle contains just the wheel for a single package, not contained within a folder.
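To make that distinction concrete, here is an illustrative sketch of the two layouts (package names and versions are hypothetical):

```js
// Illustrative archive layouts, per the author's description above:
//
// Big bundle (one folder per canonicalized package, each holding a wheel's contents):
//   numpy/numpy/__init__.py
//   numpy/numpy-1.26.4.dist-info/METADATA
//   regex/regex/__init__.py
//   regex/regex-2024.4.16.dist-info/METADATA
//
// Small bundle (a single wheel's contents at the archive root):
//   regex/__init__.py
//   regex-2024.4.16.dist-info/METADATA
```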

```diff
+    // add all the .so files we will need to preload from the big bundle
+    for (const soFile of soFiles) {
+      // If folder is in list of requirements include .so file in list to preload.
+      const [pkg, ...rest] = soFile.split("/");
+      if (requirements.has(pkg)) {
+        this.soFiles.push(rest);
+      }
+    }
+
+    for (const req of requirements) {
+      const child = tarInfo.children.get(req);
+      if (!child) {
+        throw new Error(`Requirement ${req} not found in pyodide packages tar`);
+      }
+      this.mountOverlay(child);
+      this.loadedRequirements.add(req);
+    }
+  }
+};
+
 /**
  * This stitches together the view of the site packages directory. Each
  * requirement corresponds to a folder in the original tar file. For each
@@ -33,52 +112,19 @@ const STDLIB_PACKAGES = Object.values(LOCKFILE.packages)
  * directory so we can preload them.
  */
 export function buildSitePackages(requirements) {
-  const [origTarInfo, origSoFiles] = parseTarInfo();
-  // We'd like to set USE_LOAD_PACKAGE = IS_WORKERD but we also build a funny
-  // workerd with the downstream package set. We can distinguish between them by
-  // looking at the contents. This uses the fact that the downstream set is
-  // larger, but there are a lot of differences...
-  const USE_LOAD_PACKAGE = origTarInfo.children.size < 10;
-  if (USE_LOAD_PACKAGE) {
-    requirements = new Set([...STDLIB_PACKAGES]);
-  } else {
-    requirements = new Set([...STDLIB_PACKAGES, ...requirements]);
-  }
-  const soFiles = [];
-  for (const soFile of origSoFiles) {
-    // If folder is in list of requirements include .so file in list to preload.
-    const [pkg, ...rest] = soFile.split("/");
-    if (requirements.has(pkg)) {
-      soFiles.push(rest);
-    }
-  }
-  const newTarInfo = {
-    children: new Map(),
-    mode: 0o777,
-    type: 5,
-    modtime: 0,
-    size: 0,
-    path: "",
-    name: "",
-    parts: [],
-  };
-
-  for (const req of requirements) {
-    const child = origTarInfo.children.get(req);
-    if (!child) {
-      throw new Error(`Requirement ${req} not found in pyodide packages tar`);
-    }
-    child.children.forEach((val, key) => {
-      if (newTarInfo.children.has(key)) {
-        throw new Error(
-          `File/folder ${key} being written by multiple packages`,
-        );
-      }
-      newTarInfo.children.set(key, val);
-    });
-  }
+  const [bigTarInfo, bigTarSoFiles] = parseTarInfo();
+
+  let LOAD_WHEELS_FROM_R2 = true;
+  let requirementsInBigBundle = new Set([...STDLIB_PACKAGES]);
+  if (bigTarInfo.children.size > 10) {
+    LOAD_WHEELS_FROM_R2 = false;
+    requirements.forEach(r => requirementsInBigBundle.add(r));
+  }
 
-  return [newTarInfo, soFiles, USE_LOAD_PACKAGE];
+  const res = new SitePackagesDir();
+  res.addBigBundle(bigTarInfo, bigTarSoFiles, requirementsInBigBundle);
+
+  return [res, LOAD_WHEELS_FROM_R2];
 }
 
 /**
@@ -89,23 +135,12 @@ export function buildSitePackages(requirements) {
  * TODO: stop using loadPackage in workerd.
  */
 export function patchLoadPackage(pyodide) {
-  if (!USE_LOAD_PACKAGE) {
-    pyodide.loadPackage = disabledLoadPackage;
-    return;
-  }
-  patchFetch(new URL(WORKERD_INDEX_URL).origin);
-  const origLoadPackage = pyodide.loadPackage;
-  function loadPackage(packages, options) {
-    return origLoadPackage(packages, {
-      checkIntegrity: false,
-      ...options,
-    });
-  }
-  pyodide.loadPackage = loadPackage;
+  pyodide.loadPackage = disabledLoadPackage;
+  return;
 }
 
 function disabledLoadPackage() {
-  throw new Error("We only use loadPackage in workerd");
+  throw new Error("pyodide.loadPackage is disabled because packages are encoded in the binary");
 }
 
 /**
@@ -138,7 +173,12 @@ export function mountLib(Module, info) {
   const site_packages = getSitePackagesPath(Module);
   Module.FS.mkdirTree(site_packages);
   Module.FS.mkdirTree("/session/metadata");
-  Module.FS.mount(tarFS, { info }, site_packages);
+  if (!LOAD_WHEELS_FROM_R2) {
+    // if we are not loading additional wheels from R2, then we're done
+    // with site-packages and we can mount it here. Otherwise, we must mount it in
+    // loadPackages().
+    Module.FS.mount(tarFS, { info }, site_packages);
+  }
   Module.FS.mount(mdFS, {}, "/session/metadata");
 }
 
@@ -191,5 +231,4 @@ function addPackageToLoad(lockfile, name, toLoad) {
 
 export { REQUIREMENTS };
 export const TRANSITIVE_REQUIREMENTS = getTransitiveRequirements();
-export const [SITE_PACKAGES_INFO, SITE_PACKAGES_SO_FILES, USE_LOAD_PACKAGE] =
-  buildSitePackages(TRANSITIVE_REQUIREMENTS);
+export const [SITE_PACKAGES, LOAD_WHEELS_FROM_R2] = buildSitePackages(TRANSITIVE_REQUIREMENTS);
```