switch to new package index, decompress into .tar in javascript #2024
Changes from all commits: cc5ae97, 9dd0e1d, 74ba19f, 5559c51
@@ -1,7 +1,8 @@
 # Do not edit this file by hand. See docs/pyodide.md for info on how to generate it.
 # These variables are factored out here because they are being shared by the WORKSPACE files in
 # both edgeworker and workerd, as well as src/pyodide/BUILD.bazel
-PYODIDE_PACKAGE_BUCKET_URL = "https://pub-45d734c4145d4285b343833ee450ef38.r2.dev/20240320/"
-PYODIDE_GITHUB_RELEASE_URL = "https://github.com/cloudflare/pyodide-build-scripts/releases/download/20240320/"
-PYODIDE_LOCK_SHA256 = "a176311d4c449aac4ef7a333977af8b6e08224c115a9a6d05c04592c841b8a58"
-PYODIDE_PACKAGES_TAR_ZIP_SHA256 = "e191bae60aad75f6a9c33fac5c0ff1ad2b0e564bdd7a07fbdc848df4b62c60a1"
+PYODIDE_PACKAGE_BUCKET_URL = "https://pub-45d734c4145d4285b343833ee450ef38.r2.dev/20240415-experimental/"
+PYODIDE_GITHUB_RELEASE_URL = "https://github.com/cloudflare/pyodide-build-scripts/releases/download/20240415-experimental/"
+PYODIDE_LOCK_SHA256 = "67d1a24edf4f3ab2cf85c736391c04763ff722bf3aebf9ea3469d96e5f51e1da"
+PYODIDE_PACKAGES_TAR_ZIP_SHA256 = "749967941204154e7ae866fe08f1216a3e5ee58ba6a3757231a5be0d9d4430f8"
+PYODIDE_ALL_WHEELS_ZIP_SHA256 = "9e7c330ee93d81d0356cc2d585f217dfee58b623ad4535282baa6e82bd063eee"
@@ -0,0 +1,86 @@
+/**
+ * This file contains code that roughly replaces pyodide.loadPackage, with workerd-specific
+ * optimizations:
+ * - Wheels are decompressed with a DecompressionStream instead of in Python
+ * - Wheels are overlaid onto the site-packages dir instead of actually being copied
+ * - Wheels are fetched from a disk cache if available.
+ *
+ * Note that loadPackages is only used in local dev for now, internally we use the full big bundle
+ * that contains all the packages ready to go.
+ */
+
+import { default as LOCKFILE } from "pyodide-internal:generated/pyodide-lock.json";
+import { WORKERD_INDEX_URL } from "pyodide-internal:metadata";
+import { SITE_PACKAGES, LOAD_WHEELS_FROM_R2, getSitePackagesPath } from "pyodide-internal:setupPackages";
+import { parseTarInfo } from "pyodide-internal:tar";
+import { default as DiskCache } from "pyodide-internal:disk_cache";
+import { createTarFS } from "pyodide-internal:tarfs";
+
+async function loadBundle(requirement) {
+  // first check if the disk cache has what we want
+  const filename = LOCKFILE["packages"][requirement]["file_name"];
+  const cached = DiskCache.get(filename);
+  if (cached) {
+    return [requirement, cached];
+  }
+
+  // we didn't find it in the disk cache, continue with original fetch
+  const url = new URL(WORKERD_INDEX_URL + filename);
+  const response = await fetch(url);
+
+  const arrayBuffer = await new Response(response.body.pipeThrough(new DecompressionStream("gzip"))).arrayBuffer();
+
+  DiskCache.put(filename, arrayBuffer);
+  return [requirement, arrayBuffer];
+};
+
+/**
+ * ArrayBufferReader wraps around an arrayBuffer in a way that tar.js is able to read from
+ */
+class ArrayBufferReader {
+  constructor(arrayBuffer) {
+    this.arrayBuffer = arrayBuffer;
+  }
+
+  read(offset, buf) {
+    // buf is a Uint8Array
+    const size = this.arrayBuffer.byteLength;
+    if (offset >= size || offset < 0) {
+      return 0;
+    }
+    let toCopy = buf.length;
+    if (size - offset < toCopy) {
+      toCopy = size - offset;
+    }
+    buf.set(new Uint8Array(this.arrayBuffer, offset, toCopy));
+    return toCopy;
+  }
+}
+
+export async function loadPackages(Module, requirements) {
+  if (!LOAD_WHEELS_FROM_R2) return;
+
+  let loadPromises = [];
+  let loading = [];
+  for (const req of requirements) {
+    if (SITE_PACKAGES.loadedRequirements.has(req)) continue;
+    loadPromises.push(loadBundle(req));
+    loading.push(req);
+  }
+
+  console.log("Loading " + loading.join(", "));
    [Review thread]
    Reviewer: Should these console.logs be removed?
    Author: I kept them there to match the original prints from the pyodide package loader. I'd prefer to keep them for now so we have more visibility on performance in testing.

+  const buffers = await Promise.all(loadPromises);
+  for (const [requirement, buffer] of buffers) {
+    const reader = new ArrayBufferReader(buffer);
+    const [tarInfo, soFiles] = parseTarInfo(reader);
+    SITE_PACKAGES.addSmallBundle(tarInfo, soFiles, requirement);
+  }
+
console.log("Loaded " + loading.join(", ")); | ||||
+  const tarFS = createTarFS(Module);
+  const path = getSitePackagesPath(Module);
+  const info = SITE_PACKAGES.rootInfo;
+  Module.FS.mount(tarFS, { info }, path);
+}
@@ -3,7 +3,6 @@ import { createTarFS } from "pyodide-internal:tarfs";
 import { createMetadataFS } from "pyodide-internal:metadatafs";
 import { default as LOCKFILE } from "pyodide-internal:generated/pyodide-lock.json";
 import { REQUIREMENTS, WORKERD_INDEX_URL } from "pyodide-internal:metadata";
-import { patchFetch } from "pyodide-internal:builtin_wrappers";
 import { simpleRunPython } from "pyodide-internal:util";

 const canonicalizeNameRegex = /[-_.]+/g;
@@ -23,6 +22,86 @@ const STDLIB_PACKAGES = Object.values(LOCKFILE.packages)
   .filter(({ install_dir }) => install_dir === "stdlib")
   .map(({ name }) => canonicalizePackageName(name));
 
+/**
+ * SitePackagesDir keeps track of the virtualized view of the site-packages
+ * directory generated for each worker.
+ */
+class SitePackagesDir {
+  constructor() {
+    this.rootInfo = {
+      children: new Map(),
+      mode: 0o777,
+      type: 5,
+      modtime: 0,
+      size: 0,
+      path: "",
+      name: "",
+      parts: [],
+    };
+    this.soFiles = [];
+    this.loadedRequirements = new Set();
+  }
+
+  /**
+   * mountOverlay "overlays" a directory onto the site-packages root directory.
+   * All files and subdirectories in the overlay will be accessible at site-packages by the worker.
+   * If a file or directory already exists, an error is thrown.
+   * @param {TarInfo} overlayInfo The directory that is to be "copied" into site-packages
+   */
+  mountOverlay(overlayInfo) {
+    overlayInfo.children.forEach((val, key) => {
+      if (this.rootInfo.children.has(key)) {
+        throw new Error(
+          `File/folder ${key} being written by multiple packages`,
+        );
+      }
+      this.rootInfo.children.set(key, val);
+    });
+  }
+
+  /**
+   * A small bundle contains just a single package. The entire bundle will be overlaid onto site-packages.
+   * A small bundle can basically be thought of as a wheel.
    [Review thread]
    Reviewer: Should it just be called…
+   * @param {TarInfo} tarInfo The root tarInfo for the small bundle (See tar.js)
+   * @param {List<String>} soFiles A list of .so files contained in the small bundle
+   * @param {String} requirement The canonicalized package name this small bundle corresponds to
+   */
+  addSmallBundle(tarInfo, soFiles, requirement) {
+    for (const soFile of soFiles) {
+      this.soFiles.push(soFile.split("/"));
+    }
+    this.mountOverlay(tarInfo);
+    this.loadedRequirements.add(requirement);
+  }
+
+  /**
+   * A big bundle contains multiple packages, each package contained in a folder whose name is the canonicalized package name.
+   * This function overlays the requested packages onto the site-packages directory.
+   * @param {TarInfo} tarInfo The root tarInfo for the big bundle (See tar.js)
+   * @param {List<String>} soFiles A list of .so files contained in the big bundle
+   * @param {List<String>} requirements canonicalized list of packages to pick from the big bundle
+   */
+  addBigBundle(tarInfo, soFiles, requirements) {
    [Review thread]
    Reviewer: I wonder why we have this split. Isn't…
    Author: The structure of a big bundle is different from a small bundle. A big bundle contains one folder for each package, with each folder containing the structure of a wheel. A small bundle contains just the wheel for a single package, not contained within a folder.
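    To make that layout difference concrete, an illustrative sketch (not part of the PR or of the thread; package names are hypothetical, and the children-Map shape follows what tar.js produces):

    // A small bundle is one wheel: its top-level entries sit at the tar root.
    const smallBundle = {
      children: new Map([
        ["numpy", { children: new Map() }],
        ["numpy-1.26.4.dist-info", { children: new Map() }],
      ]),
    };

    // A big bundle wraps each wheel in a folder named after the package, so
    // addBigBundle() descends one level (children.get(req)) per requirement.
    const bigBundle = {
      children: new Map([
        ["numpy", smallBundle],
        ["pandas", { children: new Map([["pandas", { children: new Map() }]]) }],
      ]),
    };

    console.log([...bigBundle.children.keys()]); // [ 'numpy', 'pandas' ]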
+    // add all the .so files we will need to preload from the big bundle
+    for (const soFile of soFiles) {
+      // If folder is in list of requirements include .so file in list to preload.
+      const [pkg, ...rest] = soFile.split("/");
+      if (requirements.has(pkg)) {
+        this.soFiles.push(rest);
+      }
+    }
+
+    for (const req of requirements) {
+      const child = tarInfo.children.get(req);
+      if (!child) {
+        throw new Error(`Requirement ${req} not found in pyodide packages tar`);
+      }
+      this.mountOverlay(child);
+      this.loadedRequirements.add(req);
+    }
+  }
+};
+
 /**
  * This stitches together the view of the site packages directory. Each
  * requirement corresponds to a folder in the original tar file. For each
@@ -33,52 +112,19 @@ const STDLIB_PACKAGES = Object.values(LOCKFILE.packages)
  * directory so we can preload them.
  */
 export function buildSitePackages(requirements) {
-  const [origTarInfo, origSoFiles] = parseTarInfo();
-  // We'd like to set USE_LOAD_PACKAGE = IS_WORKERD but we also build a funny
-  // workerd with the downstream package set. We can distinguish between them by
-  // looking at the contents. This uses the fact that the downstream set is
-  // larger, but there are a lot of differences...
-  const USE_LOAD_PACKAGE = origTarInfo.children.size < 10;
-  if (USE_LOAD_PACKAGE) {
-    requirements = new Set([...STDLIB_PACKAGES]);
-  } else {
-    requirements = new Set([...STDLIB_PACKAGES, ...requirements]);
-  }
-  const soFiles = [];
-  for (const soFile of origSoFiles) {
-    // If folder is in list of requirements include .so file in list to preload.
-    const [pkg, ...rest] = soFile.split("/");
-    if (requirements.has(pkg)) {
-      soFiles.push(rest);
-    }
-  }
-  const newTarInfo = {
-    children: new Map(),
-    mode: 0o777,
-    type: 5,
-    modtime: 0,
-    size: 0,
-    path: "",
-    name: "",
-    parts: [],
-  };
-
-  for (const req of requirements) {
-    const child = origTarInfo.children.get(req);
-    if (!child) {
-      throw new Error(`Requirement ${req} not found in pyodide packages tar`);
-    }
-    child.children.forEach((val, key) => {
-      if (newTarInfo.children.has(key)) {
-        throw new Error(
-          `File/folder ${key} being written by multiple packages`,
-        );
-      }
-      newTarInfo.children.set(key, val);
-    });
+  const [bigTarInfo, bigTarSoFiles] = parseTarInfo();
+
+  let LOAD_WHEELS_FROM_R2 = true;
+  let requirementsInBigBundle = new Set([...STDLIB_PACKAGES]);
+  if (bigTarInfo.children.size > 10) {
+    LOAD_WHEELS_FROM_R2 = false;
+    requirements.forEach(r => requirementsInBigBundle.add(r));
   }
 
-  return [newTarInfo, soFiles, USE_LOAD_PACKAGE];
+  const res = new SitePackagesDir();
+  res.addBigBundle(bigTarInfo, bigTarSoFiles, requirementsInBigBundle);
+
+  return [res, LOAD_WHEELS_FROM_R2];
 }

 /**
@@ -89,23 +135,12 @@ export function buildSitePackages(requirements) {
  * TODO: stop using loadPackage in workerd.
  */
 export function patchLoadPackage(pyodide) {
-  if (!USE_LOAD_PACKAGE) {
-    pyodide.loadPackage = disabledLoadPackage;
-    return;
-  }
-  patchFetch(new URL(WORKERD_INDEX_URL).origin);
-  const origLoadPackage = pyodide.loadPackage;
-  function loadPackage(packages, options) {
-    return origLoadPackage(packages, {
-      checkIntegrity: false,
-      ...options,
-    });
-  }
-  pyodide.loadPackage = loadPackage;
+  pyodide.loadPackage = disabledLoadPackage;
+  return;
 }
 
 function disabledLoadPackage() {
-  throw new Error("We only use loadPackage in workerd");
+  throw new Error("pyodide.loadPackage is disabled because packages are encoded in the binary");
 }
 
 /**
@@ -138,7 +173,12 @@ export function mountLib(Module, info) {
   const site_packages = getSitePackagesPath(Module);
   Module.FS.mkdirTree(site_packages);
   Module.FS.mkdirTree("/session/metadata");
-  Module.FS.mount(tarFS, { info }, site_packages);
+  if (!LOAD_WHEELS_FROM_R2) {
+    // if we are not loading additional wheels from R2, then we're done
+    // with site-packages and we can mount it here. Otherwise, we must mount it in
+    // loadPackages().
+    Module.FS.mount(tarFS, { info }, site_packages);
+  }
   Module.FS.mount(mdFS, {}, "/session/metadata");
 }
@@ -191,5 +231,4 @@ function addPackageToLoad(lockfile, name, toLoad) {
 
 export { REQUIREMENTS };
 export const TRANSITIVE_REQUIREMENTS = getTransitiveRequirements();
-export const [SITE_PACKAGES_INFO, SITE_PACKAGES_SO_FILES, USE_LOAD_PACKAGE] =
-  buildSitePackages(TRANSITIVE_REQUIREMENTS);
+export const [SITE_PACKAGES, LOAD_WHEELS_FROM_R2] = buildSitePackages(TRANSITIVE_REQUIREMENTS);
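The overlay-collision rule documented on mountOverlay() above can also be seen in isolation. A small sketch (hypothetical names, not part of the PR) modeling the same semantics with a free function:

// Entries are linked into a shared root; a name provided by two different
// bundles is a hard error rather than a silent overwrite.
const root = { children: new Map() };

function mountOverlay(rootInfo, overlayInfo) {
  overlayInfo.children.forEach((val, key) => {
    if (rootInfo.children.has(key)) {
      throw new Error(`File/folder ${key} being written by multiple packages`);
    }
    rootInfo.children.set(key, val);
  });
}

mountOverlay(root, { children: new Map([["requests", {}]]) }); // ok
mountOverlay(root, { children: new Map([["urllib3", {}]]) });  // ok
// mountOverlay(root, { children: new Map([["requests", {}]]) }); // throws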
    [Review thread on ArrayBufferReader]
    Reviewer: Should tar.js instead be changed to expect an ArrayBuffer?
    Author: hmm, then we would have to expose an ArrayBuffer instead of a PackagesTarReader, not sure if we want that big of a refactor. This interface is meant to match exactly what PackagesTarReader does in JSG.
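For context on that interface, a minimal sketch (not from the PR) of the read(offset, buf) contract that ArrayBufferReader and JSG's PackagesTarReader share, using a copy of the class from the diff above:

// read(offset, buf) fills `buf` starting at byte `offset` of the underlying
// buffer and returns how many bytes were copied (0 when reading past the end).
class ArrayBufferReader {
  constructor(arrayBuffer) {
    this.arrayBuffer = arrayBuffer;
  }
  read(offset, buf) {
    const size = this.arrayBuffer.byteLength;
    if (offset >= size || offset < 0) return 0;
    const toCopy = Math.min(buf.length, size - offset);
    buf.set(new Uint8Array(this.arrayBuffer, offset, toCopy));
    return toCopy;
  }
}

const reader = new ArrayBufferReader(new TextEncoder().encode("ustar").buffer);
const buf = new Uint8Array(4);
console.log(reader.read(0, buf)); // 4; buf now holds the bytes of "usta"
console.log(reader.read(9, buf)); // 0; offsets past the end read nothing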