Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Disk Cache for Pyodide Wheels #1851

Merged
merged 8 commits into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/pyodide/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ REPLACEMENTS = [
[
"reportUndefinedSymbols()",
"reportUndefinedSymbolsNoOp()"
],
[
"Module.reportUndefinedSymbolsNoOp()",
"Module.reportUndefinedSymbols()"
]
]

Expand Down
23 changes: 23 additions & 0 deletions src/pyodide/internal/builtin_wrappers.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { default as UnsafeEval } from "internal:unsafe-eval";
import { default as DiskCache } from "pyodide-internal:disk_cache";

let lastTime;
let lastDelta = 0;
Expand Down Expand Up @@ -129,3 +130,25 @@ export async function wasmInstantiate(module, imports) {
const instance = new WebAssembly.Instance(module, imports);
return { module, instance };
}

/**
 * Replaces `globalThis.fetch` with a wrapper that consults the disk cache before going to the
 * network, but only for requests whose URL origin equals `origin`.
 *
 * Cache entries are keyed by the last path segment of the request URL. Requests for any other
 * origin, requests whose URL has no usable file name, and unsuccessful responses are passed
 * through to the original `fetch` untouched and are never cached.
 *
 * @param {string} origin Origin (as produced by `URL.origin`) whose downloads should be cached.
 */
export function patchFetch(origin) {
  const origFetch = globalThis.fetch;
  globalThis.fetch = async function (url, options) {
    // Anything that does not expose a matching `origin` property is forwarded unchanged.
    if (url.origin !== origin) {
      return origFetch(url, options);
    }

    const fileName = url.pathname.substring(url.pathname.lastIndexOf("/") + 1);
    if (fileName === "") {
      // A URL ending in "/" has no file name, so there is nothing to key a cache entry on.
      return origFetch(url, options);
    }

    const cached = DiskCache.get(fileName);
    if (cached) {
      return new Response(cached);
    }

    // Not in the disk cache: fall back to the original fetch.
    const response = await origFetch(url, options);
    if (!response.ok) {
      // Never persist an error page (404, 500, ...) as if it were the requested file, and hand
      // the caller the real response so it can see the failing status. The body is left unread.
      return response;
    }

    const arrayBuffer = await response.arrayBuffer();
    DiskCache.put(fileName, arrayBuffer);
    // The original body has been consumed above, so return a fresh Response over the bytes.
    return new Response(arrayBuffer);
  };
}
4 changes: 3 additions & 1 deletion src/pyodide/internal/setupPackages.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ import { parseTarInfo } from "pyodide-internal:tar";
import { createTarFS } from "pyodide-internal:tarfs";
import { createMetadataFS } from "pyodide-internal:metadatafs";
import { default as LOCKFILE } from "pyodide-internal:generated/pyodide-lock.json";
import { REQUIREMENTS } from "pyodide-internal:metadata";
import { REQUIREMENTS, WORKERD_INDEX_URL } from "pyodide-internal:metadata";
import { patchFetch } from "pyodide-internal:builtin_wrappers";

const canonicalizeNameRegex = /[-_.]+/g;

Expand Down Expand Up @@ -91,6 +92,7 @@ export function patchLoadPackage(pyodide) {
pyodide.loadPackage = disabledLoadPackage;
return;
}
patchFetch(new URL(WORKERD_INDEX_URL).origin);
const origLoadPackage = pyodide.loadPackage;
function loadPackage(packages, options) {
return origLoadPackage(packages, {
Expand Down
34 changes: 34 additions & 0 deletions src/workerd/api/pyodide/pyodide.c++
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "pyodide.h"
#include <kj/string.h>
#include <workerd/util/string-buffer.h>
#include "kj/array.h"
#include "kj/common.h"
#include "kj/debug.h"
Expand Down Expand Up @@ -120,4 +122,36 @@ jsg::Ref<PyodideMetadataReader> makePyodideMetadataReader(Worker::Reader conf) {
false /* isTracing */, false /* createBaselineSnapshot */, kj::none /* memorySnapshot */);
}

const kj::Maybe<kj::Own<const kj::Directory>> DiskCache::NULL_CACHE_ROOT = kj::none;

// Looks up `key` in the cache directory and returns the full contents of the matching file.
// Yields kj::none when no cache root is configured or when no file named `key` can be opened.
jsg::Optional<kj::Array<kj::byte>> DiskCache::get(jsg::Lock& js, kj::String key) {
  KJ_IF_SOME(dir, cacheRoot) {
    const kj::Path entryPath(key);
    KJ_IF_SOME(entry, dir->tryOpenFile(entryPath)) {
      return entry->readAllBytes();
    }
  }

  // Either the cache is disabled or there is no entry for this key.
  return kj::none;
}

// Stores `data` in the cache directory under the file name `key`, creating the file if needed.
// Does nothing when no cache root is configured; logs an error when the file cannot be opened.
void DiskCache::put(jsg::Lock& js, kj::String key, kj::Array<kj::byte> data) {
  KJ_IF_SOME(dir, cacheRoot) {
    const kj::Path entryPath(key);
    const auto openMode = kj::WriteMode::CREATE | kj::WriteMode::MODIFY;

    KJ_IF_SOME(entry, dir->tryOpenFile(entryPath, openMode)) {
      entry->writeAll(data);
    } else {
      KJ_LOG(ERROR, "DiskCache: Failed to open file", key);
    }
  }
}

} // namespace workerd::api::pyodide
21 changes: 21 additions & 0 deletions src/workerd/api/pyodide/pyodide.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "kj/array.h"
#include "kj/debug.h"
#include <kj/common.h>
#include <kj/filesystem.h>
#include <pyodide/generated/pyodide_extra.capnp.h>
#include <pyodide/pyodide.capnp.h>
#include <workerd/jsg/jsg.h>
Expand Down Expand Up @@ -241,6 +242,25 @@ class DisabledInternalJaeger : public jsg::Object {
}
};

// This cache is used by Pyodide to store wheels fetched over the internet across workerd restarts in local dev only
class DiskCache: public jsg::Object {
static const kj::Maybe<kj::Own<const kj::Directory>> NULL_CACHE_ROOT; // always set to kj::none

const kj::Maybe<kj::Own<const kj::Directory>> &cacheRoot;
public:
DiskCache(): cacheRoot(NULL_CACHE_ROOT) {}; // Disabled disk cache
DiskCache(const kj::Maybe<kj::Own<const kj::Directory>> &cacheRoot): cacheRoot(cacheRoot) {};

jsg::Optional<kj::Array<kj::byte>> get(jsg::Lock& js, kj::String key);
void put(jsg::Lock& js, kj::String key, kj::Array<kj::byte> data);

JSG_RESOURCE_TYPE(DiskCache) {
JSG_METHOD(get);
JSG_METHOD(put);
}
};


// A limiter which will throw if the startup is found to exceed limits. The script will still be
// able to run for longer than the limit, but an error will be thrown as soon as the startup
// finishes. This way we can enforce a Python-specific startup limit.
Expand Down Expand Up @@ -300,6 +320,7 @@ jsg::Ref<PyodideMetadataReader> makePyodideMetadataReader(Worker::Reader conf);
api::pyodide::PackagesTarReader, \
api::pyodide::PyodideMetadataReader, \
api::pyodide::ArtifactBundler, \
api::pyodide::DiskCache, \
api::pyodide::DisabledInternalJaeger,\
api::pyodide::SimplePythonLimiter

Expand Down
3 changes: 2 additions & 1 deletion src/workerd/server/server.c++
Original file line number Diff line number Diff line change
Expand Up @@ -2650,7 +2650,8 @@ kj::Own<Server::Service> Server::makeWorker(kj::StringPtr name, config::Worker::
featureFlags.asReader(),
*limitEnforcer,
kj::atomicAddRef(*observer),
*memoryCacheProvider);
*memoryCacheProvider,
diskCacheRoot);
auto inspectorPolicy = Worker::Isolate::InspectorPolicy::DISALLOW;
if (inspectorOverride != kj::none) {
// For workerd, if the inspector is enabled, it is always fully trusted.
Expand Down
4 changes: 4 additions & 0 deletions src/workerd/server/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ class Server: private kj::TaskSet::ErrorHandler {
// Wraps file descriptor `fd` in a heap-allocated kj::FdOutputStream and stores it in
// controlOverride.
void enableControl(uint fd) {
controlOverride = kj::heap<kj::FdOutputStream>(fd);
}
// Takes ownership of `dkr` (a directory handle, or kj::none) and stores it in diskCacheRoot.
void setDiskCacheRoot(kj::Maybe<kj::Own<const kj::Directory>> &&dkr) {
diskCacheRoot = kj::mv(dkr);
}

// Runs the server using the given config.
kj::Promise<void> run(jsg::V8System& v8System, config::Config::Reader conf,
Expand Down Expand Up @@ -90,6 +93,7 @@ class Server: private kj::TaskSet::ErrorHandler {
kj::Network& network;
kj::EntropySource& entropySource;
kj::Function<void(kj::String)> reportConfigError;
kj::Maybe<kj::Own<const kj::Directory>> diskCacheRoot;

bool experimental = false;

Expand Down
29 changes: 25 additions & 4 deletions src/workerd/server/workerd-api.c++
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ struct WorkerdApi::Impl {
kj::Own<CompatibilityFlags::Reader> features;
JsgWorkerdIsolate jsgIsolate;
api::MemoryCacheProvider& memoryCacheProvider;
kj::Maybe<kj::Own<const kj::Directory>>& pyodideCacheRoot;

class Configuration {
public:
Expand All @@ -130,10 +131,11 @@ struct WorkerdApi::Impl {
CompatibilityFlags::Reader featuresParam,
IsolateLimitEnforcer& limitEnforcer,
kj::Own<jsg::IsolateObserver> observer,
api::MemoryCacheProvider& memoryCacheProvider)
api::MemoryCacheProvider& memoryCacheProvider,
kj::Maybe<kj::Own<const kj::Directory>>& pyodideCacheRoot)
: features(capnp::clone(featuresParam)),
jsgIsolate(v8System, Configuration(*this), kj::mv(observer), limitEnforcer.getCreateParams()),
memoryCacheProvider(memoryCacheProvider) {}
memoryCacheProvider(memoryCacheProvider), pyodideCacheRoot(pyodideCacheRoot) {}

static v8::Local<v8::String> compileTextGlobal(JsgWorkerdIsolate::Lock& lock,
capnp::Text::Reader reader) {
Expand Down Expand Up @@ -173,9 +175,10 @@ WorkerdApi::WorkerdApi(jsg::V8System& v8System,
CompatibilityFlags::Reader features,
IsolateLimitEnforcer& limitEnforcer,
kj::Own<jsg::IsolateObserver> observer,
api::MemoryCacheProvider& memoryCacheProvider)
api::MemoryCacheProvider& memoryCacheProvider,
kj::Maybe<kj::Own<const kj::Directory>> &pyodideCacheRoot)
: impl(kj::heap<Impl>(v8System, features, limitEnforcer, kj::mv(observer),
memoryCacheProvider)) {}
memoryCacheProvider, pyodideCacheRoot)) {}
WorkerdApi::~WorkerdApi() noexcept(false) {}

kj::Own<jsg::Lock> WorkerdApi::lock(jsg::V8StackScope& stackScope) const {
Expand Down Expand Up @@ -454,6 +457,24 @@ void WorkerdApi::compileModules(
jsg::ModuleRegistry::Type::INTERNAL);
}

// Inject disk cache module: registers the INTERNAL built-in module "pyodide-internal:disk_cache",
// whose module object wraps a DiskCache constructed from impl->pyodideCacheRoot.
{
using ModuleInfo = jsg::ModuleRegistry::ModuleInfo;
using ObjectModuleInfo = jsg::ModuleRegistry::ObjectModuleInfo;
using ResolveMethod = jsg::ModuleRegistry::ResolveMethod;
auto specifier = "pyodide-internal:disk_cache";
auto diskCache = jsg::alloc<DiskCache>(impl->pyodideCacheRoot);
modules->addBuiltinModule(
specifier,
// NOTE(review): the callback moves `diskCache` out of its capture, so it looks like it is
// expected to run at most once -- confirm against addBuiltinModule's contract.
[specifier = kj::str(specifier), diskCache = kj::mv(diskCache)](
jsg::Lock& js, ResolveMethod, kj::Maybe<const kj::Path&>&) mutable {
auto& wrapper = JsgWorkerdIsolate_TypeWrapper::from(js.v8Isolate);
auto wrap = wrapper.wrap(js.v8Context(), kj::none, kj::mv(diskCache));
return kj::Maybe(ModuleInfo(js, specifier, kj::none, ObjectModuleInfo(js, wrap)));
},
jsg::ModuleRegistry::Type::INTERNAL);
}

// Inject a (disabled) SimplePythonLimiter
{
using ModuleInfo = jsg::ModuleRegistry::ModuleInfo;
Expand Down
3 changes: 2 additions & 1 deletion src/workerd/server/workerd-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ class WorkerdApi final: public Worker::Api {
CompatibilityFlags::Reader features,
IsolateLimitEnforcer& limitEnforcer,
kj::Own<jsg::IsolateObserver> observer,
api::MemoryCacheProvider& memoryCacheProvider);
api::MemoryCacheProvider& memoryCacheProvider,
kj::Maybe<kj::Own<const kj::Directory>>& pyodideCacheRoot);
~WorkerdApi() noexcept(false);

static const WorkerdApi& from(const Worker::Api&);
Expand Down
10 changes: 9 additions & 1 deletion src/workerd/server/workerd.c++
Original file line number Diff line number Diff line change
Expand Up @@ -726,7 +726,9 @@ public:
"Useful for development, but not recommended in production.")
.addOption({"experimental"}, [this]() { server.allowExperimental(); return true; },
"Permit the use of experimental features which may break backwards "
"compatibility in a future release.");
"compatibility in a future release.")
.addOptionWithArg({"disk-cache-dir"}, CLI_METHOD(diskCacheDir), "<path>",
"Use <path> as a disk cache to avoid repeatedly fetching packages from the internet. ");
}

kj::MainFunc addServeOptions(kj::MainBuilder& builder) {
Expand Down Expand Up @@ -927,6 +929,12 @@ public:
server.enableControl(fd);
}

// Handler for the --disk-cache-dir option: resolves `pathStr` against the current working
// directory, opens that directory for writing, and hands it to the server as the disk cache root.
void diskCacheDir(kj::StringPtr pathStr) {
  kj::Path path = fs->getCurrentPath().eval(pathStr);

  // Create the directory (and any missing parents) when it does not exist yet. With MODIFY alone,
  // tryOpenSubdir() yields kj::none for a missing directory, which would silently disable the
  // cache the first time the option is used.
  const auto openMode =
      kj::WriteMode::CREATE | kj::WriteMode::MODIFY | kj::WriteMode::CREATE_PARENT;
  kj::Maybe<kj::Own<const kj::Directory>> dir = fs->getRoot().tryOpenSubdir(path, openMode);
  server.setDiskCacheRoot(kj::mv(dir));
}
Comment on lines +932 to +936
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this parameter have a default value pointing at $TMPDIR/workerd-python-package-cache?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On Linux and macOS I'd like to see it default to:

  • $XDG_CACHE_HOME/workerd-python-package-cache if XDG_CACHE_HOME is defined
  • $HOME/.cache/workerd-python-package-cache otherwise.

Unless you have a strong opinion that it should go in /tmp instead.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, better to follow the XDG spec here.

For macOS I think we want ~/Library/Caches/ and I guess %LOCALAPPDATA% for Windows.


void watch() {
#if _WIN32
auto& w = watcher.emplace(io.win32EventPort);
Expand Down
3 changes: 2 additions & 1 deletion src/workerd/tests/test-fixture.c++
Original file line number Diff line number Diff line change
Expand Up @@ -278,12 +278,13 @@ TestFixture::TestFixture(SetupParams&& params)
isolateLimitEnforcer(kj::heap<MockIsolateLimitEnforcer>()),
errorReporter(kj::heap<MockErrorReporter>()),
memoryCacheProvider(kj::heap<api::MemoryCacheProvider>()),
diskCacheRoot(kj::none),
api(kj::heap<server::WorkerdApi>(
testV8System,
params.featureFlags.orDefault(CompatibilityFlags::Reader()),
*isolateLimitEnforcer,
kj::atomicRefcounted<IsolateObserver>(),
*memoryCacheProvider)),
*memoryCacheProvider, diskCacheRoot)),
workerIsolate(kj::atomicRefcounted<Worker::Isolate>(
kj::mv(api),
kj::atomicRefcounted<IsolateObserver>(),
Expand Down
1 change: 1 addition & 0 deletions src/workerd/tests/test-fixture.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ struct TestFixture {
kj::Own<IsolateLimitEnforcer> isolateLimitEnforcer;
kj::Own<Worker::ValidationErrorReporter> errorReporter;
kj::Own<api::MemoryCacheProvider> memoryCacheProvider;
kj::Maybe<kj::Own<const kj::Directory>> diskCacheRoot;
kj::Own<Worker::Api> api;
kj::Own<Worker::Isolate> workerIsolate;
kj::Own<Worker::Script> workerScript;
Expand Down
Loading