diff --git a/src/pyodide/BUILD.bazel b/src/pyodide/BUILD.bazel index 453e61f56ac..faf31e05ae8 100644 --- a/src/pyodide/BUILD.bazel +++ b/src/pyodide/BUILD.bazel @@ -117,14 +117,14 @@ REPLACEMENTS = [ "Date.now", "monotonicDateNow", ], - [ - "crypto.getRandomValues", - "getRandomValues" - ], [ "reportUndefinedSymbols()", "reportUndefinedSymbolsNoOp()" ], + [ + "crypto.getRandomValues(", + "getRandomValues(Module, ", + ] ] load("//:build/pyodide_bucket.bzl", "PYODIDE_PACKAGE_BUCKET_URL") @@ -153,12 +153,14 @@ wd_js_bundle( internal_data_modules = ["generated/python_stdlib.zip"] + glob([ "internal/*.py", "internal/patches/*.py", + "internal/topLevelEntropy/*.py", ]), internal_json_modules = ["generated/pyodide-lock.json", "generated/pyodide-bucket.json"], internal_modules = [ "generated/pyodide.asm.js", ] + glob([ "internal/*.js", + "internal/topLevelEntropy/*.js", ]), internal_wasm_modules = ["generated/pyodide.asm.wasm"], schema_id = "0xbcc8f57c63814005", diff --git a/src/pyodide/internal/builtin_wrappers.js b/src/pyodide/internal/builtin_wrappers.js index b4006467897..38bc20b9328 100644 --- a/src/pyodide/internal/builtin_wrappers.js +++ b/src/pyodide/internal/builtin_wrappers.js @@ -1,5 +1,6 @@ import { default as UnsafeEval } from "internal:unsafe-eval"; import { default as DiskCache } from "pyodide-internal:disk_cache"; +export { getRandomValues } from "pyodide-internal:topLevelEntropy/lib"; let lastTime; let lastDelta = 0; @@ -19,23 +20,6 @@ export function monotonicDateNow() { return now + lastDelta; } -/** - * We initialize Python at top level, but it tries to initialize the random seed with - * crypto.getRandomValues which will fail at top level. So we don't produce any entropy the first - * time around and we reseed the rng in the first request context before executing user code. 
- */ -export function getRandomValues(arr) { - try { - return crypto.getRandomValues(arr); - } catch (e) { - if (e.message.includes("Disallowed operation called within global scope")) { - // random.seed() can't work at startup. We'll seed again under the request scope. - return arr; - } - throw e; - } -} - /** * First check that the callee is what we expect, then use `UnsafeEval` to * construct a `WasmModule`. diff --git a/src/pyodide/internal/python.js b/src/pyodide/internal/python.js index b598a379b38..63e67c627a2 100644 --- a/src/pyodide/internal/python.js +++ b/src/pyodide/internal/python.js @@ -1,3 +1,4 @@ +Error.stackTraceLimit = Infinity; import { enterJaegerSpan } from "pyodide-internal:jaeger"; import { SITE_PACKAGES_INFO, @@ -12,6 +13,11 @@ import { maybeSetupSnapshotUpload, restoreSnapshot, } from "pyodide-internal:snapshot"; +import { + entropyMountFiles, + entropyAfterRuntimeInit, + entropyBeforeTopLevel, +} from "pyodide-internal:topLevelEntropy/lib"; /** * This file is a simplified version of the Pyodide loader: @@ -121,8 +127,9 @@ function getEmscriptenSettings(lockfile, indexURL) { // environment variables go here env: { HOME: "/session", - // We don't have access to cryptographic rng at startup so we cannot support hash - // randomization. Setting `PYTHONHASHSEED` disables it. + // We don't have access to entropy at startup so we cannot support hash + // randomization. Setting `PYTHONHASHSEED` disables it. See further + // discussion in topLevelEntropy/entropy_patches.py PYTHONHASHSEED: "111", }, // This is the index that we use as the base URL to fetch the wheels. @@ -177,20 +184,23 @@ async function instantiateEmscriptenModule(emscriptenSettings) { * APIs, we call this function. If `MEMORY` is defined, then we will have passed * `noInitialRun: true` and so the C runtime is in an incoherent state until we * restore the linear memory from the snapshot. - * - * Returns `true` when existing memory snapshot was loaded. 
*/ async function prepareWasmLinearMemory(Module) { // Note: if we are restoring from a snapshot, runtime is not initialized yet. mountLib(Module, SITE_PACKAGES_INFO); + entropyMountFiles(Module); if (SHOULD_RESTORE_SNAPSHOT) { restoreSnapshot(Module); - // Don't call adjustSysPath here: it was called in the other branch when we - // were creating the snapshot so the outcome of that is already baked in. + } + // entropyAfterRuntimeInit adjusts JS state ==> always needs to be called. + entropyAfterRuntimeInit(Module); + if (SHOULD_RESTORE_SNAPSHOT) { return; } + // The effects of these are purely in Python state so they only need to be run + // if we didn't restore a snapshot. + entropyBeforeTopLevel(Module); adjustSysPath(Module); - maybeSetupSnapshotUpload(Module); } export async function loadPyodide(lockfile, indexURL) { @@ -201,6 +211,7 @@ export async function loadPyodide(lockfile, indexURL) { await enterJaegerSpan("prepare_wasm_linear_memory", () => prepareWasmLinearMemory(Module), ); + maybeSetupSnapshotUpload(Module); // Finish setting up Pyodide's ffi so we can use the nice Python interface await enterJaegerSpan("finalize_bootstrap", Module.API.finalizeBootstrap); diff --git a/src/pyodide/internal/snapshot.js b/src/pyodide/internal/snapshot.js index c0c0b336120..6968896addf 100644 --- a/src/pyodide/internal/snapshot.js +++ b/src/pyodide/internal/snapshot.js @@ -15,6 +15,8 @@ import { } from "pyodide-internal:metadata"; import { reportError, simpleRunPython } from "pyodide-internal:util"; +let LOADED_BASELINE_SNAPSHOT; + /** * This file is a simplified version of the Pyodide loader: * https://github.com/pyodide/pyodide/blob/main/src/js/pyodide.ts @@ -29,8 +31,10 @@ import { reportError, simpleRunPython } from "pyodide-internal:util"; */ import { _createPyodideModule } from "pyodide-internal:generated/pyodide.asm"; -const TOP_LEVEL_SNAPSHOT = ArtifactBundler.isEwValidating() || SHOULD_SNAPSHOT_TO_DISK; -const SHOULD_UPLOAD_SNAPSHOT = 
ArtifactBundler.isEnabled() || TOP_LEVEL_SNAPSHOT; +const TOP_LEVEL_SNAPSHOT = + ArtifactBundler.isEwValidating() || SHOULD_SNAPSHOT_TO_DISK; +const SHOULD_UPLOAD_SNAPSHOT = + ArtifactBundler.isEnabled() || TOP_LEVEL_SNAPSHOT; /** * Global variable for the memory snapshot. On the first run we stick a copy of @@ -116,10 +120,7 @@ const PRELOADED_SO_FILES = []; */ export function preloadDynamicLibs(Module) { let SO_FILES_TO_LOAD = SITE_PACKAGES_SO_FILES; - if ( - IS_CREATING_BASELINE_SNAPSHOT || - DSO_METADATA?.settings?.baselineSnapshot - ) { + if (LOADED_BASELINE_SNAPSHOT && LOADED_SNAPSHOT_VERSION === 1) { // Ideally this should be just // [[ '_lzma.so' ], [ '_ssl.so' ]] // but we put a few more because we messed up the memory snapshot... @@ -129,7 +130,12 @@ export function preloadDynamicLibs(Module) { ["_sqlite3.so"], ["_ssl.so"], ]; - // SO_FILES_TO_LOAD = [[ '_lzma.so' ], [ '_ssl.so' ]]; + } + if ( + IS_CREATING_BASELINE_SNAPSHOT || + (LOADED_BASELINE_SNAPSHOT && LOADED_SNAPSHOT_VERSION === 2) + ) { + SO_FILES_TO_LOAD = [["_lzma.so"], ["_ssl.so"]]; } try { const sitePackages = getSitePackagesPath(Module); @@ -313,8 +319,9 @@ export function maybeSetupSnapshotUpload(Module) { // "\x00snp" const SNAPSHOT_MAGIC = 0x706e7300; -const SNAPSHOT_VERSION = 1; +const CREATE_SNAPSHOT_VERSION = 2; const HEADER_SIZE = 4 * 4; +export let LOADED_SNAPSHOT_VERSION = undefined; /** * Encode heap and dsoJSON into the memory snapshot artifact that we'll upload @@ -332,7 +339,7 @@ function encodeSnapshot(heap, dsoJSON) { ); const uint32View = new Uint32Array(toUpload.buffer); uint32View[0] = SNAPSHOT_MAGIC; - uint32View[1] = SNAPSHOT_VERSION; + uint32View[1] = CREATE_SNAPSHOT_VERSION; uint32View[2] = snapshotOffset; uint32View[3] = jsonLength; toUpload.subarray(snapshotOffset).set(heap); @@ -347,9 +354,9 @@ function decodeSnapshot() { let offset = 0; MEMORY_SNAPSHOT_READER.readMemorySnapshot(offset, buf); offset += 8; - let snapshotVersion = 0; + LOADED_SNAPSHOT_VERSION = 0; if 
(buf[0] == SNAPSHOT_MAGIC) { - snapshotVersion = buf[1]; + LOADED_SNAPSHOT_VERSION = buf[1]; MEMORY_SNAPSHOT_READER.readMemorySnapshot(offset, buf); offset += 8; } @@ -361,6 +368,7 @@ function decodeSnapshot() { MEMORY_SNAPSHOT_READER.readMemorySnapshot(offset, jsonBuf); const jsonTxt = new TextDecoder().decode(jsonBuf); DSO_METADATA = JSON.parse(jsonTxt); + LOADED_BASELINE_SNAPSHOT = Number(DSO_METADATA?.settings?.baselineSnapshot); READ_MEMORY = function (Module) { // restore memory from snapshot MEMORY_SNAPSHOT_READER.readMemorySnapshot(snapshotOffset, Module.HEAP8); @@ -423,5 +431,6 @@ export function maybeStoreMemorySnapshot() { ArtifactBundler.storeMemorySnapshot(getMemoryToUpload()); } else if (SHOULD_SNAPSHOT_TO_DISK) { DiskCache.put("snapshot.bin", getMemoryToUpload()); + console.log("Saved snapshot to disk"); } } diff --git a/src/pyodide/internal/topLevelEntropy/__init__.py b/src/pyodide/internal/topLevelEntropy/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/pyodide/internal/topLevelEntropy/entropy_import_context.py b/src/pyodide/internal/topLevelEntropy/entropy_import_context.py new file mode 100644 index 00000000000..74f1db22d8f --- /dev/null +++ b/src/pyodide/internal/topLevelEntropy/entropy_import_context.py @@ -0,0 +1,133 @@ +""" +Manage import context for modules that use getentropy() at startup. + +We install a metapath finder in import_patch_manager.py which executes the +module in the context manager returned by +get_entropy_import_context(module_name). + +This module defines get_entropy_import_context, which looks up that context manager by module name. + +"random" and "numpy.random.mtrand" also have some additional patches that need +to be installed as part of their import context to prevent top level crashes. + +Other rust packages are likely to need similar treatment to pydantic_core. 
+""" + +from contextlib import contextmanager +from array import array +from .import_patch_manager import block_calls + +import sys + +RUST_PACKAGES = ["pydantic_core", "tiktoken"] +MODULES_TO_PATCH = ["random", "numpy.random", "numpy.random.mtrand"] + RUST_PACKAGES + +# Control number of allowed entropy calls. + +ALLOWED_ENTROPY_CALLS = array("b", [0]) + + +def get_bad_entropy_flag(): + # simpleRunPython reads out stderr. We put the address there so we can fish it out... + # We could use ctypes instead of array but ctypes weighs an extra 100kb compared to array. + print(ALLOWED_ENTROPY_CALLS.buffer_info()[0], file=sys.stderr) + + +def is_bad_entropy_enabled(): + """This is used in entropy_patches.py to let calls to disabled functions + through if we are allowing bad entropy + """ + return ALLOWED_ENTROPY_CALLS[0] > 0 + + +@contextmanager +def allow_bad_entropy_calls(n): + ALLOWED_ENTROPY_CALLS[0] = n + yield + if ALLOWED_ENTROPY_CALLS[0] > 0: + raise RuntimeError( + f"{ALLOWED_ENTROPY_CALLS[0]} unexpected leftover getentropy calls " + ) + + +# Module instantiation context managers + + +def get_entropy_import_context(name): + """Look up the import context. + + If there is a function called `{name}_context`, we'll use that. Otherwise, + we have a default for rust packages. (Currently only used for tiktoken). 
+ """ + if name not in MODULES_TO_PATCH: + return None + funcname = name.replace(".", "_").replace("-", "_") + "_context" + res = globals().get(funcname, None) + if res: + return res + if name in RUST_PACKAGES: + # Initial import needs one entropy call to initialize std::collections::HashMap hash seed + return rust_package_context + raise Exception(f"Missing context for {name}") + + +@contextmanager +def rust_package_context(module): + """Rust packages need one entropy call if they create a rust hash map at + init time.""" + with allow_bad_entropy_calls(1): + yield + + +@contextmanager +def random_context(module): + """Importing random calls getentropy() 10 times it seems""" + with allow_bad_entropy_calls(10): + yield + # Block calls to functions that use the bad random seed we produced from the + # ten getentropy() calls. Instantiating Random with a given seed is fine, + # instantiating it without a seed will call getentropy() and fail. + # Instantiating SystemRandom is fine, calling its methods will call + # getentropy() and fail. + block_calls(module, allowlist=["Random", "SystemRandom"]) + + +@contextmanager +def numpy_random_context(module): + """numpy.random doesn't call getentropy() itself, but we want to block calls + that might use the bad seed. + + TODO: Maybe there are more calls we can whitelist? + TODO: Is it not enough to just block numpy.random.mtrand calls? + """ + yield + # Calling default_rng() with a given seed is fine, calling it without a seed + # will call getentropy() and fail. + block_calls(module, allowlist=["default_rng"]) + + +@contextmanager +def numpy_random_mtrand_context(module): + # numpy.random.mtrand calls secrets.randbits at top level to seed itself. + # This will fail if we don't let it through. + with allow_bad_entropy_calls(1): + yield + # Block calls until we get a chance to replace the bad random seed. 
+ block_calls(module) + + +@contextmanager +def pydantic_core_context(module): + try: + # Initial import needs one entropy call to initialize std::collections::HashMap hash seed + with allow_bad_entropy_calls(1): + yield + finally: + try: + with allow_bad_entropy_calls(1): + # validate_core_schema makes an ahash::AHashMap which makes another entropy call for + # its hash seed. It will throw an error but only after making the needed entropy + # call. + module.validate_core_schema(None) + except module.SchemaError: + pass diff --git a/src/pyodide/internal/topLevelEntropy/entropy_patches.py b/src/pyodide/internal/topLevelEntropy/entropy_patches.py new file mode 100644 index 00000000000..93e681b6523 --- /dev/null +++ b/src/pyodide/internal/topLevelEntropy/entropy_patches.py @@ -0,0 +1,134 @@ +""" +Handle the top level getentropy() mess: + +The C stdlib function `getentropy()` calls +`crypto.getRandomValues()` but this throws an error at top level which causes a +fatal error. + +Goals: + +1. Avoid top-level calls to the C stdlib function getentropy(), these fatally + fail. Patch these to raise Python errors instead. +2. Allow top level import of `random` and `numpy.random` modules. These seed + themselves with the functions that we patched in step 1, we temporarily + replace the `getentropy()` calls with no-ops to let them through. +3. Install wrapper modules at top level that only allow calls to a whitelisted + set of functions from `random` and `numpy.random` that don't use the bad + seeds that came from step 2. +4. Put it all back. +5. Reseed the rng before entering the request scope for the first time. + +Steps 1, part of 4, and 5 are handled here, steps 2, 3, and part of 4 are +handled in _cloudflare_random_overlays. 
+""" + +import _random +import sys +import os +from functools import wraps + +from .entropy_import_context import is_bad_entropy_enabled, get_entropy_import_context +from .import_patch_manager import ( + install_import_patch_manager, + remove_import_patch_manager, +) + +IN_REQUEST_CONTEXT = False + + +def should_allow_entropy_call(): + """This helps us raise Python errors rather than fatal errors in some cases. + + It doesn't really matter that much since we're not likely to recover from + these anyways but it feels better. + """ + # Allow if we've either entered request context or if we've temporarily enabled entropy. + return IN_REQUEST_CONTEXT or is_bad_entropy_enabled() + + +# Step 1. +# +# Prevent calls to getentropy(). The intended way for `getentropy()` to fail is to set an EIO error, +# which turns into a Python OSError, so we raise this same error so that if we patch `getentropy` +# from the Emscripten C stdlib we can remove these patches without changing the behavior. + +EIO = 29 + +orig_urandom = os.urandom + + +@wraps(orig_urandom) +def patch_urandom(*args): + if not should_allow_entropy_call(): + raise OSError(EIO, "Cannot get entropy outside of request context") + return orig_urandom(*args) + + +def disable_urandom(): + """ + Python os.urandom() calls C getentropy() which calls JS crypto.getRandomValues() which throws at + top level, fatally crashing the interpreter. + + TODO: Patch Emscripten's getentropy() to return EIO if `crypto.getRandomValues()` throws. Then + we can remove this. + """ + os.urandom = patch_urandom + + +def restore_urandom(): + os.urandom = orig_urandom + + +orig_Random_seed = _random.Random.seed + + +@wraps(orig_Random_seed) +def patched_seed(self, val): + """ + Random.seed calls _PyOs_URandom which will fatally fail in top level. Prevent this by raising an + OSError instead. 
+ """ + if val is None and not should_allow_entropy_call(): + raise OSError(EIO, "Cannot get entropy outside of request context") + return orig_Random_seed(self, val) + + +def disable_random_seed(): + # Install patch to block calls to PyOs_URandom + _random.Random.seed = patched_seed + + +def restore_random_seed(): + # Restore original random seed behavior + _random.Random.seed = orig_Random_seed + + +def reseed_rng(): + """ + Step 5: Have to reseed randomness in the IoContext of the first request since we gave a low + quality seed when it was seeded at top level. + """ + from random import seed + + seed() + + if "numpy.random" in sys.modules: + from numpy.random import seed + + seed() + + +def before_top_level(): + disable_urandom() + disable_random_seed() + install_import_patch_manager(get_entropy_import_context) + + +def before_first_request(): + global IN_REQUEST_CONTEXT + + IN_REQUEST_CONTEXT = True + restore_urandom() + restore_random_seed() + remove_import_patch_manager() + reseed_rng() diff --git a/src/pyodide/internal/topLevelEntropy/import_patch_manager.py b/src/pyodide/internal/topLevelEntropy/import_patch_manager.py new file mode 100644 index 00000000000..2599cdda23f --- /dev/null +++ b/src/pyodide/internal/topLevelEntropy/import_patch_manager.py @@ -0,0 +1,163 @@ +""" +A metapath finder which calls get_import_context(module_name). If it returns a +value that is not None, this is interpreted as a context manager that should be +used when executing the module top level scope. + +When we're done, we put back the original module. The wrapper module and wrapper +stubs will persist in the wild, so we need to make sure they behave the same way +as the originals after we put them back. This is controlled by the +IN_REQUEST_CONTEXT variable. 
+""" + +from functools import wraps +import sys + + +class PatchLoader: + """Loader that calls the original loader in the given context manager""" + + def __init__(self, orig_loader, import_context): + self.orig_loader = orig_loader + self.import_context = import_context + + def __getattr__(self, name): + return getattr(self.orig_loader, name) + + def exec_module(self, module): + with self.import_context(module): + self.orig_loader.exec_module(module) + + +class PatchFinder: + """Finder that returns our PatchLoader if get_import_context returns an import + context for the module. Otherwise, return None. + """ + + def __init__(self, get_import_context): + self.get_import_context = get_import_context + + def invalidate_caches(self): + pass + + def find_spec( + self, + fullname: str, + path, + target, + ): + import_context = self.get_import_context(fullname) + if not import_context: + # Not ours + return None + + for finder in sys.meta_path: + if isinstance(finder, PatchFinder): + # Avoid infinite recurse. Presumably this is the first entry. + continue + spec = finder.find_spec(fullname, path, target) + if spec: + # Found original module spec + break + else: + # Not found. This is going to be an ImportError. 
+ return None + # Overwrite the loader with our wrapped loader + spec.loader = PatchLoader(spec.loader, import_context) + return spec + + @staticmethod + def install(get_import_context): + sys.meta_path.insert(0, PatchFinder(get_import_context)) + + @staticmethod + def remove(): + for idx, val in enumerate(sys.meta_path): + if isinstance(val, PatchFinder): + break + del sys.meta_path[idx] + + +def install_import_patch_manager(get_import_context): + PatchFinder.install(get_import_context) + + +def remove_import_patch_manager(): + PatchFinder.remove() + unblock_calls() + + +# We remove the metapath entry and replace the patched sys.modules entries with +# the original modules before the request context, but the patched copies can +# still be used from top level imports. When IN_REQUEST_CONTEXT is True, we need +# to make sure that our patches behave like the original imports. +IN_REQUEST_CONTEXT = False +# Keep track of the unblocked modules so we can put them back into sys.modules +# when we're done. +ORIG_MODULES = {} + + +def block_calls(module, *, allowlist=[]): + # Called from the import context for modules that need to block calls. + sys.modules[module.__name__] = BlockedCallModule(module, allowlist) + ORIG_MODULES[module.__name__] = module + + +def unblock_calls(): + # Remove the patches when we're ready to enable entropy calls. + global IN_REQUEST_CONTEXT + + IN_REQUEST_CONTEXT = True + for name, val in ORIG_MODULES.items(): + sys.modules[name] = val + + +class BlockedCallModule: + """A proxy class that wraps a module that we want to block calls to + + Attribute access is passed on to the original module but if the result is a + callable that isn't in the allow list, we wrap it with a function that + raises an error unless IN_REQUEST_CONTEXT is true. + + Note that because we define __getattribute__ and __setattr__, we cannot do + direct reads or assignments e.g., `self.a = 1`. This risks recursion errors + if there is a typo. 
Instead, we have to call super().__setattr__. + + This has the advantage that it avoids name clashes if the proxied module + actually defines variables called _mod or _allow_list. + """ + + def __init__(self, module, allowlist): + super().__setattr__("_mod", module) + super().__setattr__("_allow_list", allowlist) + + def __getattribute__(self, key): + mod = super().__getattribute__("_mod") + orig = getattr(mod, key) + if IN_REQUEST_CONTEXT: + return orig + if not callable(orig): + return orig + + if key in super().__getattribute__("_allow_list"): + return orig + + # If we aren't in a request scope, the value is a callable, and it's not in the allow_list, + # return a wrapper that raises an error if it's called before entering the request scope. + # TODO: this doesn't wrap classes correctly, does it matter? + @wraps(orig) + def wrapper(*args, **kwargs): + if not IN_REQUEST_CONTEXT: + raise RuntimeError( + f"Cannot use {mod.__name__}.{key}() outside of request context" + ) + return orig(*args, **kwargs) + + return wrapper + + def __setattr__(self, key, val): + mod = super().__getattribute__("_mod") + setattr(mod, key, val) + + def __dir__(self): + mod = super().__getattribute__("_mod") + return dir(mod) diff --git a/src/pyodide/internal/topLevelEntropy/lib.js b/src/pyodide/internal/topLevelEntropy/lib.js new file mode 100644 index 00000000000..0170b77f791 --- /dev/null +++ b/src/pyodide/internal/topLevelEntropy/lib.js @@ -0,0 +1,177 @@ +/** + * Handle the top level getentropy() mess. See entropy_patches.py which is the + * main file for the entropy patches. + * + * This file installs the relevant files and calls the exports from + * entropy_patches.py. setupShouldAllowBadEntropy reads out the address of the + * byte that we use to control calls to crypto.getRandomValues from Python. 
+ */ + +import { default as entropyPatches } from "pyodide-internal:topLevelEntropy/entropy_patches.py"; +import { default as entropyImportContext } from "pyodide-internal:topLevelEntropy/entropy_import_context.py"; +import { default as importPatchManager } from "pyodide-internal:topLevelEntropy/import_patch_manager.py"; +import { IS_TRACING } from "pyodide-internal:metadata"; +import { LOADED_SNAPSHOT_VERSION } from "pyodide-internal:snapshot"; +import { simpleRunPython } from "pyodide-internal:util"; + +// TODO: When we've updated all the snapshots, remove this. +const SHOULD_GATE_ENTROPY = + !IS_TRACING && + (LOADED_SNAPSHOT_VERSION === undefined || LOADED_SNAPSHOT_VERSION === 2); + +let allowed_entropy_calls_addr; + +/** + * Set up a byte for communication between JS and Python. + * + * We make an array in Python and then get its address in JavaScript so + * shouldAllowBadEntropy can check / write back the value + */ +function setupShouldAllowBadEntropy(Module) { + // get_bad_entropy_flag prints the address we want into stderr which is returned into res. + // We parse this as an integer. + const res = simpleRunPython( + Module, + "from _cloudflare.entropy_import_context import get_bad_entropy_flag;" + + "get_bad_entropy_flag();" + + "del get_bad_entropy_flag", + ); + allowed_entropy_calls_addr = Number(res); +} + +function shouldAllowBadEntropy(Module) { + if (!SHOULD_GATE_ENTROPY) { + return true; + } + const val = Module.HEAP8[allowed_entropy_calls_addr]; + if (val) { + Module.HEAP8[allowed_entropy_calls_addr]--; + return true; + } + return false; +} + +/** + * Some packages need hash or random seeds at import time. We carefully track + * how much bad entropy we're giving everyone so that hopefully none of it ends + * up in a place where the end user needed good entropy. In particular, we think + * it's acceptable to give poor entropy for hash seeds but not for random seeds. 
+ * The random libraries are allowed to initialize themselves with a bad seed but + * we disable them until we have a chance to reseed. + * + * See entropy_import_context.py where `allow_bad_entropy_calls` is used to dole + * out the bad entropy. + */ +export function getRandomValues(Module, arr) { + try { + return crypto.getRandomValues(arr); + } catch (e) { + if ( + !e.message.includes("Disallowed operation called within global scope") + ) { + Module._dump_traceback(); + throw e; + } + if (!shouldAllowBadEntropy(Module)) { + Module._dump_traceback(); + throw e; + } + // "entropy" in the test suite is a bunch of 42's. Good to use a readily identifiable pattern + // here which is different than the test suite. + arr.fill(43); + } +} + +/** + * We call this regardless of whether we are restoring from a snapshot or not, + * after instantiating the Emscripten module but before restoring the snapshot. + * Hypothetically, we could skip it for new dedicated snapshots. + */ +export function entropyMountFiles(Module) { + Module.FS.mkdir(`/lib/python3.12/site-packages/_cloudflare`); + Module.FS.writeFile( + `/lib/python3.12/site-packages/_cloudflare/__init__.py`, + new Uint8Array(0), + { canOwn: true }, + ); + Module.FS.writeFile( + `/lib/python3.12/site-packages/_cloudflare/entropy_patches.py`, + new Uint8Array(entropyPatches), + { canOwn: true }, + ); + Module.FS.writeFile( + `/lib/python3.12/site-packages/_cloudflare/entropy_import_context.py`, + new Uint8Array(entropyImportContext), + { canOwn: true }, + ); + Module.FS.writeFile( + `/lib/python3.12/site-packages/_cloudflare/import_patch_manager.py`, + new Uint8Array(importPatchManager), + { canOwn: true }, + ); +} + +/** + * This prepares us to execute the top level scope. It changes JS state so it + * needs to be called whether restoring snapshot or not. We have to call this + * after the runtime is ready, so after restoring the snapshot in the snapshot + * branch and after entropyMountFiles in the no-snapshot branch. 
+ */ +export function entropyAfterRuntimeInit(Module) { + setupShouldAllowBadEntropy(Module); +} + +/** + * This prepares us to execute the top level scope. It changes only Python state + * so it doesn't need to be called when restoring from snapshot. + */ +export function entropyBeforeTopLevel(Module) { + if (!SHOULD_GATE_ENTROPY) { + return; + } + simpleRunPython( + Module, + ` +from _cloudflare.entropy_patches import before_top_level +before_top_level() +del before_top_level +`, + ); +} + +let isReady = false; +/** + * Called to reseed rngs and turn off blocks that prevent access to rng APIs. + */ +export function entropyBeforeRequest(Module) { + if (isReady) { + // I think this is only ever called once, but we guard it just to be sure. + return; + } + isReady = true; + if (SHOULD_GATE_ENTROPY) { + simpleRunPython( + Module, + ` +from _cloudflare.entropy_patches import before_first_request +before_first_request() +del before_first_request + `, + ); + } else { + // If we shouldn't gate entropy, we just need to reseed_rng. We first have + // to call invalidate_caches b/c the snapshot doesn't know about + // _cloudflare.entropy_patches. + simpleRunPython( + Module, + ` +from importlib import invalidate_caches +invalidate_caches() +del invalidate_caches +from _cloudflare.entropy_patches import reseed_rng +reseed_rng() +del reseed_rng + `, + ); + } +} diff --git a/src/pyodide/internal/util.js b/src/pyodide/internal/util.js index 9f775d299d2..f7b3b62ab55 100644 --- a/src/pyodide/internal/util.js +++ b/src/pyodide/internal/util.js @@ -41,4 +41,5 @@ export function simpleRunPython(emscriptenModule, code) { } throw new Error("Failed"); } + return err; } diff --git a/src/pyodide/python-entrypoint-helper.js b/src/pyodide/python-entrypoint-helper.js index 7e17da36c72..7bd45e17691 100644 --- a/src/pyodide/python-entrypoint-helper.js +++ b/src/pyodide/python-entrypoint-helper.js @@ -2,7 +2,10 @@ // python-entrypoint.js USER module. 
import { loadPyodide } from "pyodide-internal:python"; -import { uploadArtifacts, maybeStoreMemorySnapshot } from "pyodide-internal:snapshot"; +import { + uploadArtifacts, + maybeStoreMemorySnapshot, +} from "pyodide-internal:snapshot"; import { enterJaegerSpan } from "pyodide-internal:jaeger"; import { REQUIREMENTS, @@ -19,6 +22,7 @@ import { } from "pyodide-internal:metadata"; import { reportError } from "pyodide-internal:util"; import { default as Limiter } from "pyodide-internal:limiter"; +import { entropyBeforeRequest } from "pyodide-internal:topLevelEntropy/lib"; function pyimportMainModule(pyodide) { if (!MAIN_MODULE_NAME.endsWith(".py")) { @@ -124,27 +128,11 @@ function getMainModule() { }); } -/** - * Have to reseed randomness in the IoContext of the first request since we gave a low quality seed - * when it was seeded at top level. - */ -let isSeeded = false; -function reseedRandom(pyodide) { - if (isSeeded) { - return; - } - isSeeded = true; - pyodide.runPython(` - from random import seed - seed() - del seed - `); -} - async function preparePython() { const pyodide = await getPyodide(); - reseedRandom(pyodide); - return await getMainModule(); + const mainModule = await getMainModule(); + entropyBeforeRequest(pyodide._module); + return mainModule; } function makeHandler(pyHandlerName) { diff --git a/src/workerd/server/tests/python/BUILD.bazel b/src/workerd/server/tests/python/BUILD.bazel index 7c178564672..380c8d60f2c 100644 --- a/src/workerd/server/tests/python/BUILD.bazel +++ b/src/workerd/server/tests/python/BUILD.bazel @@ -22,6 +22,17 @@ wd_test( ), ) +wd_test( + src = "random/random.wd-test", + args = ["--experimental"], + data = glob( + [ + "random/*", + ], + exclude = ["**/*.wd-test"], + ), +) + # langchain test: disabled for now because it's flaky # TODO: reenable this? 
# diff --git a/src/workerd/server/tests/python/random/random.wd-test b/src/workerd/server/tests/python/random/random.wd-test new file mode 100644 index 00000000000..612bdd1645b --- /dev/null +++ b/src/workerd/server/tests/python/random/random.wd-test @@ -0,0 +1,15 @@ +using Workerd = import "/workerd/workerd.capnp"; + +const unitTests :Workerd.Config = ( + services = [ + ( name = "python-hello", + worker = ( + modules = [ + (name = "worker.py", pythonModule = embed "worker.py") + ], + compatibilityDate = "2024-01-15", + compatibilityFlags = ["python_workers"], + ) + ), + ], +); diff --git a/src/workerd/server/tests/python/random/worker.py b/src/workerd/server/tests/python/random/worker.py new file mode 100644 index 00000000000..df63bcfa830 --- /dev/null +++ b/src/workerd/server/tests/python/random/worker.py @@ -0,0 +1,57 @@ +""" +Verify that calling `random` at the top-level throws. + +Calls to random should only work inside a request context. +""" + +from random import random, randbytes, choice + +try: + random() +except RuntimeError as e: + assert ( + repr(e) + == "RuntimeError('Cannot use random.random() outside of request context')" + ) +else: + assert False + +try: + randbytes(5) +except RuntimeError as e: + assert ( + repr(e) + == "RuntimeError('Cannot use random.randbytes() outside of request context')" + ) +else: + assert False + +try: + choice([1, 2, 3]) +except RuntimeError as e: + assert ( + repr(e) + == "RuntimeError('Cannot use random.choice() outside of request context')" + ) +else: + assert False + + +def t1(): + from random import random, randbytes + + random() + randbytes(5) + choice([1, 2, 3]) + + +def t2(): + random() + randbytes(5) + choice([1, 2, 3]) + + t1() + + +def test(): + t2()