diff --git a/src/pyodide/BUILD.bazel b/src/pyodide/BUILD.bazel index 453e61f56aca..7d2c354aedd3 100644 --- a/src/pyodide/BUILD.bazel +++ b/src/pyodide/BUILD.bazel @@ -81,7 +81,7 @@ copy_file( # TODO: all of these should be fixed by linking our own Pyodide or by upstreaming. PRELUDE = """ -import { newWasmModule, monotonicDateNow, wasmInstantiate, getRandomValues } from "pyodide-internal:builtin_wrappers"; +import { newWasmModule, monotonicDateNow, wasmInstantiate } from "pyodide-internal:builtin_wrappers"; // Pyodide uses `new URL(some_url, location)` to resolve the path in `loadPackage`. Setting // `location = undefined` makes this throw an error if some_url is not an absolute url. Which is what @@ -117,10 +117,6 @@ REPLACEMENTS = [ "Date.now", "monotonicDateNow", ], - [ - "crypto.getRandomValues", - "getRandomValues" - ], [ "reportUndefinedSymbols()", "reportUndefinedSymbolsNoOp()" diff --git a/src/pyodide/internal/builtin_wrappers.js b/src/pyodide/internal/builtin_wrappers.js index b40064678978..3721137c1947 100644 --- a/src/pyodide/internal/builtin_wrappers.js +++ b/src/pyodide/internal/builtin_wrappers.js @@ -19,23 +19,6 @@ export function monotonicDateNow() { return now + lastDelta; } -/** - * We initialize Python at top level, but it tries to initialize the random seed with - * crypto.getRandomValues which will fail at top level. So we don't produce any entropy the first - * time around and we reseed the rng in the first request context before executing user code. - */ -export function getRandomValues(arr) { - try { - return crypto.getRandomValues(arr); - } catch (e) { - if (e.message.includes("Disallowed operation called within global scope")) { - // random.seed() can't work at startup. We'll seed again under the request scope. - return arr; - } - throw e; - } -} - /** * First check that the callee is what we expect, then use `UnsafeEval` to * construct a `WasmModule`. diff --git a/src/pyodide/internal/patches/_cloudflare_random_overlay_module.py b/src/pyodide/internal/patches/_cloudflare_random_overlay_module.py new file mode 100644 index 000000000000..04d6fa3aa447 --- /dev/null +++ b/src/pyodide/internal/patches/_cloudflare_random_overlay_module.py @@ -0,0 +1,10 @@ +def __getattr__(key): + from _cloudflare_random_overlays import overlay_getattr + + return overlay_getattr(__name__, key) + + +def __dir__(): + from _cloudflare_random_overlays import overlay_dir + + return overlay_dir(__name__) diff --git a/src/pyodide/internal/patches/_cloudflare_random_overlays.py b/src/pyodide/internal/patches/_cloudflare_random_overlays.py new file mode 100644 index 000000000000..25649912bdab --- /dev/null +++ b/src/pyodide/internal/patches/_cloudflare_random_overlays.py @@ -0,0 +1,202 @@ +""" +Manage overlay modules for random modules + +The actual contents of the overlay are in _cloudflare_random_overlay_module, it just implements a +module __getattr__ and __dir__ that provide the __name__ of the particular overlay and call back +into the impls here. + +We only lazily import the original module so that we can install an overlay for `numpy.random` even +when `numpy.random` isn't installed. This also avoids paying to instantiate the overlaid module if +it's not necessary. This means we have to remove our overlay from `sys.modules` before importing the +original module the first time and put it back afterwards. "random" and "numpy.random.mtrand" also +have some additional patches that need to be installed as part of their import context to prevent +top level crashes. + +When we're done, we put back the original module but the wrapper module and wrapper stubs will +persist in the wild, so we need to make sure they behave the same way as the originals after we put +them back. This is controlled by the IN_REQUEST_CONTEXT variable. +""" + +from contextlib import contextmanager, nullcontext +from importlib import import_module +from functools import wraps +from pathlib import Path +from types import ModuleType + +import sys + +RANDOM_OVERLAY_MODULE_STR = Path("/lib/random_overlay/random_overlay.py").read_text() +MODULES_TO_OVERLAY = ["random", "numpy.random", "numpy.random.mtrand"] +# We remove the overlay before the request context, but it can still be used from +# top level imports. When IN_REQUEST_CONTEXT is True, we need to make sure that our patches +# behave like the original imports. +IN_REQUEST_CONTEXT = False + +OVERLAY_ORIG_MODULES = {} + + +def load_orig_module(name): + with import_context(name): + return import_module(name) + + +def get_orig_module(name): + mod = OVERLAY_ORIG_MODULES[name] + if not mod: + OVERLAY_ORIG_MODULES[name] = mod = load_orig_module(name) + return mod + + +def install_random_overlay(name): + """Install an overlay for the module which disables most calls. + + We store the original module if it's already been imported into OVERLAY_ORIG_MODULES so we can + restore when we're done. + """ + + OVERLAY_ORIG_MODULES[name] = sys.modules.get(name, None) + module = ModuleType(name) + exec(RANDOM_OVERLAY_MODULE_STR, module.__dict__) + sys.modules[name] = module + + +def install_random_overlays(): + for name in MODULES_TO_OVERLAY: + install_random_overlay(name) + + +def remove_random_overlay(name): + orig_module = OVERLAY_ORIG_MODULES[name] + if orig_module: + # Put back original random module + sys.modules[name] = orig_module + else: + # The overlay wasn't ever used so just strip it out of sys.modules. + del sys.modules[name] + + +def remove_random_overlays(): + global IN_REQUEST_CONTEXT + + IN_REQUEST_CONTEXT = True + for name in MODULES_TO_OVERLAY: + remove_random_overlay(name) + + +def overlay_dir(name): + return dir(get_orig_module(name)) + + +# Whitelist of functions that are definitely okay to call at top level. +MODULE_ALLOW_LIST = { + "random": ["Random", "SystemRandom"], + "numpy.random": ["default_rng"], +} + + +def overlay_getattr(name, key): + mod = get_orig_module(name) + orig = getattr(mod, key) + if IN_REQUEST_CONTEXT: + return orig + if not callable(orig): + return orig + + allow_list = MODULE_ALLOW_LIST.get(name, []) + if key in allow_list: + return orig + + # If we aren't in a request scope, the value is a callable, and it's not in the allow_list, + # return a wrapper that raises an error if it's called before entering the request scope. + # TODO: this doesn't wrap classes correctly, does it matter? + @wraps(orig) + def wrapper(*args, **kwargs): + print(name, key, "IN_REQUEST_CONTEXT", IN_REQUEST_CONTEXT) + if not IN_REQUEST_CONTEXT: + raise RuntimeError(f"Cannot use {name}.{key}() outside of request context") + return orig(*args, **kwargs) + + return wrapper + + +@contextmanager +def import_context(name): + """Set up the context for loading the original module + + We have to remove the overlay from sys.modules and restore it afterwards, plus there is some + module-specific context needed to prevent top level errors when seeding. + """ + if IN_REQUEST_CONTEXT: + # If we've already removed the overlays, we'll hit this path if someone did + # `import some_mod` at top level, then accesses `some_mod.some_attr` in the + # request context. remove_random_overlays() will have already ensured that + # there is no entry in sys.modules, so we don't have to do anything here + yield + return + # Remove the overlay from sys.modules. Otherwise we'll import our overlay and + # not the original. + self_mod = sys.modules.pop(name) + # Choose appropriate module-specific context if any. + if name == "random": + context = import_context_random + elif name == "numpy.random.mtrand": + context = import_context_numpy_mrand + else: + context = import_context_default + + try: + yield from context() + finally: + # Put overlay back in sys.modules + sys.modules[name] = self_mod + + +def import_context_default(): + # No extra setup + yield + + +def import_context_random(): + """We've made _random.Random.seed raise an error, but random calls it at top level. + + To prevent the top level import from failing, we need to temporarily make seed a no-op just for + the random import. + """ + import _random + + orig_seed = _random.Random.seed + + @wraps(orig_seed) + def patch_seed(*args): + pass + + _random.Random.seed = patch_seed + try: + yield + finally: + _random.Random.seed = orig_seed + + +def import_context_numpy_mrand(): + """numpy.random.mrand will attempt to seed itself using secrets.randbits at import time. + + To prevent top level import from failing, we need to temporarily make secrets.randbits return a + constant just for the mrand import. + """ + import secrets + + orig_randbits = secrets.randbits + patched = True + + @wraps(orig_randbits) + def patch_randbits(*args): + if patched: + return 0 + return orig_randbits(*args) + + secrets.randbits = patch_randbits + try: + yield + finally: + secrets.randbits = orig_randbits + patched = False diff --git a/src/pyodide/internal/patches/_cloudflare_random_patches.py b/src/pyodide/internal/patches/_cloudflare_random_patches.py new file mode 100644 index 000000000000..19653754eb79 --- /dev/null +++ b/src/pyodide/internal/patches/_cloudflare_random_patches.py @@ -0,0 +1,115 @@ +""" +Handle the randomness mess. + +Goals: + +1. Avoid top-level calls to the C stdlib function getentropy(), these fatally fail. Patch these to + raise Python errors instead. +2. Allow top level import of `random` and `numpy.random` modules. These seed themselves with the + functions that we patched in step 1, we temporarily replace the `getentropy()` calls with no-ops + to let them through. +3. Install wrapper modules at top level that only allow calls to a whitelisted set of functions from + `random` and `numpy.random` that don't use the bad seeds that came from step 2. +4. Put it all back. +5. Reseed the rng before entering the request scope for the first time. + +Steps 1, part of 4, and 5 are handled here, steps 2, 3, and part of 4 are handled in +_cloudflare_random_overlays. +""" + +import _random +import sys +import os +from functools import wraps + +from _cloudflare_random_overlays import install_random_overlays, remove_random_overlays + + +IN_REQUEST_CONTEXT = False + +# Step 1. +# +# Prevent calls to getentropy(). The intended way for `getentropy()` to fail is to set an EIO error, +# which turns into a Python OSError, so we raise this same error so that if we patch `getentropy` +# from the Emscripten C stdlib we can remove these patches without changing the behavior. + +EIO = 29 + +orig_urandom = os.urandom + + +@wraps(orig_urandom) +def patch_urandom(*args): + if not IN_REQUEST_CONTEXT: + raise OSError(EIO, "Cannot get entropy outside of request context") + return orig_urandom(*args) + + +def disable_urandom(): + """ + Python os.urandom() calls C getentropy() which calls JS crypto.getRandomValues() which throws at + top level, fatally crashing the interpreter. + + TODO: Patch Emscripten's getentropy() to return EIO if `crypto.getRandomValues()` throws. Then + we can remove this. + """ + os.urandom = patch_urandom + + +def restore_urandom(): + os.urandom = orig_urandom + + +orig_Random_seed = _random.Random.seed + + +@wraps(orig_Random_seed) +def patched_seed(self, val): + """ + Random.seed calls _PyOs_URandom which will fatally fail in top level. Prevent this by raising a + RuntimeError instead. + """ + if val is None and not IN_REQUEST_CONTEXT: + raise OSError(EIO, "Cannot get entropy outside of request context") + return orig_Random_seed(self, val) + + +def disable_random_seed(): + # Install patch to block calls to PyOs_URandom + _random.Random.seed = patched_seed + + +def restore_random_seed(): + # Restore original random seed behavior + _random.Random.seed = orig_Random_seed + + +def reseed_rng(): + """ + Step 5: Have to reseed randomness in the IoContext of the first request since we gave a low + quality seed when it was seeded at top level. + """ + from random import seed + + seed() + + if "numpy.random" in sys.modules: + from numpy.random import seed + + seed() + + +def before_top_level(): + disable_urandom() + disable_random_seed() + install_random_overlays() + + +def before_first_request(): + global IN_REQUEST_CONTEXT + + IN_REQUEST_CONTEXT = True + restore_urandom() + restore_random_seed() + remove_random_overlays() + reseed_rng() diff --git a/src/pyodide/internal/python.js b/src/pyodide/internal/python.js index b598a379b38a..b3c847852346 100644 --- a/src/pyodide/internal/python.js +++ b/src/pyodide/internal/python.js @@ -12,6 +12,7 @@ import { maybeSetupSnapshotUpload, restoreSnapshot, } from "pyodide-internal:snapshot"; +import { randomBeforeTopLevel } from "pyodide-internal:random"; /** * This file is a simplified version of the Pyodide loader: @@ -122,7 +123,8 @@ function getEmscriptenSettings(lockfile, indexURL) { env: { HOME: "/session", // We don't have access to cryptographic rng at startup so we cannot support hash - // randomization. Setting `PYTHONHASHSEED` disables it. + // randomization. Setting `PYTHONHASHSEED` disables it. See further discussion in + // _cloudflare_random_patches.py PYTHONHASHSEED: "111", }, // This is the index that we use as the base URL to fetch the wheels. @@ -189,6 +191,7 @@ async function prepareWasmLinearMemory(Module) { // were creating the snapshot so the outcome of that is already baked in. return; } + randomBeforeTopLevel(Module); adjustSysPath(Module); maybeSetupSnapshotUpload(Module); } diff --git a/src/pyodide/internal/random.js b/src/pyodide/internal/random.js new file mode 100644 index 000000000000..a74d85edc892 --- /dev/null +++ b/src/pyodide/internal/random.js @@ -0,0 +1,55 @@ +/** + * Handle the randomness mess. See patches/_cloudflare_random_patches.py which is the main file for + * random patches. + * + * This file just installs the relevant files and calls the exports from _cloudflare_random_patches. + */ + +import { default as randomPatches } from "pyodide-internal:patches/_cloudflare_random_patches.py"; +import { default as randomOverlays } from "pyodide-internal:patches/_cloudflare_random_overlays.py"; +import { default as randomModuleOverlay } from "pyodide-internal:patches/_cloudflare_random_overlay_module.py"; +import { simpleRunPython } from "pyodide-internal:util"; + +export function randomBeforeTopLevel(Module) { + Module.FS.writeFile( + `/lib/python3.12/site-packages/_cloudflare_random_patches.py`, + new Uint8Array(randomPatches), + { canOwn: true }, + ); + Module.FS.writeFile( + `/lib/python3.12/site-packages/_cloudflare_random_overlays.py`, + new Uint8Array(randomOverlays), + { canOwn: true }, + ); + Module.FS.mkdir("/lib/random_overlay"); + Module.FS.writeFile( + `/lib/random_overlay/random_overlay.py`, + new Uint8Array(randomModuleOverlay), + { canOwn: true }, + ); + simpleRunPython( + Module, + ` +from _cloudflare_random_patches import before_top_level +before_top_level() +del before_top_level +`, + ); +} + +// I think this is only ever called once, but we guard it just to be sure. +let isReady = false; +export function randomBeforeRequest(Module) { + if (isReady) { + return; + } + isReady = true; + simpleRunPython( + Module, + ` +from _cloudflare_random_patches import before_first_request +before_first_request() +del before_first_request + `, + ); +} diff --git a/src/pyodide/python-entrypoint-helper.js b/src/pyodide/python-entrypoint-helper.js index 4f9063ed1b62..87a66022d887 100644 --- a/src/pyodide/python-entrypoint-helper.js +++ b/src/pyodide/python-entrypoint-helper.js @@ -20,6 +20,7 @@ import { import { default as ArtifactBundler } from "pyodide-internal:artifacts"; import { reportError } from "pyodide-internal:util"; import { default as Limiter } from "pyodide-internal:limiter"; +import { randomBeforeRequest } from "pyodide-internal:random"; function pyimportMainModule(pyodide) { if (!MAIN_MODULE_NAME.endsWith(".py")) { @@ -125,27 +126,11 @@ function getMainModule() { }); } -/** - * Have to reseed randomness in the IoContext of the first request since we gave a low quality seed - * when it was seeded at top level. - */ -let isSeeded = false; -function reseedRandom(pyodide) { - if (isSeeded) { - return; - } - isSeeded = true; - pyodide.runPython(` - from random import seed - seed() - del seed - `); -} - async function preparePython() { const pyodide = await getPyodide(); - reseedRandom(pyodide); - return await getMainModule(); + const mainModule = await getMainModule(); + randomBeforeRequest(pyodide._module); + return mainModule; } function makeHandler(pyHandlerName) { diff --git a/src/workerd/server/tests/python/BUILD.bazel b/src/workerd/server/tests/python/BUILD.bazel index 7c1785646720..380c8d60f2c0 100644 --- a/src/workerd/server/tests/python/BUILD.bazel +++ b/src/workerd/server/tests/python/BUILD.bazel @@ -22,6 +22,17 @@ wd_test( ), ) +wd_test( + src = "random/random.wd-test", + args = ["--experimental"], + data = glob( + [ + "random/*", + ], + exclude = ["**/*.wd-test"], + ), +) + # langchain test: disabled for now because it's flaky # TODO: reenable this? # diff --git a/src/workerd/server/tests/python/random/random.wd-test b/src/workerd/server/tests/python/random/random.wd-test new file mode 100644 index 000000000000..612bdd1645b6 --- /dev/null +++ b/src/workerd/server/tests/python/random/random.wd-test @@ -0,0 +1,15 @@ +using Workerd = import "/workerd/workerd.capnp"; + +const unitTests :Workerd.Config = ( + services = [ + ( name = "python-hello", + worker = ( + modules = [ + (name = "worker.py", pythonModule = embed "worker.py") + ], + compatibilityDate = "2024-01-15", + compatibilityFlags = ["python_workers"], + ) + ), + ], +); diff --git a/src/workerd/server/tests/python/random/worker.py b/src/workerd/server/tests/python/random/worker.py new file mode 100644 index 000000000000..df63bcfa8301 --- /dev/null +++ b/src/workerd/server/tests/python/random/worker.py @@ -0,0 +1,57 @@ +""" +Verify that calling `random` at the top-level throws. + +Calls to random should only work inside a request context. +""" + +from random import random, randbytes, choice + +try: + random() +except RuntimeError as e: + assert ( + repr(e) + == "RuntimeError('Cannot use random.random() outside of request context')" + ) +else: + assert False + +try: + randbytes(5) +except RuntimeError as e: + assert ( + repr(e) + == "RuntimeError('Cannot use random.randbytes() outside of request context')" + ) +else: + assert False + +try: + choice([1, 2, 3]) +except RuntimeError as e: + assert ( + repr(e) + == "RuntimeError('Cannot use random.choice() outside of request context')" + ) +else: + assert False + + +def t1(): + from random import random, randbytes + + random() + randbytes(5) + choice([1, 2, 3]) + + +def t2(): + random() + randbytes(5) + choice([1, 2, 3]) + + t1() + + +def test(): + t2()