Python: Make top level random() raise #1952

Merged 1 commit on Apr 22, 2024
10 changes: 6 additions & 4 deletions src/pyodide/BUILD.bazel
@@ -117,14 +117,14 @@ REPLACEMENTS = [
"Date.now",
"monotonicDateNow",
],
[
"crypto.getRandomValues",
"getRandomValues"
],
[
"reportUndefinedSymbols()",
"reportUndefinedSymbolsNoOp()"
],
[
"crypto.getRandomValues(",
"getRandomValues(Module, ",
]
]
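
Each REPLACEMENTS entry is an [old, new] pair, presumably applied as a literal text substitution over the generated pyodide.asm.js at build time; the new crypto.getRandomValues( entry routes Emscripten's entropy calls through the internal getRandomValues(Module, ...) wrapper. A minimal sketch of such a substitution pass, for illustration only (not the actual Bazel rule):

    # Illustration: apply [old, new] pairs as literal text substitutions.
    def apply_replacements(text: str, replacements: list[list[str]]) -> str:
        for old, new in replacements:
            text = text.replace(old, new)
        return text

    with open("generated/pyodide.asm.js") as f:  # path as listed in this BUILD file
        patched = apply_replacements(f.read(), [
            ["Date.now", "monotonicDateNow"],
            ["reportUndefinedSymbols()", "reportUndefinedSymbolsNoOp()"],
            ["crypto.getRandomValues(", "getRandomValues(Module, "],
        ])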

load("//:build/pyodide_bucket.bzl", "PYODIDE_PACKAGE_BUCKET_URL")
@@ -153,12 +153,14 @@ wd_js_bundle(
internal_data_modules = ["generated/python_stdlib.zip"] + glob([
"internal/*.py",
"internal/patches/*.py",
"internal/topLevelEntropy/*.py",
]),
internal_json_modules = ["generated/pyodide-lock.json", "generated/pyodide-bucket.json"],
internal_modules = [
"generated/pyodide.asm.js",
] + glob([
"internal/*.js",
"internal/topLevelEntropy/*.js",
]),
internal_wasm_modules = ["generated/pyodide.asm.wasm"],
schema_id = "0xbcc8f57c63814005",
18 changes: 1 addition & 17 deletions src/pyodide/internal/builtin_wrappers.js
@@ -1,5 +1,6 @@
import { default as UnsafeEval } from "internal:unsafe-eval";
import { default as DiskCache } from "pyodide-internal:disk_cache";
export { getRandomValues } from "pyodide-internal:topLevelEntropy/lib";

let lastTime;
let lastDelta = 0;
@@ -19,23 +20,6 @@ export function monotonicDateNow() {
return now + lastDelta;
}

/**
* We initialize Python at top level, but it tries to initialize the random seed with
* crypto.getRandomValues which will fail at top level. So we don't produce any entropy the first
* time around and we reseed the rng in the first request context before executing user code.
*/
export function getRandomValues(arr) {
try {
return crypto.getRandomValues(arr);
} catch (e) {
if (e.message.includes("Disallowed operation called within global scope")) {
// random.seed() can't work at startup. We'll seed again under the request scope.
return arr;
}
throw e;
}
}

/**
* First check that the callee is what we expect, then use `UnsafeEval` to
* construct a `WasmModule`.
25 changes: 18 additions & 7 deletions src/pyodide/internal/python.js
@@ -1,3 +1,4 @@
Error.stackTraceLimit = Infinity;
import { enterJaegerSpan } from "pyodide-internal:jaeger";
import {
SITE_PACKAGES_INFO,
@@ -12,6 +13,11 @@ import {
maybeSetupSnapshotUpload,
restoreSnapshot,
} from "pyodide-internal:snapshot";
import {
entropyMountFiles,
entropyAfterRuntimeInit,
entropyBeforeTopLevel,
} from "pyodide-internal:topLevelEntropy/lib";

/**
* This file is a simplified version of the Pyodide loader:
@@ -121,8 +127,9 @@ function getEmscriptenSettings(lockfile, indexURL) {
// environment variables go here
env: {
HOME: "/session",
// We don't have access to cryptographic rng at startup so we cannot support hash
// randomization. Setting `PYTHONHASHSEED` disables it.
// We don't have access to entropy at startup so we cannot support hash
// randomization. Setting `PYTHONHASHSEED` disables it. See further
// discussion in topLevelEntropy/entropy_patches.py
PYTHONHASHSEED: "111",
},
// This is the index that we use as the base URL to fetch the wheels.
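
The PYTHONHASHSEED comment above can be checked quickly: with the variable pinned to a fixed value, str and bytes hashes are reproducible across interpreter runs, i.e. hash randomization is off. A small self-contained check (illustrative, not part of this diff):

    import os
    import subprocess
    import sys

    def hash_abc(seed: str) -> str:
        # Spawn a fresh interpreter so PYTHONHASHSEED is read at startup.
        env = {**os.environ, "PYTHONHASHSEED": seed}
        result = subprocess.run([sys.executable, "-c", "print(hash('abc'))"],
                                env=env, capture_output=True, text=True)
        return result.stdout.strip()

    # Identical output on every run with the same fixed seed.
    assert hash_abc("111") == hash_abc("111")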
@@ -177,20 +184,23 @@ async function instantiateEmscriptenModule(emscriptenSettings) {
* APIs, we call this function. If `MEMORY` is defined, then we will have passed
* `noInitialRun: true` and so the C runtime is in an incoherent state until we
* restore the linear memory from the snapshot.
*
* Returns `true` when existing memory snapshot was loaded.
*/
async function prepareWasmLinearMemory(Module) {
// Note: if we are restoring from a snapshot, runtime is not initialized yet.
mountLib(Module, SITE_PACKAGES_INFO);
entropyMountFiles(Module);
if (SHOULD_RESTORE_SNAPSHOT) {
restoreSnapshot(Module);
// Don't call adjustSysPath here: it was called in the other branch when we
// were creating the snapshot so the outcome of that is already baked in.
}
// entropyAfterRuntimeInit adjusts JS state ==> always needs to be called.
entropyAfterRuntimeInit(Module);
if (SHOULD_RESTORE_SNAPSHOT) {
return;
}
// The effects of these are purely in Python state so they only need to be run
// if we didn't restore a snapshot.
entropyBeforeTopLevel(Module);
adjustSysPath(Module);
maybeSetupSnapshotUpload(Module);
}

export async function loadPyodide(lockfile, indexURL) {
@@ -201,6 +211,7 @@ export async function loadPyodide(lockfile, indexURL) {
await enterJaegerSpan("prepare_wasm_linear_memory", () =>
prepareWasmLinearMemory(Module),
);
maybeSetupSnapshotUpload(Module);

// Finish setting up Pyodide's ffi so we can use the nice Python interface
await enterJaegerSpan("finalize_bootstrap", Module.API.finalizeBootstrap);
31 changes: 20 additions & 11 deletions src/pyodide/internal/snapshot.js
@@ -15,6 +15,8 @@ import {
} from "pyodide-internal:metadata";
import { reportError, simpleRunPython } from "pyodide-internal:util";

let LOADED_BASELINE_SNAPSHOT;

/**
* This file is a simplified version of the Pyodide loader:
* https://github.com/pyodide/pyodide/blob/main/src/js/pyodide.ts
@@ -29,8 +31,10 @@ import { reportError, simpleRunPython } from "pyodide-internal:util";
*/
import { _createPyodideModule } from "pyodide-internal:generated/pyodide.asm";

const TOP_LEVEL_SNAPSHOT = ArtifactBundler.isEwValidating() || SHOULD_SNAPSHOT_TO_DISK;
const SHOULD_UPLOAD_SNAPSHOT = ArtifactBundler.isEnabled() || TOP_LEVEL_SNAPSHOT;
const TOP_LEVEL_SNAPSHOT =
ArtifactBundler.isEwValidating() || SHOULD_SNAPSHOT_TO_DISK;
const SHOULD_UPLOAD_SNAPSHOT =
ArtifactBundler.isEnabled() || TOP_LEVEL_SNAPSHOT;

/**
* Global variable for the memory snapshot. On the first run we stick a copy of
@@ -116,10 +120,7 @@ const PRELOADED_SO_FILES = [];
*/
export function preloadDynamicLibs(Module) {
let SO_FILES_TO_LOAD = SITE_PACKAGES_SO_FILES;
if (
IS_CREATING_BASELINE_SNAPSHOT ||
DSO_METADATA?.settings?.baselineSnapshot
) {
if (LOADED_BASELINE_SNAPSHOT && LOADED_SNAPSHOT_VERSION === 1) {
// Ideally this should be just
// [[ '_lzma.so' ], [ '_ssl.so' ]]
// but we put a few more because we messed up the memory snapshot...
@@ -129,7 +130,12 @@
["_sqlite3.so"],
["_ssl.so"],
];
// SO_FILES_TO_LOAD = [[ '_lzma.so' ], [ '_ssl.so' ]];
}
if (
IS_CREATING_BASELINE_SNAPSHOT ||
(LOADED_BASELINE_SNAPSHOT && LOADED_SNAPSHOT_VERSION === 2)
) {
SO_FILES_TO_LOAD = [["_lzma.so"], ["_ssl.so"]];
}
try {
const sitePackages = getSitePackagesPath(Module);
@@ -313,8 +319,9 @@ export function maybeSetupSnapshotUpload(Module) {

// "\x00snp"
const SNAPSHOT_MAGIC = 0x706e7300;
const SNAPSHOT_VERSION = 1;
const CREATE_SNAPSHOT_VERSION = 2;
const HEADER_SIZE = 4 * 4;
export let LOADED_SNAPSHOT_VERSION = undefined;

/**
* Encode heap and dsoJSON into the memory snapshot artifact that we'll upload
@@ -332,7 +339,7 @@ function encodeSnapshot(heap, dsoJSON) {
);
const uint32View = new Uint32Array(toUpload.buffer);
uint32View[0] = SNAPSHOT_MAGIC;
uint32View[1] = SNAPSHOT_VERSION;
uint32View[1] = CREATE_SNAPSHOT_VERSION;
uint32View[2] = snapshotOffset;
uint32View[3] = jsonLength;
toUpload.subarray(snapshotOffset).set(heap);
@@ -347,9 +354,9 @@ function decodeSnapshot() {
let offset = 0;
MEMORY_SNAPSHOT_READER.readMemorySnapshot(offset, buf);
offset += 8;
let snapshotVersion = 0;
LOADED_SNAPSHOT_VERSION = 0;
if (buf[0] == SNAPSHOT_MAGIC) {
snapshotVersion = buf[1];
LOADED_SNAPSHOT_VERSION = buf[1];
MEMORY_SNAPSHOT_READER.readMemorySnapshot(offset, buf);
offset += 8;
}
@@ -361,6 +368,7 @@ function decodeSnapshot() {
MEMORY_SNAPSHOT_READER.readMemorySnapshot(offset, jsonBuf);
const jsonTxt = new TextDecoder().decode(jsonBuf);
DSO_METADATA = JSON.parse(jsonTxt);
LOADED_BASELINE_SNAPSHOT = Number(DSO_METADATA?.settings?.baselineSnapshot);
READ_MEMORY = function (Module) {
// restore memory from snapshot
MEMORY_SNAPSHOT_READER.readMemorySnapshot(snapshotOffset, Module.HEAP8);
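
For reference, the snapshot header that encodeSnapshot writes and decodeSnapshot reads back is four little-endian uint32s (HEADER_SIZE = 4 * 4): magic, version, snapshot offset, and JSON length. A hedged sketch of parsing that header outside the worker, for example to inspect a snapshot.bin written to disk (illustrative; the byte-order assumption matches the Uint32Array view used above on the platforms involved):

    import struct

    SNAPSHOT_MAGIC = 0x706E7300  # "\x00snp"

    def read_snapshot_header(buf: bytes):
        """Return (version, snapshot_offset, json_length); version 0 marks a
        legacy snapshot that has no header at all."""
        magic, version, snapshot_offset, json_length = struct.unpack_from("<4I", buf)
        if magic != SNAPSHOT_MAGIC:
            return 0, None, None
        return version, snapshot_offset, json_length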
@@ -423,5 +431,6 @@ export function maybeStoreMemorySnapshot() {
ArtifactBundler.storeMemorySnapshot(getMemoryToUpload());
} else if (SHOULD_SNAPSHOT_TO_DISK) {
DiskCache.put("snapshot.bin", getMemoryToUpload());
console.log("Saved snapshot to disk");
}
}
133 changes: 133 additions & 0 deletions src/pyodide/internal/topLevelEntropy/entropy_import_context.py
@@ -0,0 +1,133 @@
"""
Manage import context for modules that use getentropy() at startup.

We install a metapath finder in import_patch_manager.py which executes the
module in the context manager returned by
get_entropy_import_context(module_name).

This module defines get_entropy_import_context, which looks up the context
manager to use while a given module's top-level code runs (returning None for
modules that need no special handling).

"random" and "numpy.random.mtrand" also have some additional patches that need
to be installed as part of their import context to prevent top-level crashes.

Other Rust packages are likely to need treatment similar to pydantic_core.
"""

from contextlib import contextmanager
from array import array
from .import_patch_manager import block_calls

import sys
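
For context, the meta path finder referenced in the module docstring lives in import_patch_manager.py and is not part of this file; a rough sketch of the idea (names and structure here are assumptions, not the actual implementation) is to wrap the loader so the module body executes inside its entropy context:

    # Sketch only: MODULES_TO_PATCH and get_entropy_import_context are the ones
    # defined in this module.
    import importlib.abc
    import sys

    class EntropyImportFinder(importlib.abc.MetaPathFinder):
        def find_spec(self, name, path=None, target=None):
            if name not in MODULES_TO_PATCH:
                return None  # defer to the normal import machinery
            for finder in sys.meta_path:
                if finder is self:
                    continue
                spec = finder.find_spec(name, path, target)
                if spec is not None:
                    spec.loader = _ContextLoader(spec.loader)
                    return spec
            return None

    class _ContextLoader(importlib.abc.Loader):
        def __init__(self, wrapped):
            self.wrapped = wrapped

        def create_module(self, spec):
            return self.wrapped.create_module(spec)

        def exec_module(self, module):
            # Run the module body inside its entropy context manager.
            context = get_entropy_import_context(module.__name__)
            with context(module):
                self.wrapped.exec_module(module)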

RUST_PACKAGES = ["pydantic_core", "tiktoken"]
MODULES_TO_PATCH = ["random", "numpy.random", "numpy.random.mtrand"] + RUST_PACKAGES

# Control number of allowed entropy calls.

ALLOWED_ENTROPY_CALLS = array("b", [0])


def get_bad_entropy_flag():
# simpleRunPython reads out stderr. We put the address there so we can fish it out...
# We could use ctypes instead of array but ctypes weighs an extra 100kb compared to array.
print(ALLOWED_ENTROPY_CALLS.buffer_info()[0], file=sys.stderr)


def is_bad_entropy_enabled():
"""Used in entropy_patches.py to let calls to otherwise disabled functions
through while bad entropy is temporarily allowed.
"""
return ALLOWED_ENTROPY_CALLS[0] > 0


@contextmanager
def allow_bad_entropy_calls(n):
ALLOWED_ENTROPY_CALLS[0] = n
yield
if ALLOWED_ENTROPY_CALLS[0] > 0:
raise RuntimeError(
f"{ALLOWED_ENTROPY_CALLS[0]} unexpected leftover getentropy calls "
)
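
Illustrative only: how a patched entropy function could consult this budget. The real patches live in entropy_patches.py (not shown in this diff), and the per-call bookkeeping may well happen on the JS side via the address exposed by get_bad_entropy_flag(); the name below is hypothetical.

    def guarded_getentropy(nbytes):  # hypothetical stand-in for a patched function
        if not is_bad_entropy_enabled():
            raise OSError("entropy is not available during top-level execution")
        ALLOWED_ENTROPY_CALLS[0] -= 1  # consume one budgeted call
        return b"\x00" * nbytes  # placeholder bytes, deliberately not random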


# Module instantiation context managers


def get_entropy_import_context(name):
"""Look up the import context.

If there is a function called <pkg_name>_context, we'll use that. Otherwise,
we have a default for rust packages. (Currently only used for tiktoken).
"""
if name not in MODULES_TO_PATCH:
return None
funcname = name.replace(".", "_").replace("-", "_") + "_context"
res = globals().get(funcname, None)
if res:
return res
if name in RUST_PACKAGES:
# Initial import needs one entropy call to initialize std::collections::HashMap hash seed
return rust_package_context
raise Exception(f"Missing context for {name}")
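
Based on the lookup logic above, the name resolution works out as follows (illustrative):

    assert get_entropy_import_context("random") is random_context
    assert get_entropy_import_context("numpy.random.mtrand") is numpy_random_mtrand_context
    assert get_entropy_import_context("tiktoken") is rust_package_context  # Rust default
    assert get_entropy_import_context("json") is None  # not patched at all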


@contextmanager
def rust_package_context(module):
"""Rust packages need one entropy call if they create a Rust hash map at
init time."""
with allow_bad_entropy_calls(1):
yield


@contextmanager
def random_context(module):
"""Importing random calls getentropy() 10 times, so allow exactly that many."""
with allow_bad_entropy_calls(10):
yield
# Block calls to functions that use the bad random seed we produced from the
# ten getentropy() calls. Instantiating Random with a given seed is fine,
# instantiating it without a seed will call getentropy() and fail.
# Instantiating SystemRandom is fine, calling its methods will call
# getentropy() and fail.
block_calls(module, allowlist=["Random", "SystemRandom"])
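
block_calls itself is defined in import_patch_manager.py and is not shown in this diff; a rough sketch of what such a helper could do (an assumption, not the real implementation) is to swap module-level callables for stubs that raise until the RNG has been reseeded inside a request:

    def block_calls_sketch(module, allowlist=()):
        # Replace public callables with stubs that fail loudly when invoked.
        for name, value in list(vars(module).items()):
            if name.startswith("_") or not callable(value) or name in allowlist:
                continue

            def make_stub(fname):
                def stub(*args, **kwargs):
                    raise RuntimeError(
                        f"{module.__name__}.{fname} cannot be used during top-level "
                        "(import-time) execution; call it from a request handler"
                    )
                return stub

            setattr(module, name, make_stub(name))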


@contextmanager
def numpy_random_context(module):
"""numpy.random doesn't call getentropy() itself, but we want to block calls
that might use the bad seed.

TODO: Maybe there are more calls we can whitelist?
TODO: Is it not enough to just block numpy.random.mtrand calls?
"""
yield
# Calling default_rng() with a given seed is fine, calling it without a seed
# will call getentropy() and fail.
block_calls(module, allowlist=["default_rng"])


@contextmanager
def numpy_random_mtrand_context(module):
# numpy.random.mtrand calls secrets.randbits at top level to seed itself.
# This will fail if we don't let it through.
with allow_bad_entropy_calls(1):
yield
# Block calls until we get a chance to replace the bad random seed.
block_calls(module)


@contextmanager
def pydantic_core_context(module):
try:
# Initial import needs one entropy call to initialize std::collections::HashMap hash seed
with allow_bad_entropy_calls(1):
yield
finally:
try:
with allow_bad_entropy_calls(1):
# validate_core_schema makes an ahash::AHashMap which makes another entropy call for
# its hash seed. It will throw an error but only after making the needed entropy
# call.
module.validate_core_schema(None)
except module.SchemaError:
pass