fix(swingset): disable metering around GC-sensitive operations

When a swingset kernel is part of a consensus machine, the visible state must be a deterministic function of userspace activity. Every member kernel must perform the same set of operations. However we are not yet confident that the timing of garbage collection will remain identical between kernels that experience different patterns of snapshot+restart. In particular, up until recently, the amount of "headroom" in the XS memory allocator was reset upon snapshot reload: the new XS engine only allocates as much RAM as the snapshot needs, whereas before the snapshot was taken, the RAM footprint could have been larger (e.g. if a large number of objects we allocated and then released), leading to more "headroom". Automatic GC is triggered by an attempt to allocate space which cannot be satisfied by this headroom, so it will happen more frequently in the post-reload engine than before the snapshot. To accommodate differences in GC timing between kernels that are otherwise operating in consensus, this commit introduces the "unmetered box": a span of execution that does not count against the meter. We take all of the liveslots operations that might be sensitive to the engine's GC behavior and put them "in" the box. This includes any code that calls `deref()` on the WeakRefs used in `slotToVal`, because the WeakRef can change state from "live" to "dead" when GC is provoked, which is sensitive to more than just userspace behavior. closes #3458
Agoric · Aug 6, 2021 · 9d5d652 · 9d5d652
1 parent 4400288
commit 9d5d652
Show file tree

Hide file tree

Showing 9 changed files with 102 additions and 31 deletions.
diff --git a/packages/SwingSet/docs/liveslots.md b/packages/SwingSet/docs/liveslots.md
@@ -69,11 +69,21 @@ Some useful things cannot be serialized: they will trigger an error.
 Uncertain:
 
 * Maps: This might actually serialize as pass-by-presence, since it has no non-function properties (in fact it has no own properties at all, they all live on `Map.prototype`, whose properties are all functions). The receiving side gets a Presence, not a Map, but invoking e.g. `E(p).get(123)` will return a promise that will be fulfilled with the results of `m.get(123)` on the sending side.
-* WeakMaps: same, except the values being passed into `get()` would be coming from the deserializer, and so they might not be that useful (especially since Liveslots does not implement distributed GC yet).
+* WeakMaps: same, except the values being passed into `get()` would be coming from the deserializer, and so they might not be that useful.
 
 ## How things get serialized
 
 * pass-by-presence objects: `{@qclass: "slot", index: slotIndex}`
 * local Promises: passed as a promise
 * promise returned by `E(p).foo()`: passes as a promise, with pipelining enabled
 * Function: rejected (todo: wrap)
+
+## Garbage Collection vs Metering
+
+When a swingset kernel is part of a consensus machine, the visible state must be a deterministic function of userspace activity. Every member kernel must perform the same set of operations.
+
+However we are not yet confident that the timing of garbage collection will remain identical between kernels that experience different patterns of snapshot+restart. In particular, up until recently, the amount of "headroom" in the XS memory allocator was reset upon snapshot reload: the new XS engine only allocates as much RAM as the snapshot needs, whereas before the snapshot was taken, the RAM footprint could have been larger (e.g. if a large number of objects we allocated and then released), leading to more "headroom". Automatic GC is triggered by an attempt to allocate space which cannot be satisfied by this headroom, so it will happen more frequently in the post-reload engine than before the snapshot. See issues #3428, #3458, and #3577 for details.
+
+To accommodate differences in GC timing between kernels that are otherwise operating in consensus, liveslots uses an "unmetered box": a span of execution that does not count against the meter. We take all of the liveslots operations that might be sensitive to the engine's GC behavior and put them "in" the box.
+
+This includes any code that calls `deref()` on the WeakRefs used in `slotToVal`, because the WeakRef can change state from "live" to "dead" when GC is provoked, which is sensitive to more than just userspace behavior.
diff --git a/packages/SwingSet/src/kernel/kernel.js b/packages/SwingSet/src/kernel/kernel.js
@@ -826,6 +826,8 @@ export default function buildKernel(
     FinalizationRegistry,
     waitUntilQuiescent,
     gcAndFinalize,
+    runWithoutMetering: thunk => thunk(),
+    runWithoutMeteringAsync: async thunk => thunk(),
   });
   const vatManagerFactory = makeVatManagerFactory({
     allVatPowers,

diff --git a/packages/SwingSet/src/kernel/liveSlots.js b/packages/SwingSet/src/kernel/liveSlots.js
@@ -31,7 +31,8 @@ const DEFAULT_VIRTUAL_OBJECT_CACHE_SIZE = 3; // XXX ridiculously small value to
  * @param {boolean} enableVatstore
  * @param {*} vatPowers
  * @param {*} vatParameters
- * @param {*} gcTools { WeakRef, FinalizationRegistry, waitUntilQuiescent, gcAndFinalize }
+ * @param {*} gcTools { WeakRef, FinalizationRegistry, waitUntilQuiescent, gcAndFinalize,
+ *                      runWithoutMetering, runWithoutMeteringAsync }
  * @param {Console} console
  * @returns {*} { vatGlobals, inescapableGlobalProperties, dispatch, setBuildRootObject }
  *
@@ -53,6 +54,7 @@ function build(
   console,
 ) {
   const { WeakRef, FinalizationRegistry } = gcTools;
+  const { runWithoutMetering, runWithoutMeteringAsync } = gcTools;
   const enableLSDebug = false;
   function lsdebug(...args) {
     if (enableLSDebug) {
@@ -461,6 +463,7 @@ function build(
       valToSlot.set(val, slot);
       slotToVal.set(slot, new WeakRef(val));
       if (type === 'object') {
+        // Set.delete() metering seems unaffected by presence/absence
         deadSet.delete(slot);
         droppedRegistry.register(val, slot, val);
       }
@@ -489,6 +492,12 @@ function build(
     }
   }
 
+  // The meter usage of convertSlotToVal is strongly affected by GC, because
+  // it only creates a new Presence if one does not already exist. Userspace
+  // moves from REACHABLE to UNREACHABLE, but the JS engine then moves to
+  // COLLECTED (and maybe FINALIZED) on its own, and we must not allow the
+  // latter changes to affect metering. So every call to convertSlotToVal (or
+  // m.unserialize) must be wrapped by runWithoutMetering().
   function convertSlotToVal(slot, iface = undefined) {
     const { type, allocatedByVat, virtual } = parseVatSlot(slot);
     const wr = slotToVal.get(slot);
@@ -692,7 +701,7 @@ function build(
     lsdebug(
       `ls[${forVatID}].dispatch.deliver ${target}.${method} -> ${result}`,
     );
-    const t = convertSlotToVal(target);
+    const t = runWithoutMetering(() => convertSlotToVal(target));
     assert(t, X`no target ${target}`);
     // TODO: if we acquire new decision-making authority over a promise that
     // we already knew about ('result' is already in slotToVal), we should no
@@ -708,7 +717,7 @@ function build(
       method = Symbol.asyncIterator;
     }
 
-    const args = m.unserialize(argsdata);
+    const args = runWithoutMetering(() => m.unserialize(argsdata));
 
     // If the method is missing, or is not a Function, or the method throws a
     // synchronous exception, we notify the caller (by rejecting the result
@@ -747,12 +756,22 @@ function build(
   function retirePromiseID(promiseID) {
     lsdebug(`Retiring ${forVatID}:${promiseID}`);
     importedPromisesByPromiseID.delete(promiseID);
-    const wr = slotToVal.get(promiseID);
-    const p = wr && wr.deref();
-    if (p) {
-      valToSlot.delete(p);
-      pendingPromises.delete(p);
-    }
+    // TODO: I *think* this could do without runWithoutMetering, but I have
+    // to think more about it. Usually any vpid handed to retirePromiseID()
+    // will already be in both slotToVal and pendingPromises. We added this
+    // check for some good reason, but I no longer remember why. If
+    // pendingPromises is guaranteed to hold onto the Promise, then slotToVal
+    // should always have a live weakRef for it, and there won't be any
+    // GC-based variance in the execution of wr.deref() or the other code
+    // that it enables.
+    runWithoutMetering(() => {
+      const wr = slotToVal.get(promiseID);
+      const p = wr && wr.deref();
+      if (p) {
+        valToSlot.delete(p);
+        pendingPromises.delete(p);
+      }
+    });
     slotToVal.delete(promiseID);
   }
 
@@ -802,7 +821,7 @@ function build(
       X`unknown promiseID '${promiseID}'`,
     );
     const pRec = importedPromisesByPromiseID.get(promiseID);
-    const val = m.unserialize(data);
+    const val = runWithoutMetering(() => m.unserialize(data));
     if (rejected) {
       pRec.reject(val);
     } else {
@@ -824,6 +843,7 @@ function build(
     const imports = finishCollectingPromiseImports();
     for (const slot of imports) {
       if (slotToVal.get(slot)) {
+        // we'll only subscribe to new promises, which is within consensus
         syscall.subscribe(slot);
       }
     }
@@ -835,15 +855,17 @@ function build(
     vrefs.map(vref => assert(parseVatSlot(vref).allocatedByVat));
     // console.log(`-- liveslots acting upon dropExports ${vrefs.join(',')}`);
     for (const vref of vrefs) {
-      const wr = slotToVal.get(vref);
-      const o = wr && wr.deref();
-      if (o) {
-        exportedRemotables.delete(o);
-      }
-      const { virtual } = parseVatSlot(vref);
-      if (virtual) {
-        vom.setExported(vref, false);
-      }
+      runWithoutMetering(() => {
+        const wr = slotToVal.get(vref);
+        const o = wr && wr.deref();
+        if (o) {
+          exportedRemotables.delete(o);
+        }
+        const { virtual } = parseVatSlot(vref);
+        if (virtual) {
+          vom.setExported(vref, false);
+        }
+      });
     }
   }
 
@@ -890,14 +912,15 @@ function build(
 
   function retireExports(vrefs) {
     assert(Array.isArray(vrefs));
-    vrefs.forEach(retireOneExport);
+    runWithoutMetering(() => vrefs.forEach(retireOneExport));
   }
 
   function retireImports(vrefs) {
     assert(Array.isArray(vrefs));
     vrefs.map(vref => insistVatType('object', vref));
     vrefs.map(vref => assert(!parseVatSlot(vref).allocatedByVat));
     // console.log(`-- liveslots ignoring retireImports ${vrefs.join(',')}`);
+    // use runWithoutMetering() here too
   }
 
   // TODO: when we add notifyForward, guard against cycles
@@ -1032,6 +1055,15 @@ function build(
 
   const { waitUntilQuiescent, gcAndFinalize } = gcTools;
 
+  async function finish() {
+    await gcAndFinalize();
+    const doMore = processDeadSet();
+    if (doMore) {
+      return finish();
+    }
+    return undefined;
+  }
+
   /**
    * This low-level liveslots code is responsible for deciding when userspace
    * is done with a crank. Userspace code can use Promises, so it can add as
@@ -1060,15 +1092,7 @@ function build(
 
     // Now that userspace is idle, we can drive GC until we think we've
     // stopped.
-    async function finish() {
-      await gcAndFinalize();
-      const doMore = processDeadSet();
-      if (doMore) {
-        return finish();
-      }
-      return undefined;
-    }
-    return finish();
+    return runWithoutMeteringAsync(finish);
   }
   harden(dispatch);
 

diff --git a/packages/SwingSet/src/kernel/vatManager/supervisor-nodeworker.js b/packages/SwingSet/src/kernel/vatManager/supervisor-nodeworker.js
@@ -75,6 +75,8 @@ parentPort.on('message', ([type, ...margs]) => {
       FinalizationRegistry,
       waitUntilQuiescent,
       gcAndFinalize: makeGcAndFinalize(engineGC),
+      runWithoutMetering: thunk => thunk(),
+      runWithoutMeteringAsync: async thunk => thunk(),
     });
     const ls = makeLiveSlots(
       syscall,

diff --git a/packages/SwingSet/src/kernel/vatManager/supervisor-subprocess-node.js b/packages/SwingSet/src/kernel/vatManager/supervisor-subprocess-node.js
@@ -91,6 +91,8 @@ fromParent.on('data', ([type, ...margs]) => {
       FinalizationRegistry,
       waitUntilQuiescent,
       gcAndFinalize: makeGcAndFinalize(engineGC),
+      runWithoutMetering: thunk => thunk(),
+      runWithoutMeteringAsync: async thunk => thunk(),
     });
     const ls = makeLiveSlots(
       syscall,

diff --git a/packages/SwingSet/src/kernel/vatManager/supervisor-subprocess-xsnap.js b/packages/SwingSet/src/kernel/vatManager/supervisor-subprocess-xsnap.js
@@ -30,6 +30,26 @@ function workerLog(first, ...args) {
 
 workerLog(`supervisor started`);
 
+function runWithoutMetering(thunk) {
+  const limit = globalThis.currentMeterLimit();
+  const before = globalThis.resetMeter(0, 0);
+  try {
+    return thunk();
+  } finally {
+    globalThis.resetMeter(limit, before);
+  }
+}
+
+async function runWithoutMeteringAsync(thunk) {
+  const limit = globalThis.currentMeterLimit();
+  const before = globalThis.resetMeter(0, 0);
+  try {
+    return await thunk();
+  } finally {
+    globalThis.resetMeter(limit, before);
+  }
+}
+
 /**
  * Wrap byte-level protocols with tagged array codec.
  *
@@ -180,6 +200,8 @@ function makeWorker(port) {
       // FIXME(mfig): Here is where GC-per-crank is silently disabled.
       // We need to do a better analysis of the tradeoffs.
       gcAndFinalize: makeGcAndFinalize(gcEveryCrank && globalThis.gc),
+      runWithoutMetering,
+      runWithoutMeteringAsync,
     });
 
     const ls = makeLiveSlots(

diff --git a/packages/SwingSet/test/liveslots-helpers.js b/packages/SwingSet/test/liveslots-helpers.js
@@ -46,6 +46,8 @@ export function makeDispatch(
     FinalizationRegistry,
     waitUntilQuiescent,
     gcAndFinalize: makeGcAndFinalize(engineGC),
+    runWithoutMetering: thunk => thunk(),
+    runWithoutMeteringAsync: async thunk => thunk(),
   });
   const { setBuildRootObject, dispatch } = makeLiveSlots(
     syscall,

diff --git a/packages/SwingSet/test/test-liveslots.js b/packages/SwingSet/test/test-liveslots.js
@@ -906,6 +906,8 @@ function makeMockGC() {
     getAllFRs,
     waitUntilQuiescent,
     gcAndFinalize: mockGCAndFinalize,
+    runWithoutMetering: thunk => thunk(),
+    runWithoutMeteringAsync: async thunk => thunk(),
   });
 }
 

diff --git a/packages/SwingSet/test/test-marshal.js b/packages/SwingSet/test/test-marshal.js
@@ -10,7 +10,12 @@ import { makeMarshaller } from '../src/kernel/liveSlots.js';
 
 import { buildVatController } from '../src/index.js';
 
-const gcTools = harden({ WeakRef, FinalizationRegistry });
+const gcTools = harden({
+  WeakRef,
+  FinalizationRegistry,
+  runWithoutMetering: thunk => thunk(),
+  runWithoutMeteringAsync: async thunk => thunk(),
+});
 
 async function prep() {
   const config = {};