Skip to content

Commit

Permalink
fix(swingset): add remediation of 9039
Browse files Browse the repository at this point in the history
We use the `upgradeSwingset()` mechanism (when switching to version 3)
to identify and fix all the leftover kpids caused by #9039. This
performs automatic remediation of the damage, exactly once per
database.

The v2-to-v3 upgrader will look for all c-list promise entries, then
check to see if the kpid is settled. It ignores the ones that have
`dispatch.notify` events in either the runQueue or the
acceptanceQueue. It then enqueues notify events to the vats, adjusts
the refcounts and kernel stats accordinging, and finally and bumps the
version to 3.

These kpids will be for promises which were decided by an upgraded
vat, but not resolved by upgrade time. The kernel "disconnects" (rejects)
these promises, but #9039 failed to remove them from the upgraded
vat's c-list.

On the first run after this remediation, the kernel will deliver those
notifies, which will delete the c-list entries and decrement their
refcounts, which may trigger the usual deletions and further
decrefs. The notifies will be delivered to the vat's new incarnation,
which will harmlessly ignore them (as unrecognized vpids). This turned
out to be the easiest way to get all the possible cleanups to run.

New tests were added to exercise the remediation code.

As this implements the last part of the fix, it:

fixes #9039
  • Loading branch information
warner committed Oct 11, 2024
1 parent 03d0925 commit 1a8c19c
Show file tree
Hide file tree
Showing 7 changed files with 384 additions and 18 deletions.
151 changes: 151 additions & 0 deletions packages/SwingSet/src/controller/upgradeSwingset.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ import {
DEFAULT_GC_KREFS_PER_BOYD,
getAllDynamicVats,
getAllStaticVats,
incrementReferenceCount,
addToQueue,
} from '../kernel/state/kernelKeeper.js';
import { enumeratePrefixedKeys } from '../kernel/state/storageHelper.js';

const upgradeVatV0toV1 = (kvStore, defaultReapDirtThreshold, vatID) => {
// This is called, once per vat, when upgradeSwingset migrates from
Expand Down Expand Up @@ -204,6 +207,154 @@ export const upgradeSwingset = kernelStorage => {
version = 2;
}

if (version < 3) {
// v3 means that we've completed remediation for bug #9039
console.log(`Starting remediation of bug #9039`);

// find all terminated vats
const terminated = new Set(JSON.parse(getRequired('vats.terminated')));

// find all live vats
const allVatIDs = [];
for (const [_name, vatID] of getAllStaticVats(kvStore)) {
if (!terminated.has(vatID)) {
allVatIDs.push(vatID);
}
}
for (const vatID of getAllDynamicVats(getRequired)) {
if (!terminated.has(vatID)) {
allVatIDs.push(vatID);
}
}

// find all pending notifies
const notifies = new Map(); // .get(kpid) = [vatIDs..];
const [runHead, runTail] = JSON.parse(getRequired('runQueue'));
for (let p = runHead; p < runTail; p += 1) {
const rq = JSON.parse(getRequired(`runQueue.${p}`));
if (rq.type === 'notify') {
const { vatID, kpid } = rq;
assert(vatID);
assert(kpid);
if (!notifies.has(kpid)) {
notifies.set(kpid, []);
}
notifies.get(kpid).push(vatID);
}
}
const [accHead, accTail] = JSON.parse(getRequired('acceptanceQueue'));
for (let p = accHead; p < accTail; p += 1) {
const rq = JSON.parse(getRequired(`acceptanceQueue.${p}`));
if (rq.type === 'notify') {
const { vatID, kpid } = rq;
assert(vatID);
assert(kpid);
if (!notifies.has(kpid)) {
notifies.set(kpid, []);
}
notifies.get(kpid).push(vatID);
}
}
console.log(` - pending notifies:`, notifies);

// cache of known-settled kpids: will grow to num(kpids)
const settledKPIDs = new Set();
const nonSettledKPIDs = new Set();
const isSettled = kpid => {
if (settledKPIDs.has(kpid)) {
return true;
}
if (nonSettledKPIDs.has(kpid)) {
return false;
}
const state = kvStore.get(`${kpid}.state`);
// missing state means the kpid is deleted somehow, shouldn't happen
assert(state, `${kpid}.state is missing`);
if (state === 'unresolved') {
nonSettledKPIDs.add(kpid);
return false;
}
settledKPIDs.add(kpid);
return true;
};

// walk vNN.c.kpNN for all vats, for each one check the
// kpNN.state, for the settled ones check for a pending notify,
// record the ones without a pending notify

const buggyKPIDs = []; // [kpid, vatID]
for (const vatID of allVatIDs) {
const prefix = `${vatID}.c.`;
const len = prefix.length;
const ckpPrefix = `${vatID}.c.kp`;
for (const key of enumeratePrefixedKeys(kvStore, ckpPrefix)) {
const kpid = key.slice(len);
if (isSettled(kpid)) {
const n = notifies.get(kpid);
if (!n || !n.includes(vatID)) {
// there is no pending notify
buggyKPIDs.push([kpid, vatID]);
}
}
}
}
console.log(` - found ${buggyKPIDs.length} buggy kpids, enqueueing fixes`);

// now fix it. The bug means we failed to delete the c-list entry
// and decref it back when the promise was rejected. That decref
// would have pushed the kpid onto maybeFreeKrefs, which would
// have triggered a refcount check at end-of-crank, which might
// have deleted the promise records (if nothing else was
// referencing the promise, like arguments in messages enqueued to
// unresolved promises, or something transient on the
// run-queue). Deleting those promise records might have decreffed
// krefs in the rejection data (although in general 9039 rejects
// those promises with non-slot-bearing DisconnectionObjects).
//
// To avoid duplicating a lot of kernel code inside this upgrade
// handler, we do the simplest possible thing: enqueue a notify to
// the upgraded vat for all these leftover promises. The new vat
// incarnation will ignore it (they don't recognize the vpid), but
// the dispatch.notify() delivery will clear the c-list and decref
// the kpid, and will trigger all the usual GC work. Note that
// these notifies will be delivered before any activity the host
// app might trigger for e.g. a chain upgrade, but they should not
// cause userspace-visible behavior (non-slot-bearing rejection
// data means no other vat will even get a gc-action delivery:
// only the upgraded vat will see anything, and those deliveries
// won't make it past liveslots).

const kernelStats = JSON.parse(getRequired('kernelStats'));
// copied from kernel/state/stats.js, awkward to factor out
const incStat = (stat, delta = 1) => {
assert.equal(stat, 'acceptanceQueueLength');
kernelStats[stat] += delta;
const maxStat = `${stat}Max`;
if (
kernelStats[maxStat] !== undefined &&
kernelStats[stat] > kernelStats[maxStat]
) {
kernelStats[maxStat] = kernelStats[stat];
}
const upStat = `${stat}Up`;
if (kernelStats[upStat] !== undefined) {
kernelStats[upStat] += delta;
}
};

for (const [kpid, vatID] of buggyKPIDs) {
const m = harden({ type: 'notify', vatID, kpid });
incrementReferenceCount(getRequired, kvStore, kpid, `enq|notify`);
addToQueue('acceptanceQueue', m, getRequired, kvStore, incStat);
}

kvStore.set('kernelStats', JSON.stringify(kernelStats));

console.log(` - #9039 remediation complete`);
modified = true;
version = 3;
}

if (modified) {
kvStore.set('version', `${version}`);
}
Expand Down
9 changes: 6 additions & 3 deletions packages/SwingSet/src/kernel/state/kernelKeeper.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ const enableKernelGC = true;
export { DEFAULT_REAP_DIRT_THRESHOLD_KEY };

// most recent DB schema version
export const CURRENT_SCHEMA_VERSION = 2;
export const CURRENT_SCHEMA_VERSION = 3;

// Kernel state lives in a key-value store supporting key retrieval by
// lexicographic range. All keys and values are strings.
Expand All @@ -73,9 +73,9 @@ export const CURRENT_SCHEMA_VERSION = 2;
// only modified by a call to upgradeSwingset(). See below for
// deltas/upgrades from one version to the next.
//
// The current ("v2") schema keys/values are:
// The current ("v3") schema keys/values are:
//
// version = '2'
// version = '3'
// vat.names = JSON([names..])
// vat.dynamicIDs = JSON([vatIDs..])
// vat.name.$NAME = $vatID = v$NN
Expand Down Expand Up @@ -179,6 +179,9 @@ export const CURRENT_SCHEMA_VERSION = 2;
// v2:
// * change `version` to `'2'`
// * add `vats.terminated` with `[]` as initial value
// v3:
// * change `version` to `'3'`
// * perform remediation for bug #9039

/** @type {(s: string) => string[]} s */
export function commaSplit(s) {
Expand Down
4 changes: 2 additions & 2 deletions packages/SwingSet/test/snapshots/state.test.js.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ Generated by [AVA](https://avajs.dev).

> initial state
'7b16bffd29f6a2d11bae7b536ef4c230af8cadc29284928b6cc2f7338507a987'
'af35907384e9d63dd9fc4d4df0440005c0ee81ef88f86089a0e0a280fe3793af'

> expected activityhash
'7dbf5a49d4e2b999c431730fcd4927c01c713eaa54fe273626e4201853e38d3b'
'040e27413c25f3ce668d9778add3b3d39547358ded553c0b9fba898004968d1b'
Binary file modified packages/SwingSet/test/snapshots/state.test.js.snap
Binary file not shown.
14 changes: 7 additions & 7 deletions packages/SwingSet/test/state.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ test('kernel state', async t => {
k.emitCrankHashes();

checkState(t, store.dump, [
['version', '2'],
['version', '3'],
['crankNumber', '0'],
['gcActions', '[]'],
['runQueue', '[1,1]'],
Expand Down Expand Up @@ -223,7 +223,7 @@ test('kernelKeeper vat names', async t => {

k.emitCrankHashes();
checkState(t, store.dump, [
['version', '2'],
['version', '3'],
['crankNumber', '0'],
['gcActions', '[]'],
['runQueue', '[1,1]'],
Expand Down Expand Up @@ -279,7 +279,7 @@ test('kernelKeeper device names', async t => {

k.emitCrankHashes();
checkState(t, store.dump, [
['version', '2'],
['version', '3'],
['crankNumber', '0'],
['gcActions', '[]'],
['runQueue', '[1,1]'],
Expand Down Expand Up @@ -462,7 +462,7 @@ test('kernelKeeper promises', async t => {
k.emitCrankHashes();

checkState(t, store.dump, [
['version', '2'],
['version', '3'],
['crankNumber', '0'],
['device.nextID', '7'],
['vat.nextID', '1'],
Expand Down Expand Up @@ -1078,7 +1078,7 @@ test('dirt upgrade', async t => {
// * v3.reapCountdown: 'never'
// * v3.reapInterval: 'never'

t.is(k.kvStore.get('version'), '2');
t.is(k.kvStore.get('version'), '3');
k.kvStore.delete(`kernel.defaultReapDirtThreshold`);
k.kvStore.set(`kernel.defaultReapInterval`, '1000');

Expand Down Expand Up @@ -1168,7 +1168,7 @@ test('v2 upgrade', async t => {
k.saveStats();

// roll back to v1
t.is(k.kvStore.get('version'), '2');
t.is(k.kvStore.get('version'), '3');
k.kvStore.delete(`vats.terminated`);
k.kvStore.set('version', '1');

Expand All @@ -1187,5 +1187,5 @@ test('v2 upgrade', async t => {

t.true(k2.kvStore.has(`vats.terminated`));
t.deepEqual(JSON.parse(k2.kvStore.get(`vats.terminated`)), []);
t.is(k2.kvStore.get(`version`), '2');
t.is(k2.kvStore.get(`version`), '3');
});
2 changes: 1 addition & 1 deletion packages/SwingSet/test/transcript-light.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ test('transcript-light load', async t => {
t.teardown(c.shutdown);
const serialized0 = debug.serialize();
const kvstate0 = debug.dump().kvEntries;
t.is(kvstate0.version, '2');
t.is(kvstate0.version, '3');
t.is(kvstate0.runQueue, '[1,1]');
t.not(kvstate0.acceptanceQueue, '[]');

Expand Down
Loading

0 comments on commit 1a8c19c

Please sign in to comment.