diff --git a/packages/SwingSet/src/controller/upgradeSwingset.js b/packages/SwingSet/src/controller/upgradeSwingset.js index 87195b8cfae5..b9e79c6ed45b 100644 --- a/packages/SwingSet/src/controller/upgradeSwingset.js +++ b/packages/SwingSet/src/controller/upgradeSwingset.js @@ -3,7 +3,10 @@ import { DEFAULT_GC_KREFS_PER_BOYD, getAllDynamicVats, getAllStaticVats, + incrementReferenceCount, + addToQueue, } from '../kernel/state/kernelKeeper.js'; +import { enumeratePrefixedKeys } from '../kernel/state/storageHelper.js'; const upgradeVatV0toV1 = (kvStore, defaultReapDirtThreshold, vatID) => { // This is called, once per vat, when upgradeSwingset migrates from @@ -204,6 +207,154 @@ export const upgradeSwingset = kernelStorage => { version = 2; } + if (version < 3) { + // v3 means that we've completed remediation for bug #9039 + console.log(`Starting remediation of bug #9039`); + + // find all terminated vats + const terminated = new Set(JSON.parse(getRequired('vats.terminated'))); + + // find all live vats + const allVatIDs = []; + for (const [_name, vatID] of getAllStaticVats(kvStore)) { + if (!terminated.has(vatID)) { + allVatIDs.push(vatID); + } + } + for (const vatID of getAllDynamicVats(getRequired)) { + if (!terminated.has(vatID)) { + allVatIDs.push(vatID); + } + } + + // find all pending notifies + const notifies = new Map(); // .get(kpid) = [vatIDs..]; + const [runHead, runTail] = JSON.parse(getRequired('runQueue')); + for (let p = runHead; p < runTail; p += 1) { + const rq = JSON.parse(getRequired(`runQueue.${p}`)); + if (rq.type === 'notify') { + const { vatID, kpid } = rq; + assert(vatID); + assert(kpid); + if (!notifies.has(kpid)) { + notifies.set(kpid, []); + } + notifies.get(kpid).push(vatID); + } + } + const [accHead, accTail] = JSON.parse(getRequired('acceptanceQueue')); + for (let p = accHead; p < accTail; p += 1) { + const rq = JSON.parse(getRequired(`acceptanceQueue.${p}`)); + if (rq.type === 'notify') { + const { vatID, kpid } = rq; + assert(vatID); + assert(kpid); + if (!notifies.has(kpid)) { + notifies.set(kpid, []); + } + notifies.get(kpid).push(vatID); + } + } + console.log(` - pending notifies:`, notifies); + + // cache of known-settled kpids: will grow to num(kpids) + const settledKPIDs = new Set(); + const nonSettledKPIDs = new Set(); + const isSettled = kpid => { + if (settledKPIDs.has(kpid)) { + return true; + } + if (nonSettledKPIDs.has(kpid)) { + return false; + } + const state = kvStore.get(`${kpid}.state`); + // missing state means the kpid is deleted somehow, shouldn't happen + assert(state, `${kpid}.state is missing`); + if (state === 'unresolved') { + nonSettledKPIDs.add(kpid); + return false; + } + settledKPIDs.add(kpid); + return true; + }; + + // walk vNN.c.kpNN for all vats, for each one check the + // kpNN.state, for the settled ones check for a pending notify, + // record the ones without a pending notify + + const buggyKPIDs = []; // [kpid, vatID] + for (const vatID of allVatIDs) { + const prefix = `${vatID}.c.`; + const len = prefix.length; + const ckpPrefix = `${vatID}.c.kp`; + for (const key of enumeratePrefixedKeys(kvStore, ckpPrefix)) { + const kpid = key.slice(len); + if (isSettled(kpid)) { + const n = notifies.get(kpid); + if (!n || !n.includes(vatID)) { + // there is no pending notify + buggyKPIDs.push([kpid, vatID]); + } + } + } + } + console.log(` - found ${buggyKPIDs.length} buggy kpids, enqueueing fixes`); + + // now fix it. The bug means we failed to delete the c-list entry + // and decref it back when the promise was rejected. That decref + // would have pushed the kpid onto maybeFreeKrefs, which would + // have triggered a refcount check at end-of-crank, which might + // have deleted the promise records (if nothing else was + // referencing the promise, like arguments in messages enqueued to + // unresolved promises, or something transient on the + // run-queue). Deleting those promise records might have decreffed + // krefs in the rejection data (although in general 9039 rejects + // those promises with non-slot-bearing DisconnectionObjects). + // + // To avoid duplicating a lot of kernel code inside this upgrade + // handler, we do the simplest possible thing: enqueue a notify to + // the upgraded vat for all these leftover promises. The new vat + // incarnation will ignore it (they don't recognize the vpid), but + // the dispatch.notify() delivery will clear the c-list and decref + // the kpid, and will trigger all the usual GC work. Note that + // these notifies will be delivered before any activity the host + // app might trigger for e.g. a chain upgrade, but they should not + // cause userspace-visible behavior (non-slot-bearing rejection + // data means no other vat will even get a gc-action delivery: + // only the upgraded vat will see anything, and those deliveries + // won't make it past liveslots). + + const kernelStats = JSON.parse(getRequired('kernelStats')); + // copied from kernel/state/stats.js, awkward to factor out + const incStat = (stat, delta = 1) => { + assert.equal(stat, 'acceptanceQueueLength'); + kernelStats[stat] += delta; + const maxStat = `${stat}Max`; + if ( + kernelStats[maxStat] !== undefined && + kernelStats[stat] > kernelStats[maxStat] + ) { + kernelStats[maxStat] = kernelStats[stat]; + } + const upStat = `${stat}Up`; + if (kernelStats[upStat] !== undefined) { + kernelStats[upStat] += delta; + } + }; + + for (const [kpid, vatID] of buggyKPIDs) { + const m = harden({ type: 'notify', vatID, kpid }); + incrementReferenceCount(getRequired, kvStore, kpid, `enq|notify`); + addToQueue('acceptanceQueue', m, getRequired, kvStore, incStat); + } + + kvStore.set('kernelStats', JSON.stringify(kernelStats)); + + console.log(` - #9039 remediation complete`); + modified = true; + version = 3; + } + if (modified) { kvStore.set('version', `${version}`); } diff --git a/packages/SwingSet/src/kernel/state/kernelKeeper.js b/packages/SwingSet/src/kernel/state/kernelKeeper.js index bd58d97302b7..7d8ec56c3cb5 100644 --- a/packages/SwingSet/src/kernel/state/kernelKeeper.js +++ b/packages/SwingSet/src/kernel/state/kernelKeeper.js @@ -54,7 +54,7 @@ const enableKernelGC = true; export { DEFAULT_REAP_DIRT_THRESHOLD_KEY }; // most recent DB schema version -export const CURRENT_SCHEMA_VERSION = 2; +export const CURRENT_SCHEMA_VERSION = 3; // Kernel state lives in a key-value store supporting key retrieval by // lexicographic range. All keys and values are strings. @@ -73,9 +73,9 @@ export const CURRENT_SCHEMA_VERSION = 2; // only modified by a call to upgradeSwingset(). See below for // deltas/upgrades from one version to the next. // -// The current ("v2") schema keys/values are: +// The current ("v3") schema keys/values are: // -// version = '2' +// version = '3' // vat.names = JSON([names..]) // vat.dynamicIDs = JSON([vatIDs..]) // vat.name.$NAME = $vatID = v$NN @@ -179,6 +179,9 @@ export const CURRENT_SCHEMA_VERSION = 2; // v2: // * change `version` to `'2'` // * add `vats.terminated` with `[]` as initial value +// v3: +// * change `version` to `'3'` +// * perform remediation for bug #9039 /** @type {(s: string) => string[]} s */ export function commaSplit(s) { diff --git a/packages/SwingSet/test/snapshots/state.test.js.md b/packages/SwingSet/test/snapshots/state.test.js.md index 4c02a66077df..674e2a3c8116 100644 --- a/packages/SwingSet/test/snapshots/state.test.js.md +++ b/packages/SwingSet/test/snapshots/state.test.js.md @@ -8,8 +8,8 @@ Generated by [AVA](https://avajs.dev). > initial state - '7b16bffd29f6a2d11bae7b536ef4c230af8cadc29284928b6cc2f7338507a987' + 'af35907384e9d63dd9fc4d4df0440005c0ee81ef88f86089a0e0a280fe3793af' > expected activityhash - '7dbf5a49d4e2b999c431730fcd4927c01c713eaa54fe273626e4201853e38d3b' + '040e27413c25f3ce668d9778add3b3d39547358ded553c0b9fba898004968d1b' diff --git a/packages/SwingSet/test/snapshots/state.test.js.snap b/packages/SwingSet/test/snapshots/state.test.js.snap index 0efcf0fac96c..a4a79f56520a 100644 Binary files a/packages/SwingSet/test/snapshots/state.test.js.snap and b/packages/SwingSet/test/snapshots/state.test.js.snap differ diff --git a/packages/SwingSet/test/state.test.js b/packages/SwingSet/test/state.test.js index c60a01aa79c6..8a5076b14a6d 100644 --- a/packages/SwingSet/test/state.test.js +++ b/packages/SwingSet/test/state.test.js @@ -183,7 +183,7 @@ test('kernel state', async t => { k.emitCrankHashes(); checkState(t, store.dump, [ - ['version', '2'], + ['version', '3'], ['crankNumber', '0'], ['gcActions', '[]'], ['runQueue', '[1,1]'], @@ -223,7 +223,7 @@ test('kernelKeeper vat names', async t => { k.emitCrankHashes(); checkState(t, store.dump, [ - ['version', '2'], + ['version', '3'], ['crankNumber', '0'], ['gcActions', '[]'], ['runQueue', '[1,1]'], @@ -279,7 +279,7 @@ test('kernelKeeper device names', async t => { k.emitCrankHashes(); checkState(t, store.dump, [ - ['version', '2'], + ['version', '3'], ['crankNumber', '0'], ['gcActions', '[]'], ['runQueue', '[1,1]'], @@ -462,7 +462,7 @@ test('kernelKeeper promises', async t => { k.emitCrankHashes(); checkState(t, store.dump, [ - ['version', '2'], + ['version', '3'], ['crankNumber', '0'], ['device.nextID', '7'], ['vat.nextID', '1'], @@ -1078,7 +1078,7 @@ test('dirt upgrade', async t => { // * v3.reapCountdown: 'never' // * v3.reapInterval: 'never' - t.is(k.kvStore.get('version'), '2'); + t.is(k.kvStore.get('version'), '3'); k.kvStore.delete(`kernel.defaultReapDirtThreshold`); k.kvStore.set(`kernel.defaultReapInterval`, '1000'); @@ -1168,7 +1168,7 @@ test('v2 upgrade', async t => { k.saveStats(); // roll back to v1 - t.is(k.kvStore.get('version'), '2'); + t.is(k.kvStore.get('version'), '3'); k.kvStore.delete(`vats.terminated`); k.kvStore.set('version', '1'); @@ -1187,5 +1187,5 @@ test('v2 upgrade', async t => { t.true(k2.kvStore.has(`vats.terminated`)); t.deepEqual(JSON.parse(k2.kvStore.get(`vats.terminated`)), []); - t.is(k2.kvStore.get(`version`), '2'); + t.is(k2.kvStore.get(`version`), '3'); }); diff --git a/packages/SwingSet/test/transcript-light.test.js b/packages/SwingSet/test/transcript-light.test.js index 9049eb57845a..e911a4e3451f 100644 --- a/packages/SwingSet/test/transcript-light.test.js +++ b/packages/SwingSet/test/transcript-light.test.js @@ -17,7 +17,7 @@ test('transcript-light load', async t => { t.teardown(c.shutdown); const serialized0 = debug.serialize(); const kvstate0 = debug.dump().kvEntries; - t.is(kvstate0.version, '2'); + t.is(kvstate0.version, '3'); t.is(kvstate0.runQueue, '[1,1]'); t.not(kvstate0.acceptanceQueue, '[]'); diff --git a/packages/SwingSet/test/upgrade-swingset.test.js b/packages/SwingSet/test/upgrade-swingset.test.js index a18b2699dd83..a14e409b83ed 100644 --- a/packages/SwingSet/test/upgrade-swingset.test.js +++ b/packages/SwingSet/test/upgrade-swingset.test.js @@ -1,9 +1,12 @@ /* eslint-disable no-underscore-dangle */ // @ts-nocheck -import { initSwingStore } from '@agoric/swing-store'; +// eslint-disable-next-line import/order import { test } from '../tools/prepare-test-env-ava.js'; +import { initSwingStore } from '@agoric/swing-store'; +import { kser } from '@agoric/kmarshal'; + import { initializeSwingset, makeSwingsetController, @@ -28,7 +31,7 @@ test('kernel refuses to run with out-of-date DB - v0', async t => { // kernelkeeper v0 schema, just deleting the version key and adding // 'initialized' - t.is(kvStore.get('version'), '2'); + t.is(kvStore.get('version'), '3'); kvStore.delete(`version`); kvStore.set('initialized', 'true'); await commit(); @@ -51,7 +54,7 @@ test('kernel refuses to run with out-of-date DB - v1', async t => { // kernelkeeper v1 schema, by reducing the version key and removing // vats.terminated - t.is(kvStore.get('version'), '2'); + t.is(kvStore.get('version'), '3'); kvStore.set(`version`, '1'); kvStore.delete('vats.terminated'); await commit(); @@ -62,6 +65,28 @@ test('kernel refuses to run with out-of-date DB - v1', async t => { }); }); +test('kernel refuses to run with out-of-date DB - v2', async t => { + const { hostStorage, kernelStorage } = initSwingStore(); + const { commit } = hostStorage; + const { kvStore } = kernelStorage; + const config = {}; + await initializeSwingset(config, [], kernelStorage, t.context.data); + await commit(); + + // now doctor the initial state to make it look like the + // kernelkeeper v1 schema, by reducing the version key and removing + // vats.terminated + + t.is(kvStore.get('version'), '3'); + kvStore.set(`version`, '2'); + await commit(); + + // Now build a controller around this modified state, which should fail. + await t.throwsAsync(() => makeSwingsetController(kernelStorage), { + message: /kernel DB is too old/, + }); +}); + test('upgrade kernel state', async t => { const { hostStorage, kernelStorage } = initSwingStore(); const { commit } = hostStorage; @@ -96,7 +121,7 @@ test('upgrade kernel state', async t => { t.true(kvStore.has('kernel.defaultReapDirtThreshold')); - t.is(kvStore.get('version'), '2'); + t.is(kvStore.get('version'), '3'); kvStore.delete('version'); // i.e. revert to v0 kvStore.set('initialized', 'true'); kvStore.delete('vats.terminated'); @@ -186,7 +211,7 @@ test('upgrade non-reaping kernel state', async t => { t.true(kvStore.has('kernel.defaultReapDirtThreshold')); - t.is(kvStore.get('version'), '2'); + t.is(kvStore.get('version'), '3'); kvStore.delete('version'); // i.e. revert to v0 kvStore.set('initialized', 'true'); kvStore.delete('vats.terminated'); @@ -229,3 +254,190 @@ test('upgrade non-reaping kernel state', async t => { gcKrefs: 'never', }); }); + +test('v3 upgrade', async t => { + // exercise the remediation code for bug #9039 + const { hostStorage, kernelStorage, debug } = initSwingStore(); + const { commit } = hostStorage; + const { kvStore } = kernelStorage; + const config = {}; + await initializeSwingset(config, [], kernelStorage, t.context.data); + await commit(); + + // doctor the initial state to inject #9039 problems, then check + // that upgrade applies the expected fixes. We pretend that + // v1-vatAdmin was upgraded and left some promises lying around. + + const vatID = kvStore.get('vat.name.vatAdmin'); + t.truthy(vatID); + + const disconnectionObject = { + name: 'vatUpgraded', + upgradeMessage: 'test upgrade', + incarnationNumber: 0, + }; + const dccd = kser(disconnectionObject); + + t.is(kvStore.get('version'), '3'); + kvStore.set('version', '2'); // revert to v2 + const runQueue = []; + const acceptanceQueue = []; + const nextID = Number(kvStore.get('kp.nextID')); + const p1 = `kp${nextID}`; + const p2 = `kp${nextID + 1}`; + const p3 = `kp${nextID + 2}`; + const p4 = `kp${nextID + 3}`; + const p5 = `kp${nextID + 4}`; + const p6 = `kp${nextID + 5}`; + kvStore.set('kp.nextID', `${nextID + 6}`); + + // first promise "was" known only to the upgraded vat, but not + // self-subscribed, so no notify was sent: remediated + kvStore.set(`${p1}.state`, 'rejected'); + kvStore.set(`${p1}.data.body`, dccd.body); + kvStore.set(`${p1}.data.slots`, ''); + kvStore.set(`${p1}.refCount`, '1'); + kvStore.set(`${vatID}.c.${p1}`, 'R p+90'); + kvStore.set(`${vatID}.c.p+90`, p1); + + // second promise was also only known to upgraded vat, but we + // pretend it was self-subscribed, and the notify is still sitting + // in the run-queue: ignored + kvStore.set(`${p2}.state`, 'rejected'); + kvStore.set(`${p2}.data.body`, dccd.body); + kvStore.set(`${p2}.data.slots`, ''); + kvStore.set(`${p2}.refCount`, '2'); // c-list, runQueue + kvStore.set(`${vatID}.c.${p2}`, 'R p+91'); + kvStore.set(`${vatID}.c.p+91`, p2); + runQueue.push({ type: 'notify', vatID, kpid: p2 }); + + // third promise is only known to upgraded vat, but self-subscribed, + // and the notify is still sitting in the acceptance queue: ignored + kvStore.set(`${p3}.state`, 'rejected'); + kvStore.set(`${p3}.data.body`, dccd.body); + kvStore.set(`${p3}.data.slots`, ''); + kvStore.set(`${p3}.refCount`, '2'); // c-list, acceptanceQueue + kvStore.set(`${vatID}.c.${p3}`, 'R p+92'); + kvStore.set(`${vatID}.c.p+92`, p3); + acceptanceQueue.push({ type: 'notify', vatID, kpid: p3 }); + + // fourth promise has additional references, still remediated + kvStore.set(`${p4}.state`, 'rejected'); + kvStore.set(`${p4}.data.body`, dccd.body); + kvStore.set(`${p4}.data.slots`, ''); + // note: we aren't being specific about *where* the other reference + // is coming from. A plausible source is an argument of a message + // queued to some other unresolved promise. A non-plausible one is + // in the c-list of some other vat (as a settled promise that one + // should have gotten a notify too, assuming they were subscribed, + // and they shouldn't be not subscribed). If the refcounts were + // stored in a DB with more runtime checking, we'd be creating an + // illegal situation here, but it's not. + kvStore.set(`${p4}.refCount`, '2'); // c-list, other + kvStore.set(`${vatID}.c.${p4}`, 'R p+93'); + kvStore.set(`${vatID}.c.p+93`, p4); + + // fifth promise is fulfilled, not rejected, without a notify: + // remediated (even though strictly speaking 9039 is about rejected + // promises) + kvStore.set(`${p5}.state`, 'fulfilled'); + kvStore.set(`${p5}.data.body`, '#{}'); + kvStore.set(`${p5}.data.slots`, ''); + kvStore.set(`${p5}.refCount`, '1'); + kvStore.set(`${vatID}.c.${p5}`, 'R p+95'); + kvStore.set(`${vatID}.c.p+95`, p5); + + // sixth promise is unresolved: ignored + kvStore.set(`${p6}.state`, 'unresolved'); + kvStore.set(`${p6}.subscribers`, ''); + kvStore.set(`${p6}.queue.nextID`, `0`); + kvStore.set(`${p6}.refCount`, `1`); + kvStore.set(`${p6}.decider`, vatID); + kvStore.set(`${vatID}.c.${p6}`, 'R p+96'); + kvStore.set(`${vatID}.c.p+96`, p6); + + // now update queues + + // eslint-disable-next-line prefer-const + let [runHead, runTail] = JSON.parse(kvStore.get('runQueue')); + for (const m of runQueue) { + kvStore.set(`runQueue.${runTail}`, JSON.stringify(m)); + runTail += 1; + } + kvStore.set('runQueue', JSON.stringify([runHead, runTail])); + + // eslint-disable-next-line prefer-const + let [accHead, accTail] = JSON.parse(kvStore.get('acceptanceQueue')); + for (const m of acceptanceQueue) { + kvStore.set(`acceptanceQueue.${accTail}`, JSON.stringify(m)); + accTail += 1; + } + kvStore.set('acceptanceQueue', JSON.stringify([accHead, accTail])); + + let stats = JSON.parse(kvStore.get('kernelStats')); + stats.runQueueLength += runQueue.length; + stats.runQueueLengthUp += runQueue.length; + stats.runQueueLengthMax = runQueue.length; + stats.acceptanceQueueLength += acceptanceQueue.length; + stats.acceptanceQueueLengthUp += acceptanceQueue.length; + stats.acceptanceQueueLengthMax = acceptanceQueue.length; + kvStore.set('kernelStats', JSON.stringify(stats)); + + await commit(); + + const data = { ...debug.dump().kvEntries }; + + // confirm that this state is too old for the kernel to use + await t.throwsAsync(() => makeSwingsetController(kernelStorage), { + message: /kernel DB is too old/, + }); + + // upgrade it + upgradeSwingset(kernelStorage); + // now we should be good to go + const _controller = await makeSwingsetController(kernelStorage); + + // check state by mutating our dumped copy and then comparing + // against a new dump + + t.deepEqual({ a: 1, b: 2 }, { b: 2, a: 1 }); + // expect notifies for p1/p4/p5 in acceptance queue + const [head, tail] = JSON.parse(kvStore.get('acceptanceQueue')); + t.is(head, accHead); + t.is(tail, accTail + 3); + data.acceptanceQueue = JSON.stringify([accHead, accTail + 3]); + // note: we aren't JSON-parsing the entries, so this depends upon + // the properties being assigned in this exact order + const np1 = JSON.stringify({ type: 'notify', vatID, kpid: p1 }); + data[`acceptanceQueue.${tail - 3}`] = np1; + const np4 = JSON.stringify({ type: 'notify', vatID, kpid: p4 }); + data[`acceptanceQueue.${tail - 2}`] = np4; + const np5 = JSON.stringify({ type: 'notify', vatID, kpid: p5 }); + data[`acceptanceQueue.${tail - 1}`] = np5; + + // stats are updated with the queue changes + stats = JSON.parse(data.kernelStats); + stats.acceptanceQueueLength += 3; + stats.acceptanceQueueLengthUp += 3; + stats.acceptanceQueueLengthMax = stats.acceptanceQueueLength; + data.kernelStats = JSON.stringify(stats); + + // the refcounts should now be one larger, because of the queued + // notifies + data[`${p1}.refCount`] = '2'; + data[`${p2}.refCount`] = '2'; + data[`${p3}.refCount`] = '2'; + data[`${p4}.refCount`] = '3'; + data[`${p5}.refCount`] = '2'; + data[`${p6}.refCount`] = '1'; + + // the version is bumped, indicating we don't need to perform this + // remediation again (because the bug is fixed and we won't be + // creating similar corruption in the future) + data.version = '3'; + + // no other state changes are expected + + const newData = { ...debug.dump().kvEntries }; + t.deepEqual(data, newData); +});