Skip to content

Commit

Permalink
fix(SwingSet): improve slogging during replay
Browse files Browse the repository at this point in the history
* during transcript replay, record (simulated) syscalls and delivery results
in the slog
  * previously we only recorded deliveries
  * this captures GC syscalls, to investigate divergence bugs
  * it also records metering results for the delivery, for the same reason
* during normal delivery, record syscall data even if the syscall fails
  * vat-fatal syscall errors, like dropping an object that was already
    retired, now capture the vat's attempted syscall, even if the translation
    into kernelspace failed, or the kernelSyscallHandler invocation failed
  * previously we lost all record of these syscall attempts, making it hard
    to debug the reason for vat termination
* slog entries for replayed deliveries/syscalls look just like the normal
  ones, but now have a `replay` boolean to distinguish the two
  * replayed deliveries do not have a `crankNum`, since we deliberately do
    not record that in the transcript, so we don't know the value during replay
* we compute a `deliveryNum` for each replayed message from the transcript
  position offset, which ought to match the original. This might need some
  refactoring to make fewer (or more) assumptions about the shape of a
  StreamPosition.

refs #3428 (not as a fix, but as a critical debugging tool)
  • Loading branch information
warner committed Jul 17, 2021
1 parent 94d9417 commit fec24f4
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 32 deletions.
54 changes: 31 additions & 23 deletions packages/SwingSet/src/kernel/kernel.js
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,9 @@ export default function buildKernel(
return harden(['error', problem]);
}
let ksc;
let kres;
let vres;

try {
// This can fail if the vat asks for something not on their clist, or
// their syscall doesn't make sense in some other way, or due to a
Expand All @@ -760,34 +763,39 @@ export default function buildKernel(
console.log(`error during syscall translation:`, vaterr);
const problem = 'syscall translation error: prepare to die';
setTerminationTrigger(vatID, true, true, makeError(problem));
return harden(['error', problem]);
vres = harden(['error', problem]);
// we leave this catch() with ksc=undefined, so no doKernelSyscall()
}

/** @type { FinishFunction } */
const finish = kernelSlog.syscall(vatID, ksc, vatSyscallObject);
let vres;
try {
// this can fail if kernel or device code is buggy
const kres = kernelSyscallHandler.doKernelSyscall(ksc);
// kres is a KernelResult ([successFlag, value]), but since errors
// here are signalled with exceptions, kres is ['ok', value]. Vats
// (liveslots) record the response in the transcript (which is why we
// use 'null' instead of 'undefined', TODO clean this up), but otherwise
// most syscalls ignore it. The one syscall that pays attention is
// callNow(), which assumes it's capdata.
vres = translators.kernelSyscallResultToVatSyscallResult(ksc[0], kres);
// here, vres is either ['ok', null] or ['ok', capdata]
finish(kres, vres); // TODO call meaningfully on failure too?
} catch (err) {
// kernel/device errors cause a kernel panic
panic(`error during syscall/device.invoke: ${err}`, err);
// the kernel is now in a shutdown state, but it may take a while to
// grind to a halt
const problem = 'you killed my kernel. prepare to die';
setTerminationTrigger(vatID, true, true, makeError(problem));
return harden(['error', problem]);
}

if (ksc) {
try {
// this can fail if kernel or device code is buggy
kres = kernelSyscallHandler.doKernelSyscall(ksc);
// kres is a KernelResult ([successFlag, value]), but since errors
// here are signalled with exceptions, kres is ['ok', value]. Vats
// (liveslots) record the response in the transcript (which is why we
// use 'null' instead of 'undefined', TODO clean this up), but otherwise
// most syscalls ignore it. The one syscall that pays attention is
// callNow(), which assumes it's capdata.
vres = translators.kernelSyscallResultToVatSyscallResult(
ksc[0],
kres,
);
// here, vres is either ['ok', null] or ['ok', capdata]
} catch (err) {
// kernel/device errors cause a kernel panic
panic(`error during syscall/device.invoke: ${err}`, err);
// the kernel is now in a shutdown state, but it may take a while to
// grind to a halt
const problem = 'you killed my kernel. prepare to die';
setTerminationTrigger(vatID, true, true, makeError(problem));
vres = harden(['error', problem]);
}
}
finish(kres, vres);
return vres;
}
return vatSyscallHandler;
Expand Down
4 changes: 2 additions & 2 deletions packages/SwingSet/src/kernel/slogger.js
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,12 @@ export function makeSlogger(slogCallbacks, writeObj) {
}

// kd: kernelDelivery, vd: vatDelivery
function delivery(newCrankNum, newDeliveryNum, kd, vd) {
function delivery(newCrankNum, newDeliveryNum, kd, vd, replay = false) {
assertOldState(IDLE, 'reentrant delivery?');
state = DELIVERY;
crankNum = newCrankNum;
deliveryNum = newDeliveryNum;
const when = { crankNum, vatID, deliveryNum };
const when = { crankNum, vatID, deliveryNum, replay };
write({ type: 'deliver', ...when, kd, vd });
syscallNum = 0;

Expand Down
32 changes: 25 additions & 7 deletions packages/SwingSet/src/kernel/vatManager/manager-helper.js
Original file line number Diff line number Diff line change
Expand Up @@ -166,17 +166,26 @@ function makeManagerKit(
assert(transcriptManager, `delivery replay with no transcript`);
transcriptManager.startReplay();
transcriptManager.startReplayDelivery(expectedSyscalls);
kernelSlog.write({
type: 'start-replay-delivery',

// we slog the replay just like the original, but some fields are missing
const newCrankNum = undefined; // TODO think of a way to correlate this
const kd = undefined;
const vd = delivery;
const replay = true;
const finish = kernelSlog.delivery(
vatID,
delivery,
newCrankNum,
deliveryNum,
});
await deliver(delivery);
kd,
vd,
replay,
);
const status = await deliver(delivery);
finish(status);
transcriptManager.finishReplayDelivery();
transcriptManager.checkReplayError();
transcriptManager.finishReplay();
kernelSlog.write({ type: 'finish-replay-delivery', vatID, deliveryNum });
return status;
}

/**
Expand All @@ -189,7 +198,15 @@ function makeManagerKit(
if (transcriptManager) {
const total = vatKeeper.vatStats().transcriptCount;
kernelSlog.write({ type: 'start-replay', vatID, deliveries: total });
let deliveryNum = 0;
// TODO glean deliveryNum better, make sure we get the post-snapshot
// transcript starting point right. getTranscript() should probably
// return [deliveryNum, t] pairs. Think about how to provide an
// accurate crankNum, because I'm not sure I want that in transcript.
// I'm guarding against `.itemCount` being missing because that's
// supposed to be a private field of StreamPosition, but now that we've
// switched streamStore to sqlite it might be reasonable to make it
// public (or just define StreamPosition to be an integer).
let deliveryNum = (startPos && startPos.itemCount) || 0;
for (const t of vatKeeper.getTranscript(startPos)) {
// if (deliveryNum % 100 === 0) {
// console.debug(`replay vatID:${vatID} deliveryNum:${deliveryNum} / ${total}`);
Expand Down Expand Up @@ -224,6 +241,7 @@ function makeManagerKit(
// its sleep. Gently prevent their twitching paws from doing anything.

// but if the puppy deviates one inch from previous twitches, explode
kernelSlog.syscall(vatID, undefined, vso);
const data = transcriptManager.simulateSyscall(vso);
return harden(['ok', data]);
}
Expand Down

0 comments on commit fec24f4

Please sign in to comment.