Merge pull request tigerbeetle#1990 from tigerbeetle/dj-grid-scrub-cycle
Grid/Scrubber: Model probability of data loss in terms of cycle interval
sentientwaffle authored Jun 7, 2024
2 parents 59ae156 + c370973 commit b17a2f2
Showing 3 changed files with 126 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/config.zig
@@ -129,7 +129,7 @@ const ConfigProcess = struct {
grid_missing_tables_max: usize = 3,
aof_record: bool = false,
grid_scrubber_reads_max: usize = 1,
- grid_scrubber_cycle_ms: usize = std.time.ms_per_day * 90,
+ grid_scrubber_cycle_ms: usize = std.time.ms_per_day * 180,
grid_scrubber_interval_ms_min: usize = std.time.ms_per_s / 20,
grid_scrubber_interval_ms_max: usize = std.time.ms_per_s * 10,
aof_recovery: bool = false,
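
The scrubber's per-read pacing follows from these values. The sketch below is not part of the commit: it assumes a fully-acquired 16TiB data file, 512KiB blocks, and that the scrubber issues roughly one block read per interval; under those assumptions the implied interval (≈463ms) sits between grid_scrubber_interval_ms_min (50ms) and grid_scrubber_interval_ms_max (10s).

const std = @import("std");

// Illustrative sketch (not from the commit): check that one block read per scrub interval,
// over a fully-loaded 16TiB data file of 512KiB blocks, fits the configured interval bounds.
test "scrub interval implied by a 180-day cycle stays within bounds" {
    const storage_size: u64 = 16 * 1024 * 1024 * 1024 * 1024; // 16TiB
    const block_size: u64 = 512 * 1024; // 512KiB
    const blocks_total = storage_size / block_size; // 33_554_432 blocks
    const cycle_ms: u64 = std.time.ms_per_day * 180;
    const interval_ms = cycle_ms / blocks_total; // ≈463ms between block reads

    const interval_ms_min: u64 = std.time.ms_per_s / 20; // 50ms
    const interval_ms_max: u64 = std.time.ms_per_s * 10; // 10s
    try std.testing.expect(interval_ms >= interval_ms_min);
    try std.testing.expect(interval_ms <= interval_ms_max);
}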
4 changes: 2 additions & 2 deletions src/constants.zig
@@ -366,8 +366,8 @@ pub const grid_scrubber_reads_max = config.process.grid_scrubber_reads_max;
/// (assuming a fully-loaded data file – maximum size and 100% acquired):
///
/// storage_size_limit_max = 16TiB
- ///   grid_scrubber_cycle_seconds = 90 days * 24 hr/day * 60 min/hr * 60 s/min (1 cycle/year)
- ///   read_bytes_per_second = storage_size_max / grid_scrubber_cycle_seconds ≈ 2.16 MiB/s
+ ///   grid_scrubber_cycle_seconds = 180 days * 24 hr/day * 60 min/hr * 60 s/min (2 cycle/year)
+ ///   read_bytes_per_second = storage_size_max / grid_scrubber_cycle_seconds ≈ 1.08 MiB/s
///
pub const grid_scrubber_cycle_ticks = config.process.grid_scrubber_cycle_ms / tick_ms;
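
As a cross-check on the figure in the comment above, a minimal sketch (not part of the commit; the names are illustrative) that redoes the arithmetic:

const std = @import("std");

// Illustrative sketch (not from the commit): 16TiB scrubbed once per 180-day cycle
// works out to roughly 1.08 MiB/s of background read bandwidth.
test "scrubber read bandwidth back-of-envelope" {
    const storage_size_limit_max: f64 = 16.0 * 1024 * 1024 * 1024 * 1024; // 16TiB
    const grid_scrubber_cycle_seconds: f64 = 180.0 * 24 * 60 * 60; // 180 days
    const read_bytes_per_second = storage_size_limit_max / grid_scrubber_cycle_seconds;
    const read_mib_per_second = read_bytes_per_second / (1024.0 * 1024.0);
    try std.testing.expect(read_mib_per_second > 1.07);
    try std.testing.expect(read_mib_per_second < 1.09);
}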

123 changes: 123 additions & 0 deletions src/vsr/grid_scrubber.zig
@@ -636,3 +636,126 @@ fn ManifestBlockIteratorType(comptime ManifestLog: type) type {
}
};
}

// Model the probability that the cluster experiences data loss due to bitrot.
// Specifically, that *every* copy of *any* block is corrupted before the scrubber can repair it.
//
// Optimistic assumptions (see below):
// - Faults are independent between replicas. ¹
// - Faults are independent (i.e. uncorrelated) in space and time. ²
//
// Pessimistic assumptions:
// - There are only 3 (quorum_replication) copies of each sector.
// - Scrub randomization is ignored.
// - The simulated fault rate is much greater than a real disk's. (See `sector_faults_per_year`).
// - Reads, writes, and repairs due to other workloads (besides the scrubber) are not modelled.
// - All blocks are always full (512KiB).
//
// ¹: To mitigate the risk of correlated errors in production, replicas could use different SSD
// (hardware) models.
//
// ²: SSD faults are not independent (in either time or space).
// See, for example:
// - "An In-Depth Study of Correlated Failures in Production SSD-Based Data Centers"
// (https://www.usenix.org/system/files/fast21-han.pdf)
// - "Flash Reliability in Production: The Expected and the Unexpected"
// (https://www.usenix.org/system/files/conference/fast16/fast16-papers-schroeder.pdf)
// That being said, for the purposes of modelling scrubbing, it is a decent approximation because
// blocks are large relative to sectors. (Additionally, blocks that are written together are often
// scrubbed together).
test "GridScrubber cycle interval" {
// Parameters:

// The number of years that the test is "running". As the test runs longer, the probability that
// the cluster will experience data loss increases.
const test_duration_years = 20;

// The number of days between scrubs of a particular sector.
// Equivalently, the number of days to scrub the entire data file.
const cycle_interval_days = 180;

// The total size of the data file.
// Note that since this parameter is separate from the faults/year rate, increasing
// `storage_size` actually reduces the likelihood of data loss.
const storage_size = 16 * (1024 * 1024 * 1024 * 1024);

// The expected (average) number of sector faults per year.
// I can't find any good, recent statistics for faults on SSDs.
//
// Most papers express the fault rate as "UBER" (uncorrectable bit errors per total bits read).
// But "Flash Reliability in Production: The Expected and the Unexpected" §5.1 finds that
// UBER's underlying assumption (that the number of uncorrectable errors is correlated with the
// number of bytes read) is false. (That paper only shares "fraction of drives affected by an error",
// which is too coarse for this model's purposes.)
//
// Instead, the parameter is chosen conservatively: greater than the "true" number by at least
// an order of magnitude.
const sector_faults_per_year = 10_000;

// A block has multiple sectors. If any of a block's sectors are corrupt, then the block is
// corrupt.
//
// Increasing this parameter increases the likelihood of eventual data loss.
// (Intuitively, a single bitrot within 1GiB is more likely than a single bitrot within 1KiB.)
const block_size = 512 * 1024;

// The total number of copies of each sector.
// The cluster is recoverable if a sector's number of faults is less than `replicas_total`.
// Set to 3 rather than 6 since 3 is the quorum_replication.
const replicas_total = 3;

const sector_size = constants.sector_size;

// Computation:

const block_sectors = @divExact(block_size, sector_size);
const storage_sectors = @divExact(storage_size, sector_size);
const storage_blocks = @divExact(storage_size, block_size);
const test_duration_days = test_duration_years * 365;
const test_duration_cycles = stdx.div_ceil(test_duration_days, cycle_interval_days);
const sector_faults_per_cycle =
stdx.div_ceil(sector_faults_per_year * cycle_interval_days, 365);

// P(a specific block is uncorrupted for an entire cycle)
// If any of the block's sectors is corrupted, then the whole block is corrupted.
const p_block_healthy_per_cycle = std.math.pow(
f64,
@as(f64, @floatFromInt(storage_sectors - block_sectors)) /
@as(f64, @floatFromInt(storage_sectors)),
@as(f64, @floatFromInt(sector_faults_per_cycle)),
);

const p_block_corrupt_per_cycle = 1.0 - p_block_healthy_per_cycle;
// P(a specific block is corrupted on all replicas during a single cycle)
const p_cluster_block_corrupt_per_cycle =
std.math.pow(f64, p_block_corrupt_per_cycle, @as(f64, @floatFromInt(replicas_total)));
// P(a specific block is uncorrupted on at least one replica during a single cycle)
const p_cluster_block_healthy_per_cycle = 1.0 - p_cluster_block_corrupt_per_cycle;

// P(a specific block is uncorrupted on at least one replica for all cycles)
// Note that each cycle can be considered independently because we assume that if, at the end
// of the cycle, there is at least one healthy copy, then all of the corrupt copies are repaired.
const p_cluster_block_healthy_per_span = std.math.pow(
f64,
p_cluster_block_healthy_per_cycle,
@as(f64, @floatFromInt(test_duration_cycles)),
);

// P(each block is uncorrupted on at least one replica for all cycles)
const p_cluster_blocks_healthy_per_span = std.math.pow(
f64,
p_cluster_block_healthy_per_span,
@as(f64, @floatFromInt(storage_blocks)),
);

// P(at some point during all cycles, at least one block is corrupt across all replicas)
// In other words, P(eventual data loss).
const p_cluster_blocks_corrupt_per_span = 1.0 - p_cluster_blocks_healthy_per_span;

const Snap = @import("../testing/snaptest.zig").Snap;
const snap = Snap.snap;

try snap(@src(),
\\4.3582921528e-03
).diff_fmt("{e:.10}", .{p_cluster_blocks_corrupt_per_span});
}
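
For reference, the step-by-step computation above collapses to a single closed form (same assumptions; written in the plain style of the constants.zig comment, with S = storage sectors, s = sectors per block, f = sector faults per cycle, r = replicas, B = blocks, C = cycles):

P(eventual data loss) = 1 - (1 - (1 - ((S - s) / S)^f)^r)^(B * C)

Assuming constants.sector_size = 4KiB (not shown in this diff), the test's parameters give s = 128, S = 2^32, f = 4932, r = 3, B = 2^25, C = 41, which reproduces the ≈4.36e-03 value snapshotted above.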
