Merge pull request tigerbeetle#1990 from tigerbeetle/dj-grid-scrub-cycle
Grid/Scrubber: Model probability of data loss in terms of cycle interval
sentientwaffle authored Jun 7, 2024
2 parents 59ae156 + c370973 commit b17a2f2
Showing 3 changed files with 126 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/config.zig
@@ -129,7 +129,7 @@ const ConfigProcess = struct {
grid_missing_tables_max: usize = 3,
aof_record: bool = false,
grid_scrubber_reads_max: usize = 1,
- grid_scrubber_cycle_ms: usize = std.time.ms_per_day * 90,
+ grid_scrubber_cycle_ms: usize = std.time.ms_per_day * 180,
grid_scrubber_interval_ms_min: usize = std.time.ms_per_s / 20,
grid_scrubber_interval_ms_max: usize = std.time.ms_per_s * 10,
aof_recovery: bool = false,
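
The scrubber's per-read pacing follows from these values. The sketch below is not part of the commit: it assumes a fully-acquired 16TiB data file, 512KiB blocks, and that the scrubber issues roughly one block read per interval; under those assumptions the implied interval (≈463ms) sits between grid_scrubber_interval_ms_min (50ms) and grid_scrubber_interval_ms_max (10s).

const std = @import("std");

// Illustrative sketch (not from the commit): check that one block read per scrub interval,
// over a fully-loaded 16TiB data file of 512KiB blocks, fits the configured interval bounds.
test "scrub interval implied by a 180-day cycle stays within bounds" {
    const storage_size: u64 = 16 * 1024 * 1024 * 1024 * 1024; // 16TiB
    const block_size: u64 = 512 * 1024; // 512KiB
    const blocks_total = storage_size / block_size; // 33_554_432 blocks
    const cycle_ms: u64 = std.time.ms_per_day * 180;
    const interval_ms = cycle_ms / blocks_total; // ≈463ms between block reads

    const interval_ms_min: u64 = std.time.ms_per_s / 20; // 50ms
    const interval_ms_max: u64 = std.time.ms_per_s * 10; // 10s
    try std.testing.expect(interval_ms >= interval_ms_min);
    try std.testing.expect(interval_ms <= interval_ms_max);
}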
4 changes: 2 additions & 2 deletions src/constants.zig
@@ -366,8 +366,8 @@ pub const grid_scrubber_reads_max = config.process.grid_scrubber_reads_max;
/// (assuming a fully-loaded data file – maximum size and 100% acquired):
///
/// storage_size_limit_max = 16TiB
- ///   grid_scrubber_cycle_seconds = 90 days * 24 hr/day * 60 min/hr * 60 s/min (1 cycle/year)
- ///   read_bytes_per_second = storage_size_max / grid_scrubber_cycle_seconds ≈ 2.16 MiB/s
+ ///   grid_scrubber_cycle_seconds = 180 days * 24 hr/day * 60 min/hr * 60 s/min (2 cycle/year)
+ ///   read_bytes_per_second = storage_size_max / grid_scrubber_cycle_seconds ≈ 1.08 MiB/s
///
pub const grid_scrubber_cycle_ticks = config.process.grid_scrubber_cycle_ms / tick_ms;
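
As a cross-check on the figure in the comment above, a minimal sketch (not part of the commit; the names are illustrative) that redoes the arithmetic:

const std = @import("std");

// Illustrative sketch (not from the commit): 16TiB scrubbed once per 180-day cycle
// works out to roughly 1.08 MiB/s of background read bandwidth.
test "scrubber read bandwidth back-of-envelope" {
    const storage_size_limit_max: f64 = 16.0 * 1024 * 1024 * 1024 * 1024; // 16TiB
    const grid_scrubber_cycle_seconds: f64 = 180.0 * 24 * 60 * 60; // 180 days
    const read_bytes_per_second = storage_size_limit_max / grid_scrubber_cycle_seconds;
    const read_mib_per_second = read_bytes_per_second / (1024.0 * 1024.0);
    try std.testing.expect(read_mib_per_second > 1.07);
    try std.testing.expect(read_mib_per_second < 1.09);
}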

123 changes: 123 additions & 0 deletions src/vsr/grid_scrubber.zig
@@ -636,3 +636,126 @@ fn ManifestBlockIteratorType(comptime ManifestLog: type) type {
}
};
}

// Model the probability that the cluster experiences data loss due to bitrot.
// Specifically, that *every* copy of *any* block is corrupted before the scrubber can repair it.
//
// Optimistic assumptions (see below):
// - Faults are independent between replicas. ¹
// - Faults are independent (i.e. uncorrelated) in space and time. ²
//
// Pessimistic assumptions:
// - There are only 3 (quorum_replication) copies of each sector.
// - Scrub randomization is ignored.
// - The simulated fault rate is much greater than a real disk's. (See `sector_faults_per_year`).
// - Reads, writes, and repairs due to other workloads (besides the scrubber) are not modelled.
// - All blocks are always full (512KiB).
//
// ¹: To mitigate the risk of correlated errors in production, replicas could use different SSD
// (hardware) models.
//
// ²: SSD faults are not independent (in either time or space).
// See, for example:
// - "An In-Depth Study of Correlated Failures in Production SSD-Based Data Centers"
// (https://www.usenix.org/system/files/fast21-han.pdf)
// - "Flash Reliability in Production: The Expected and the Unexpected"
// (https://www.usenix.org/system/files/conference/fast16/fast16-papers-schroeder.pdf)
// That being said, for the purposes of modelling scrubbing, it is a decent approximation because
// blocks are large relative to sectors. (Additionally, blocks that are written together are often
// scrubbed together).
test "GridScrubber cycle interval" {
// Parameters:

// The number of years that the test is "running". As the test runs longer, the probability that
// the cluster will experience data loss increases.
const test_duration_years = 20;

// The number of days between scrubs of a particular sector.
// Equivalently, the number of days to scrub the entire data file.
const cycle_interval_days = 180;

// The total size of the data file.
// Note that since this parameter is separate from the faults/year rate, increasing
// `storage_size` actually reduces the likelihood of data loss.
const storage_size = 16 * (1024 * 1024 * 1024 * 1024);

// The expected (average) number of sector faults per year.
// I can't find any good, recent statistics for faults on SSDs.
//
// Most papers express the fault rate as "UBER" (uncorrectable bit errors per total bits read).
// But "Flash Reliability in Production: The Expected and the Unexpected" §5.1 finds that
// UBER's underlying assumption (that the number of uncorrectable errors is correlated with the
// number of bytes read) is false. (That paper only shares "fraction of drives affected by an error",
// which is too coarse for this model's purposes.)
//
// Instead, the parameter is chosen conservatively: greater than the "true" number by at least
// an order of magnitude.
const sector_faults_per_year = 10_000;

// A block has multiple sectors. If any of a block's sectors are corrupt, then the block is
// corrupt.
//
// Increasing this parameter increases the likelihood of eventual data loss.
// (Intuitively, a single bitrot within 1GiB is more likely than a single bitrot within 1KiB.)
const block_size = 512 * 1024;

// The total number of copies of each sector.
// The cluster is recoverable if a sector's number of faults is less than `replicas_total`.
// Set to 3 rather than 6 since 3 is the quorum_replication.
const replicas_total = 3;

const sector_size = constants.sector_size;

// Computation:

const block_sectors = @divExact(block_size, sector_size);
const storage_sectors = @divExact(storage_size, sector_size);
const storage_blocks = @divExact(storage_size, block_size);
const test_duration_days = test_duration_years * 365;
const test_duration_cycles = stdx.div_ceil(test_duration_days, cycle_interval_days);
const sector_faults_per_cycle =
stdx.div_ceil(sector_faults_per_year * cycle_interval_days, 365);

// P(a specific block is uncorrupted for an entire cycle)
// If any of the block's sectors is corrupted, then the whole block is corrupted.
const p_block_healthy_per_cycle = std.math.pow(
f64,
@as(f64, @floatFromInt(storage_sectors - block_sectors)) /
@as(f64, @floatFromInt(storage_sectors)),
@as(f64, @floatFromInt(sector_faults_per_cycle)),
);

const p_block_corrupt_per_cycle = 1.0 - p_block_healthy_per_cycle;
// P(a specific block is corrupted on all replicas during a single cycle)
const p_cluster_block_corrupt_per_cycle =
std.math.pow(f64, p_block_corrupt_per_cycle, @as(f64, @floatFromInt(replicas_total)));
// P(a specific block is uncorrupted on at least one replica during a single cycle)
const p_cluster_block_healthy_per_cycle = 1.0 - p_cluster_block_corrupt_per_cycle;

// P(a specific block is uncorrupted on at least one replica for all cycles)
// Note that each cycle can be considered independently because we assume that if, at the end
// of the cycle, there is at least one healthy copy, then all of the corrupt copies are repaired.
const p_cluster_block_healthy_per_span = std.math.pow(
f64,
p_cluster_block_healthy_per_cycle,
@as(f64, @floatFromInt(test_duration_cycles)),
);

// P(each block is uncorrupted on at least one replica for all cycles)
const p_cluster_blocks_healthy_per_span = std.math.pow(
f64,
p_cluster_block_healthy_per_span,
@as(f64, @floatFromInt(storage_blocks)),
);

// P(at some point during all cycles, at least one block is corrupt across all replicas)
// In other words, P(eventual data loss).
const p_cluster_blocks_corrupt_per_span = 1.0 - p_cluster_blocks_healthy_per_span;

const Snap = @import("../testing/snaptest.zig").Snap;
const snap = Snap.snap;

try snap(@src(),
\\4.3582921528e-03
).diff_fmt("{e:.10}", .{p_cluster_blocks_corrupt_per_span});
}
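
For reference, the step-by-step computation above collapses to a single closed form (same assumptions; written in the plain style of the constants.zig comment, with S = storage sectors, s = sectors per block, f = sector faults per cycle, r = replicas, B = blocks, C = cycles):

P(eventual data loss) = 1 - (1 - (1 - ((S - s) / S)^f)^r)^(B * C)

Assuming constants.sector_size = 4KiB (not shown in this diff), the test's parameters give s = 128, S = 2^32, f = 4932, r = 3, B = 2^25, C = 41, which reproduces the ≈4.36e-03 value snapshotted above.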
