diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index dec70c60cec1..142f55b299e5 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -33,7 +33,7 @@ * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome - * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, 2024, Klara Inc. * Copyright (c) 2023, Rob Norris */ @@ -1914,23 +1914,25 @@ dump_log_spacemaps(spa_t *spa) } static void -dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) +dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, + uint64_t index) { - const ddt_phys_t *ddp = dde->dde_phys; - const ddt_key_t *ddk = &dde->dde_key; - const char *types[4] = { "ditto", "single", "double", "triple" }; + const ddt_key_t *ddk = &ddlwe->ddlwe_key; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; int p; - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) + for (p = 0; p < DDT_NPHYS(ddt); p++) { + const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0) continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); - (void) printf("index %llx refcnt %llu %s %s\n", - (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, - types[p], blkbuf); + (void) printf("index %llx refcnt %llu phys %d %s\n", + (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v), + p, blkbuf); } } @@ -1960,7 +1962,7 @@ static void dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; - ddt_entry_t dde; + ddt_lightweight_entry_t ddlwe; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; @@ -2001,8 +2003,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) (void) printf("%s contents:\n\n", name); - while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) - dump_dde(ddt, &dde, walk); + while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0) + dump_ddt_entry(ddt, &ddlwe, walk); ASSERT3U(error, ==, ENOENT); @@ -3287,9 +3289,45 @@ fuid_table_destroy(void) } } +/* + * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on + * a live pool are normally cleaned up during ddt_sync(). We can't do that (and + * wouldn't want to anyway), but if we don't clean up the presence of stuff on + * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves. + * + * Note that this is not a particularly efficient way to do this, but + * ddt_remove() is the only public method that can do the work we need, and it + * requires the right locks and etc to do the job. This is only ever called + * during zdb shutdown so efficiency is not especially important. 
+ */ +static void +zdb_ddt_cleanup(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + ddt_enter(ddt); + ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; + while (dde) { + next = AVL_NEXT(&ddt->ddt_tree, dde); + dde->dde_io = NULL; + ddt_remove(ddt, dde); + dde = next; + } + ddt_exit(ddt); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } +} + static void zdb_exit(int reason) { + if (spa != NULL) + zdb_ddt_cleanup(spa); + if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { @@ -5633,7 +5671,6 @@ static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { - uint64_t refcnt = 0; int i; ASSERT(type < ZDB_OT_TOTAL); @@ -5641,8 +5678,161 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; + /* + * This flag controls if we will issue a claim for the block while + * counting it, to ensure that all blocks are referenced in space maps. + * We don't issue claims if we're not doing leak tracking, because it's + * expensive if the user isn't interested. We also don't claim the + * second or later occurences of cloned or dedup'd blocks, because we + * already claimed them the first time. + */ + boolean_t do_claim = !dump_opt['L']; + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + blkptr_t tempbp; + if (BP_GET_DEDUP(bp)) { + /* + * Dedup'd blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * We use the existing dedup system to track what we've seen. + * The first time we see a block, we do a ddt_lookup() to see + * if it exists in the DDT. If we're doing leak tracking, we + * claim the block at this time. + * + * Each time we see a block, we reduce the refcount in the + * entry by one, and add to the size and count of dedup'd + * blocks to report at the end. + */ + + ddt_t *ddt = ddt_select(zcb->zcb_spa, bp); + + ddt_enter(ddt); + + /* + * Find the block. This will create the entry in memory, but + * we'll know if that happened by its refcount. + */ + ddt_entry_t *dde = ddt_lookup(ddt, bp); + + /* + * ddt_lookup() can only return NULL if this block didn't exist + * in the DDT and creating it would take the DDT over its + * quota. Since we got the block from disk, it must exist in + * the DDT, so this can't happen. + */ + VERIFY3P(dde, !=, NULL); + + /* Get the phys for this variant */ + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + + /* + * This entry may have multiple sets of DVAs. We must claim + * each set the first time we see them in a real block on disk, + * or count them on subsequent occurences. We don't have a + * convenient way to track the first time we see each variant, + * so we repurpose dde_io as a set of "seen" flag bits. We can + * do this safely in zdb because it never writes, so it will + * never have a writing zio for this block in that pointer. + */ + boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v)); + if (!seen) + dde->dde_io = + (void *)(((uintptr_t)dde->dde_io) | (1 << v)); + + /* Consume a reference for this block. */ + VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0); + ddt_phys_decref(dde->dde_phys, v); + + /* + * If this entry has a single flat phys, it may have been + * extended with additional DVAs at some time in its life. 
+ * This block might be from before it was fully extended, and + * so have fewer DVAs. + * + * If this is the first time we've seen this block, and we + * claimed it as-is, then we would miss the claim on some + * number of DVAs, which would then be seen as leaked. + * + * In all cases, if we've had fewer DVAs, then the asize would + * be too small, and would lead to the pool apparently using + * more space than allocated. + * + * To handle this, we copy the canonical set of DVAs from the + * entry back to the block pointer before we claim it. + */ + if (v == DDT_PHYS_FLAT) { + ASSERT3U(BP_GET_BIRTH(bp), ==, + ddt_phys_birth(dde->dde_phys, v)); + tempbp = *bp; + ddt_bp_fill(dde->dde_phys, v, &tempbp, + BP_GET_BIRTH(bp)); + bp = &tempbp; + } + + if (seen) { + /* + * The second or later time we see this block, + * it's a duplicate and we count it. + */ + zcb->zcb_dedup_asize += BP_GET_ASIZE(bp); + zcb->zcb_dedup_blocks++; + + /* Already claimed, don't do it again. */ + do_claim = B_FALSE; + } + + ddt_exit(ddt); + } else if (zcb->zcb_brt_is_active && + brt_maybe_exists(zcb->zcb_spa, bp)) { + /* + * Cloned blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * To do this, we keep our own in-memory BRT. For each block + * we haven't seen before, we look it up in the real BRT and + * if its there, we note it and its refcount then proceed as + * normal. If we see the block again, we count it as a clone + * and then give it no further consideration. + */ + zdb_brt_entry_t zbre_search, *zbre; + avl_index_t where; + + zbre_search.zbre_dva = bp->blk_dva[0]; + zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); + if (zbre == NULL) { + /* Not seen before; track it */ + uint64_t refcnt = + brt_entry_get_refcount(zcb->zcb_spa, bp); + if (refcnt > 0) { + zbre = umem_zalloc(sizeof (zdb_brt_entry_t), + UMEM_NOFAIL); + zbre->zbre_dva = bp->blk_dva[0]; + zbre->zbre_refcount = refcnt; + avl_insert(&zcb->zcb_brt, zbre, where); + } + } else { + /* + * Second or later occurrence, count it and take a + * refcount. + */ + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + + zbre->zbre_refcount--; + if (zbre->zbre_refcount == 0) { + avl_remove(&zcb->zcb_brt, zbre); + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + + /* Already claimed, don't do it again. */ + do_claim = B_FALSE; + } + } + for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; @@ -5745,71 +5935,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); - if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { - /* - * Cloned blocks are special. We need to count them, so we can - * later uncount them when reporting leaked space, and we must - * only claim them them once. - * - * To do this, we keep our own in-memory BRT. For each block - * we haven't seen before, we look it up in the real BRT and - * if its there, we note it and its refcount then proceed as - * normal. If we see the block again, we count it as a clone - * and then give it no further consideration. 
- */ - zdb_brt_entry_t zbre_search, *zbre; - avl_index_t where; - - zbre_search.zbre_dva = bp->blk_dva[0]; - zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); - if (zbre != NULL) { - zcb->zcb_clone_asize += BP_GET_ASIZE(bp); - zcb->zcb_clone_blocks++; - - zbre->zbre_refcount--; - if (zbre->zbre_refcount == 0) { - avl_remove(&zcb->zcb_brt, zbre); - umem_free(zbre, sizeof (zdb_brt_entry_t)); - } - return; - } - - uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); - if (crefcnt > 0) { - zbre = umem_zalloc(sizeof (zdb_brt_entry_t), - UMEM_NOFAIL); - zbre->zbre_dva = bp->blk_dva[0]; - zbre->zbre_refcount = crefcnt; - avl_insert(&zcb->zcb_brt, zbre, where); - } - } - - if (dump_opt['L']) + if (!do_claim) return; - if (BP_GET_DEDUP(bp)) { - ddt_t *ddt; - ddt_entry_t *dde; - - ddt = ddt_select(zcb->zcb_spa, bp); - ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_FALSE); - - if (dde == NULL) { - refcnt = 0; - } else { - ddt_phys_t *ddp = ddt_phys_select(dde, bp); - ddt_phys_decref(ddp); - refcnt = ddp->ddp_refcnt; - if (ddt_phys_total_refcnt(dde) == 0) - ddt_remove(ddt, dde); - } - ddt_exit(ddt); - } - - VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, - refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), - bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); + VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa, + spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, + ZIO_FLAG_CANFAIL))); } static void @@ -6120,49 +6251,6 @@ zdb_load_obsolete_counts(vdev_t *vd) return (counts); } -static void -zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) -{ - ddt_bookmark_t ddb = {0}; - ddt_entry_t dde; - int error; - int p; - - ASSERT(!dump_opt['L']); - - while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { - blkptr_t blk; - ddt_phys_t *ddp = dde.dde_phys; - - if (ddb.ddb_class == DDT_CLASS_UNIQUE) - return; - - ASSERT(ddt_phys_total_refcnt(&dde) > 1); - ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; - VERIFY(ddt); - - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) - continue; - ddt_bp_create(ddb.ddb_checksum, - &dde.dde_key, ddp, &blk); - if (p == DDT_PHYS_DITTO) { - zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); - } else { - zcb->zcb_dedup_asize += - BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); - zcb->zcb_dedup_blocks++; - } - } - - ddt_enter(ddt); - VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); - ddt_exit(ddt); - } - - ASSERT(error == ENOENT); -} - typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; @@ -6546,10 +6634,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - zdb_ddt_leak_init(spa, zcb); - spa_config_exit(spa, SCL_CONFIG, FTAG); } static boolean_t @@ -6814,6 +6898,8 @@ dump_block_stats(spa_t *spa) int e, c, err; bp_embedded_type_t i; + ddt_prefetch_all(spa); + zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { @@ -6938,7 +7024,6 @@ dump_block_stats(spa_t *spa) (u_longlong_t)total_alloc, (dump_opt['L']) ? 
"unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); - leaks = B_TRUE; } if (tzb->zb_count == 0) { @@ -8022,16 +8107,21 @@ dump_mos_leaks(spa_t *spa) mos_leak_vdev(spa->spa_root_vdev); - for (uint64_t class = 0; class < DDT_CLASSES; class++) { - for (uint64_t type = 0; type < DDT_TYPES; type++) { - for (uint64_t cksum = 0; - cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { - ddt_t *ddt = spa->spa_ddt[cksum]; - if (!ddt) - continue; + for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + /* DDT store objects */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { mos_obj_refd(ddt->ddt_object[type][class]); } } + + /* FDT container */ + mos_obj_refd(ddt->ddt_dir_object); } if (spa->spa_brt != NULL) { @@ -9624,6 +9714,9 @@ main(int argc, char **argv) } fini: + if (spa != NULL) + zdb_ddt_cleanup(spa); + if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 66d59cebacde..11e09eef3bcc 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -39,6 +39,12 @@ extern "C" { struct abd; +/* + * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). + */ +#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */ +#define DDT_FLAG_MASK (DDT_FLAG_FLAT) + /* * DDT on-disk storage object types. Each one corresponds to specific * implementation, see ddt_ops_t. The value itself is not stored on disk. @@ -120,30 +126,80 @@ typedef struct { * characteristics of the stored block, such as its location on disk (DVAs), * birth txg and ref count. * - * Note that an entry has an array of four ddt_phys_t, one for each number of - * DVAs (copies= property) and another for additional "ditto" copies. Most - * users of ddt_phys_t will handle indexing into or counting the phys they - * want. + * The "traditional" entry has an array of four, one for each number of DVAs + * (copies= property) and another for additional "ditto" copies. Users of the + * traditional struct will specify the variant (index) of the one they want. + * + * The newer "flat" entry has only a single form that is specified using the + * DDT_PHYS_FLAT variant. + * + * Since the value size varies, use one of the size macros when interfacing + * with the ddt zap. */ -typedef struct { - dva_t ddp_dva[SPA_DVAS_PER_BP]; - uint64_t ddp_refcnt; - uint64_t ddp_phys_birth; -} ddt_phys_t; + +#define DDT_PHYS_MAX (4) /* - * Named indexes into the ddt_phys_t array in each entry. + * Note - this can be used in a flexible array and allocated for + * a specific size (ddp_trad or ddp_flat). So be careful not to + * copy using "=" assignment but instead use ddt_phys_copy(). + */ +typedef union { + /* + * Traditional physical payload value for DDT zap (256 bytes) + */ + struct { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; + } ddp_trad[DDT_PHYS_MAX]; + + /* + * Flat physical payload value for DDT zap (72 bytes) + */ + struct { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; /* txg based from BP */ + uint64_t ddp_class_start; /* in realtime seconds */ + } ddp_flat; +} ddt_univ_phys_t; + +/* + * This enum denotes which variant of a ddt_univ_phys_t to target. For + * a traditional DDT entry, it represents the indexes into the ddp_trad + * array. Any consumer of a ddt_univ_phys_t needs to know which variant + * is being targeted. 
* * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, * we maintain the ability to free existing dedup-ditto blocks. */ -enum ddt_phys_type { + +typedef enum { DDT_PHYS_DITTO = 0, DDT_PHYS_SINGLE = 1, DDT_PHYS_DOUBLE = 2, DDT_PHYS_TRIPLE = 3, - DDT_PHYS_TYPES -}; + DDT_PHYS_FLAT = 4, + DDT_PHYS_NONE = 5 +} ddt_phys_variant_t; + +#define DDT_PHYS_VARIANT(ddt, p) \ + (ASSERT((p) < DDT_PHYS_NONE), \ + ((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p))) + +#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad) +#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat) + +#define _DDT_PHYS_SWITCH(ddt, flat, trad) \ + (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad)) + +#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \ + DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE) + +#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX) +#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p) +#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0)) /* * A "live" entry, holding changes to an entry made this txg, and other data to @@ -154,16 +210,25 @@ enum ddt_phys_type { #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ +/* + * Additional data to support entry update or repair. This is fixed size + * because its relatively rarely used. + */ typedef struct { - /* key must be first for ddt_key_compare */ - ddt_key_t dde_key; /* ddt_tree key */ - ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */ + /* copy of data after a repair read, to be rewritten */ + abd_t *dde_repair_abd; + + /* original phys contents before update, for error handling */ + ddt_univ_phys_t dde_orig_phys; /* in-flight update IOs */ - zio_t *dde_lead_zio[DDT_PHYS_TYPES]; + zio_t *dde_lead_zio[DDT_PHYS_MAX]; +} ddt_entry_io_t; - /* copy of data after a repair read, to be rewritten */ - struct abd *dde_repair_abd; +typedef struct { + /* key must be first for ddt_key_compare */ + ddt_key_t dde_key; /* ddt_tree key */ + avl_node_t dde_node; /* ddt_tree_node */ /* storage type and class the entry was loaded from */ ddt_type_t dde_type; @@ -173,9 +238,22 @@ typedef struct { kcondvar_t dde_cv; /* signaled when load completes */ uint64_t dde_waiters; /* count of waiters on dde_cv */ - avl_node_t dde_node; /* ddt_tree node */ + ddt_entry_io_t *dde_io; /* IO support, when required */ + + ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */ } ddt_entry_t; +/* + * A lightweight entry is for short-lived or transient uses, like iterating or + * inspecting, when you don't care where it came from. + */ +typedef struct { + ddt_key_t ddlwe_key; + ddt_type_t ddlwe_type; + ddt_class_t ddlwe_class; + ddt_univ_phys_t ddlwe_phys; +} ddt_lightweight_entry_t; + /* * In-core DDT object. This covers all entries and stats for a the whole pool * for a given checksum type. 
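The flat/traditional split above changes how callers touch an entry's payload: instead of indexing dde_phys directly, they loop over DDT_NPHYS(ddt) slots and translate each index through DDT_PHYS_VARIANT() before using the accessor functions declared later in this header. A minimal sketch of that pattern follows; the helper name is hypothetical and not part of this change.

/*
 * Illustrative sketch only: sum the per-slot reference counts of an entry
 * without caring whether the table uses the flat or the traditional layout.
 * On a flat table DDT_NPHYS() is 1 and the only variant is DDT_PHYS_FLAT;
 * on a traditional table it is DDT_PHYS_MAX and the variant is the index.
 */
static uint64_t
example_entry_refcnt(const ddt_t *ddt, const ddt_entry_t *dde)
{
	uint64_t refcnt = 0;

	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

		/* A zero birth txg means this slot was never filled. */
		if (ddt_phys_birth(dde->dde_phys, v) == 0)
			continue;

		refcnt += ddt_phys_refcnt(dde->dde_phys, v);
	}

	return (refcnt);
}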
@@ -185,11 +263,15 @@ typedef struct { avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ - avl_tree_t ddt_repair_tree; /* entries being repaired */ + avl_tree_t ddt_repair_tree; /* entries being repaired */ + + enum zio_checksum ddt_checksum; /* checksum algorithm in use */ + spa_t *ddt_spa; /* pool this ddt is on */ + objset_t *ddt_os; /* ddt objset (always MOS) */ - enum zio_checksum ddt_checksum; /* checksum algorithm in use */ - spa_t *ddt_spa; /* pool this ddt is on */ - objset_t *ddt_os; /* ddt objset (always MOS) */ + uint64_t ddt_dir_object; /* MOS dir holding ddt objects */ + uint64_t ddt_version; /* DDT version */ + uint64_t ddt_flags; /* FDT option flags */ /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; @@ -215,16 +297,26 @@ typedef struct { uint64_t ddb_cursor; } ddt_bookmark_t; -extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, - uint64_t txg); +extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + blkptr_t *bp, uint64_t txg); extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, - const ddt_phys_t *ddp, blkptr_t *bp); + const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp); -extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); -extern void ddt_phys_clear(ddt_phys_t *ddp); -extern void ddt_phys_addref(ddt_phys_t *ddp); -extern void ddt_phys_decref(ddt_phys_t *ddp); -extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); +extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + const blkptr_t *bp); +extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, + ddt_phys_variant_t v); +extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); +extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, + const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); +extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + boolean_t encrypted); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); @@ -243,7 +335,7 @@ extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); extern void ddt_fini(void); -extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_prefetch_all(spa_t *spa); @@ -251,6 +343,8 @@ extern void ddt_prefetch_all(spa_t *spa); extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp); +extern void ddt_alloc_entry_io(ddt_entry_t *dde); + extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); @@ -260,7 +354,8 @@ extern void ddt_create(spa_t *spa); extern int ddt_load(spa_t *spa); extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); -extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); +extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, 
+ ddt_lightweight_entry_t *ddlwe); extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 4aaab10c8737..c4e681fb117b 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -33,6 +33,23 @@ extern "C" { #endif +/* DDT version numbers */ +#define DDT_VERSION_LEGACY (0) +#define DDT_VERSION_FDT (1) + +/* Names of interesting objects in the DDT root dir */ +#define DDT_DIR_VERSION "version" +#define DDT_DIR_FLAGS "flags" + +/* Fill a lightweight entry from a live entry. */ +#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (dde)->dde_key; \ + (ddlwe)->ddlwe_type = (dde)->dde_type; \ + (ddlwe)->ddlwe_class = (dde)->dde_class; \ + memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ +} while (0) + /* * Ops vector to access a specific DDT object type. */ @@ -42,19 +59,19 @@ typedef struct { boolean_t prehash); int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); int (*ddt_op_lookup)(objset_t *os, uint64_t object, - const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + const ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_contains)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); int (*ddt_op_update)(objset_t *os, uint64_t object, - const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, + const ddt_key_t *ddk, const void *phys, size_t psize, dmu_tx_t *tx); int (*ddt_op_remove)(objset_t *os, uint64_t object, const ddt_key_t *ddk, dmu_tx_t *tx); int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, - ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); } ddt_ops_t; @@ -74,7 +91,7 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); */ #define DDT_NAMELEN 32 -extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); +extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); @@ -83,7 +100,7 @@ extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, char *name); extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, - uint64_t *walk, ddt_entry_t *dde); + uint64_t *walk, ddt_lightweight_entry_t *ddlwe); extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, uint64_t *count); extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 1376cbef763c..5b80dc315945 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -376,6 +376,7 @@ typedef struct dmu_buf { #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" +#define DMU_POOL_DDT_DIR "DDT-%s" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_ERRORSCRUB "error_scrub" diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index f32f59a2bedf..63734dbc176f 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); 
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx); + ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, diff --git a/include/sys/spa.h b/include/sys/spa.h index 3998f5a6de73..a70912335b16 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -572,7 +572,7 @@ typedef struct blkptr { #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) -#define BP_ZERO(bp) \ +#define BP_ZERO_DVAS(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ @@ -580,6 +580,11 @@ typedef struct blkptr { (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ +} + +#define BP_ZERO(bp) \ +{ \ + BP_ZERO_DVAS(bp); \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 2515ba321759..5733a8187a95 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -82,6 +82,7 @@ typedef enum spa_feature { SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_RAIDZ_EXPANSION, + SPA_FEATURE_FAST_DEDUP, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 51c8dc9647ee..88baa4168c31 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -616,7 +616,7 @@ - + @@ -6006,7 +6006,8 @@ - + + @@ -9131,8 +9132,8 @@ - - + + @@ -9209,7 +9210,7 @@ - + diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index ea3c68dc6083..ff6e485a4819 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -17,8 +17,9 @@ .\" Copyright (c) 2019, Klara Inc. .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley +.\" Copyright (c) 2023, Klara Inc. .\" -.Dd June 23, 2022 +.Dd February 14, 2024 .Dt ZPOOL-FEATURES 7 .Os . @@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . +.feature com.klarasystems fast_dedup yes +This feature allows more advanced deduplication features to be enabled on new +dedup tables. +.Pp +This feature will be +.Sy active +when the first deduplicated block is written after a new dedup table is created +(ie after a new pool creation, or new checksum used on a dataset with +.Sy dedup +enabled). +It will be returned to the +.Sy enabled +state when all deduplicated blocks using it are freed. +. .feature com.delphix extensible_dataset no This feature allows more flexible use of internal ZFS data structures, and exists for other features to depend on. 
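In code terms, the activation rule described in the man-page entry above comes down to a feature check when a dedup table is first configured: a new table picks the FDT layout only if fast_dedup is enabled, and the feature is bumped to active when that table's directory object is created. A minimal sketch of the version decision, using names introduced by this change (the helper itself is hypothetical):

static uint64_t
example_select_ddt_version(spa_t *spa)
{
	/*
	 * New dedup tables use the FDT on-disk layout only when the
	 * com.klarasystems:fast_dedup feature is enabled; otherwise a
	 * legacy-format table is created.
	 */
	if (spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP))
		return (DDT_VERSION_FDT);

	return (DDT_VERSION_LEGACY);
}

The matching spa_feature_incr()/spa_feature_decr() calls in ddt_create_dir() and ddt_destroy_dir() later in this diff are what move the feature between the enabled and active states described above.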
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 309d9bf14cd4..8dec5f27b0af 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -754,6 +754,12 @@ zpool_feature_init(void) "Support for raidz expansion", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfeature_register(SPA_FEATURE_FAST_DEDUP, + "com.klarasystems:fast_dedup", "fast_dedup", + "Support for advanced deduplication", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, + sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index d70ae1a031d5..59526394bd07 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -39,6 +39,7 @@ #include #include #include +#include /* * # DDT: Deduplication tables @@ -74,12 +75,19 @@ * fill the BP with the DVAs from the entry, increment the refcount and cause * the write IO to return immediately. * - * Each ddt_phys_t slot in the entry represents a separate dedup block for the - * same content/checksum. The slot is selected based on the zp_copies parameter - * the block is written with, that is, the number of DVAs in the block. The - * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto" - * feature. These are no longer written, and will be freed if encountered on - * old pools. + * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup + * block for the same content/checksum. The slot is selected based on the + * zp_copies parameter the block is written with, that is, the number of DVAs + * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for + * now-removed "dedupditto" feature. These are no longer written, and will be + * freed if encountered on old pools. + * + * If the "fast_dedup" feature is enabled, new dedup tables will be created + * with the "flat phys" option. In this mode, there is only one ddt_phys_t + * slot. If a write is issued for an entry that exists, but has fewer DVAs, + * then only as many new DVAs are allocated and written to make up the + * shortfall. The existing entry is then extended (ddt_phys_extend()) with the + * new DVAs. * * ## Lifetime of an entry * @@ -129,6 +137,16 @@ * from the alternate block. If the block is actually damaged, this will invoke * the pool's "self-healing" mechanism, and repair the block. * + * If the "fast_dedup" feature is enabled, the "flat phys" option will be in + * use, so there is only ever one ddt_phys_t slot. The repair process will + * still happen in this case, though it is unlikely to succeed as there will + * usually be no other equivalent blocks to fall back on (though there might + * be, if this was an early version of a dedup'd block that has since been + * extended). + * + * Note that this repair mechanism is in addition to and separate from the + * regular OpenZFS scrub and self-healing mechanisms. 
+ * * ## Scanning (scrub/resilver) * * If dedup is active, the scrub machinery will walk the dedup table first, and @@ -161,7 +179,15 @@ c == ZIO_CHECKSUM_BLAKE3) static kmem_cache_t *ddt_cache; -static kmem_cache_t *ddt_entry_cache; + +static kmem_cache_t *ddt_entry_flat_cache; +static kmem_cache_t *ddt_entry_trad_cache; + +#define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE) +#define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE) + +#define DDT_ENTRY_SIZE(ddt) \ + _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE) /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. @@ -185,6 +211,18 @@ static const char *const ddt_class_name[DDT_CLASSES] = { "unique", }; +/* + * DDT feature flags automatically enabled for each on-disk version. Note that + * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled. + */ +static const uint64_t ddt_version_flags[] = { + [DDT_VERSION_LEGACY] = 0, + [DDT_VERSION_FDT] = DDT_FLAG_FLAT, +}; + +/* Dummy version to signal that configure is still necessary */ +#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) + static void ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) @@ -196,14 +234,18 @@ ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ZCHECKSUM_FLAG_DEDUP; char name[DDT_NAMELEN]; + ASSERT3U(ddt->ddt_dir_object, >, 0); + ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, ==, 0); VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); ASSERT3U(*objectp, !=, 0); - VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx)); + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + + VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, + objectp, tx)); VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), @@ -220,13 +262,15 @@ ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t count; char name[DDT_NAMELEN]; + ASSERT3U(ddt->ddt_dir_object, >, 0); + ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, !=, 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); VERIFY0(ddt_object_count(ddt, type, class, &count)); VERIFY0(count); - VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx)); VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); @@ -243,9 +287,18 @@ ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) char name[DDT_NAMELEN]; int error; + if (ddt->ddt_dir_object == 0) { + /* + * If we're configured but the containing dir doesn't exist + * yet, then this object can't possibly exist either. 
+ */ + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + return (SET_ERROR(ENOENT)); + } + ddt_object_name(ddt, type, class, name); - error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); if (error != 0) return (error); @@ -315,7 +368,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], &dde->dde_key, - dde->dde_phys, sizeof (dde->dde_phys))); + dde->dde_phys, DDT_PHYS_SIZE(ddt))); } static int @@ -357,8 +410,8 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, - sizeof (dde->dde_phys), tx)); + ddt->ddt_object[type][class], &dde->dde_key, + dde->dde_phys, DDT_PHYS_SIZE(ddt), tx)); } static int @@ -373,13 +426,19 @@ ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, - uint64_t *walk, ddt_entry_t *dde) + uint64_t *walk, ddt_lightweight_entry_t *ddlwe) { ASSERT(ddt_object_exists(ddt, type, class)); - return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, - ddt->ddt_object[type][class], walk, &dde->dde_key, - dde->dde_phys, sizeof (dde->dde_phys))); + int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key, + &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); + if (error == 0) { + ddlwe->ddlwe_type = type; + ddlwe->ddlwe_class = class; + return (0); + } + return (error); } int @@ -413,13 +472,25 @@ ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, } void -ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + blkptr_t *bp, uint64_t txg) { ASSERT3U(txg, !=, 0); + ASSERT3U(v, <, DDT_PHYS_NONE); + uint64_t phys_birth; + const dva_t *dvap; + + if (v == DDT_PHYS_FLAT) { + phys_birth = ddp->ddp_flat.ddp_phys_birth; + dvap = ddp->ddp_flat.ddp_dva; + } else { + phys_birth = ddp->ddp_trad[v].ddp_phys_birth; + dvap = ddp->ddp_trad[v].ddp_dva; + } for (int d = 0; d < SPA_DVAS_PER_BP; d++) - bp->blk_dva[d] = ddp->ddp_dva[d]; - BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); + bp->blk_dva[d] = dvap[d]; + BP_SET_BIRTH(bp, txg, phys_birth); } /* @@ -427,13 +498,13 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) * will be missing the salt / IV required to do a full decrypting read. */ void -ddt_bp_create(enum zio_checksum checksum, - const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp) { BP_ZERO(bp); if (ddp != NULL) - ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v)); bp->blk_cksum = ddk->ddk_cksum; @@ -464,42 +535,101 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) } void -ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp) { - ASSERT0(ddp->ddp_phys_birth); + ASSERT3U(v, <, DDT_PHYS_NONE); + int bp_ndvas = BP_GET_NDVAS(bp); + int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ? + SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + dva_t *dvas = (v == DDT_PHYS_FLAT) ? 
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; + + int s = 0, d = 0; + while (s < bp_ndvas && d < ddp_max_dvas) { + if (DVA_IS_VALID(&dvas[d])) { + d++; + continue; + } + dvas[d] = bp->blk_dva[s]; + s++; d++; + } - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_GET_BIRTH(bp); + /* + * If the caller offered us more DVAs than we can fit, something has + * gone wrong in their accounting. zio_ddt_write() should never ask for + * more than we need. + */ + ASSERT3U(s, ==, bp_ndvas); + + if (BP_IS_ENCRYPTED(bp)) + dvas[2] = bp->blk_dva[2]; + + if (ddt_phys_birth(ddp, v) == 0) { + if (v == DDT_PHYS_FLAT) + ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp); + else + ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp); + } } void -ddt_phys_clear(ddt_phys_t *ddp) +ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, + ddt_phys_variant_t v) { - memset(ddp, 0, sizeof (*ddp)); + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + dst->ddp_flat = src->ddp_flat; + else + dst->ddp_trad[v] = src->ddp_trad[v]; } void -ddt_phys_addref(ddt_phys_t *ddp) +ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { - ddp->ddp_refcnt++; + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE); + else + memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX); } void -ddt_phys_decref(ddt_phys_t *ddp) +ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { - if (ddp) { - ASSERT3U(ddp->ddp_refcnt, >, 0); - ddp->ddp_refcnt--; - } + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + ddp->ddp_flat.ddp_refcnt++; + else + ddp->ddp_trad[v].ddp_refcnt++; +} + +uint64_t +ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + uint64_t *refcntp; + + if (v == DDT_PHYS_FLAT) + refcntp = &ddp->ddp_flat.ddp_refcnt; + else + refcntp = &ddp->ddp_trad[v].ddp_refcnt; + + ASSERT3U(*refcntp, >, 0); + (*refcntp)--; + return (*refcntp); } static void -ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp, + ddt_phys_variant_t v, uint64_t txg) { blkptr_t blk; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); /* * We clear the dedup bit so that zio_free() will actually free the @@ -507,30 +637,80 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) */ BP_SET_DEDUP(&blk, 0); - ddt_phys_clear(ddp); + ddt_phys_clear(ddp, v); zio_free(ddt->ddt_spa, txg, &blk); } -ddt_phys_t * -ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) +uint64_t +ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + return (ddp->ddp_flat.ddp_phys_birth); + else + return (ddp->ddp_trad[v].ddp_phys_birth); +} + +int +ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + boolean_t encrypted) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + const dva_t *dvas = (v == DDT_PHYS_FLAT) ? 
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; + + return (DVA_IS_VALID(&dvas[0]) + + DVA_IS_VALID(&dvas[1]) + + DVA_IS_VALID(&dvas[2]) * !encrypted); +} + +ddt_phys_variant_t +ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) { - ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; + const ddt_univ_phys_t *ddp = dde->dde_phys; - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) - return (ddp); + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) && + BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) { + return (DDT_PHYS_FLAT); + } + } else /* traditional phys */ { + for (int p = 0; p < DDT_PHYS_MAX; p++) { + if (DVA_EQUAL(BP_IDENTITY(bp), + &ddp->ddp_trad[p].ddp_dva[0]) && + BP_GET_BIRTH(bp) == + ddp->ddp_trad[p].ddp_phys_birth) { + return (p); + } + } } - return (NULL); + return (DDT_PHYS_NONE); } uint64_t -ddt_phys_total_refcnt(const ddt_entry_t *dde) +ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + return (ddp->ddp_flat.ddp_refcnt); + else + return (ddp->ddp_trad[v].ddp_refcnt); +} + +uint64_t +ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde) { uint64_t refcnt = 0; - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) - refcnt += dde->dde_phys[p].ddp_refcnt; + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + refcnt = dde->dde_phys->ddp_flat.ddp_refcnt; + } else { + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt; + } return (refcnt); } @@ -559,24 +739,33 @@ ddt_init(void) { ddt_cache = kmem_cache_create("ddt_cache", sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - ddt_entry_cache = kmem_cache_create("ddt_entry_cache", - sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache", + DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", + DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void ddt_fini(void) { - kmem_cache_destroy(ddt_entry_cache); + kmem_cache_destroy(ddt_entry_trad_cache); + kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); } static ddt_entry_t * -ddt_alloc(const ddt_key_t *ddk) +ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk) { ddt_entry_t *dde; - dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - memset(dde, 0, sizeof (ddt_entry_t)); + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP); + memset(dde, 0, DDT_ENTRY_FLAT_SIZE); + } else { + dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP); + memset(dde, 0, DDT_ENTRY_TRAD_SIZE); + } + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; @@ -584,17 +773,31 @@ ddt_alloc(const ddt_key_t *ddk) return (dde); } +void +ddt_alloc_entry_io(ddt_entry_t *dde) +{ + if (dde->dde_io != NULL) + return; + + dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP); +} + static void -ddt_free(ddt_entry_t *dde) +ddt_free(const ddt_t *ddt, ddt_entry_t *dde) { - for (int p = 0; p < DDT_PHYS_TYPES; p++) - ASSERT3P(dde->dde_lead_zio[p], ==, NULL); + if (dde->dde_io != NULL) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) + ASSERT3P(dde->dde_io->dde_lead_zio[p], ==, NULL); - if (dde->dde_repair_abd != NULL) - abd_free(dde->dde_repair_abd); + if (dde->dde_io->dde_repair_abd != NULL) + 
abd_free(dde->dde_io->dde_repair_abd); + + kmem_free(dde->dde_io, sizeof (ddt_entry_io_t)); + } cv_destroy(&dde->dde_cv); - kmem_cache_free(ddt_entry_cache, dde); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_entry_flat_cache : ddt_entry_trad_cache, dde); } void @@ -603,7 +806,7 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) ASSERT(MUTEX_HELD(&ddt->ddt_lock)); avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); + ddt_free(ddt, dde); } static boolean_t @@ -684,8 +887,10 @@ ddt_prefetch_all(spa_t *spa) } } +static int ddt_configure(ddt_t *ddt, boolean_t new); + ddt_entry_t * -ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +ddt_lookup(ddt_t *ddt, const blkptr_t *bp) { spa_t *spa = ddt->ddt_spa; ddt_key_t search; @@ -697,6 +902,15 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + if (ddt->ddt_version == DDT_VERSION_UNCONFIGURED) { + /* + * This is the first use of this DDT since the pool was + * created; finish getting it ready for use. + */ + VERIFY0(ddt_configure(ddt, B_TRUE)); + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + } + ddt_key_fill(&search, bp); /* Find an existing live entry */ @@ -720,7 +934,7 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) if (dde->dde_flags & DDE_FLAG_OVERQUOTA) { if (dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); + ddt_free(ddt, dde); } return (NULL); } @@ -728,12 +942,13 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) return (dde); } - /* Not found. */ - if (!add) - return (NULL); - /* Time to make a new entry. */ - dde = ddt_alloc(&search); + dde = ddt_alloc(ddt, &search); + + /* Record the time this class was created (used by ddt prune) */ + if (ddt->ddt_flags & DDT_FLAG_FLAT) + dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec(); + avl_insert(&ddt->ddt_tree, dde, where); /* @@ -770,7 +985,7 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) /* Over quota. If no one is waiting, clean up right now. 
*/ if (dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); + ddt_free(ddt, dde); return (NULL); } @@ -837,6 +1052,181 @@ ddt_key_compare(const void *x1, const void *x2) return (TREE_ISIGN(cmp)); } +/* Create the containing dir for this DDT and bump the feature count */ +static void +ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, ==, 0); + ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); + + char name[DDT_NAMELEN]; + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + ddt->ddt_dir_object = zap_create_link(ddt->ddt_os, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx); + + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION, + sizeof (uint64_t), 1, &ddt->ddt_version, tx)); + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS, + sizeof (uint64_t), 1, &ddt->ddt_flags, tx)); + + spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); +} + +/* Destroy the containing dir and deactivate the feature */ +static void +ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, !=, 0); + ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT); + ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); + + char name[DDT_NAMELEN]; + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ASSERT(!ddt_object_exists(ddt, type, class)); + } + } + + uint64_t count; + ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); + ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_VERSION)); + ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS)); + ASSERT3U(count, ==, 2); + + VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx)); + + ddt->ddt_dir_object = 0; + + spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); +} + +/* + * Determine, flags and on-disk layout from what's already stored. If there's + * nothing stored, then if new is false, returns ENOENT, and if true, selects + * based on pool config. + */ +static int +ddt_configure(ddt_t *ddt, boolean_t new) +{ + spa_t *spa = ddt->ddt_spa; + char name[DDT_NAMELEN]; + int error; + + ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE); + + boolean_t fdt_enabled = + spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP); + boolean_t fdt_active = + spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP); + + /* + * First, look for the global DDT stats object. If its not there, then + * there's never been a DDT written before ever, and we know we're + * starting from scratch. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + if (error != 0) { + if (error != ENOENT) + return (error); + goto not_found; + } + + if (fdt_active) { + /* + * Now look for a DDT directory. If it exists, then it has + * everything we need. 
+ */ + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, + &ddt->ddt_dir_object); + if (error == 0) { + ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_VERSION, sizeof (uint64_t), 1, + &ddt->ddt_version); + if (error != 0) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_FLAGS, sizeof (uint64_t), 1, + &ddt->ddt_flags); + if (error != 0) + return (error); + + if (ddt->ddt_version != DDT_VERSION_FDT) { + zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " + "unknown version %llu", spa_name(spa), + name, (u_longlong_t)ddt->ddt_version); + return (SET_ERROR(EINVAL)); + } + + if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) { + zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " + "version=%llu unknown flags %llx", + spa_name(spa), name, + (u_longlong_t)ddt->ddt_flags, + (u_longlong_t)ddt->ddt_version); + return (SET_ERROR(EINVAL)); + } + + return (0); + } + if (error != ENOENT) + return (error); + } + + /* Any object in the root indicates a traditional setup. */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ddt_object_name(ddt, type, class, name); + uint64_t obj; + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), + 1, &obj); + if (error == ENOENT) + continue; + if (error != 0) + return (error); + + ddt->ddt_version = DDT_VERSION_LEGACY; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; + + return (0); + } + } + +not_found: + if (!new) + return (SET_ERROR(ENOENT)); + + /* Nothing on disk, so set up for the best version we can */ + if (fdt_enabled) { + ddt->ddt_version = DDT_VERSION_FDT; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = 0; /* create on first use */ + } else { + ddt->ddt_version = DDT_VERSION_LEGACY; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; + } + + return (0); +} + static ddt_t * ddt_table_alloc(spa_t *spa, enum zio_checksum c) { @@ -853,6 +1243,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; return (ddt); } @@ -889,7 +1280,6 @@ ddt_load(spa_t *spa) error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object); - if (error) return (error == ENOENT ? 
0 : error); @@ -898,6 +1288,12 @@ ddt_load(spa_t *spa) continue; ddt_t *ddt = spa->spa_ddt[c]; + error = ddt_configure(ddt, B_FALSE); + if (error == ENOENT) + continue; + if (error != 0) + return (error); + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { @@ -912,10 +1308,11 @@ ddt_load(spa_t *spa) */ memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); - spa->spa_dedup_dspace = ~0ULL; - spa->spa_dedup_dsize = ~0ULL; } + spa->spa_dedup_dspace = ~0ULL; + spa->spa_dedup_dsize = ~0ULL; + return (0); } @@ -964,7 +1361,8 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) ddt_key_fill(&ddk, bp); - dde = ddt_alloc(&ddk); + dde = ddt_alloc(ddt, &ddk); + ddt_alloc_entry_io(dde); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { @@ -979,7 +1377,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) } } - memset(dde->dde_phys, 0, sizeof (dde->dde_phys)); + memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt)); return (dde); } @@ -991,11 +1389,12 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) ddt_enter(ddt); - if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && + if (dde->dde_io->dde_repair_abd != NULL && + spa_writeable(ddt->ddt_spa) && avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else - ddt_free(dde); + ddt_free(ddt, dde); ddt_exit(ddt); } @@ -1003,16 +1402,15 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) static void ddt_repair_entry_done(zio_t *zio) { + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *rdde = zio->io_private; - ddt_free(rdde); + ddt_free(ddt, rdde); } static void ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) { - ddt_phys_t *ddp = dde->dde_phys; - ddt_phys_t *rddp = rdde->dde_phys; ddt_key_t *ddk = &dde->dde_key; ddt_key_t *rddk = &rdde->dde_key; zio_t *zio; @@ -1021,15 +1419,31 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) zio = zio_null(rio, rio->io_spa, NULL, ddt_repair_entry_done, rdde, rio->io_flags); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth != rddp->ddp_phys_birth || - memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_univ_phys_t *rddp = rdde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(ddp, v); + const dva_t *dvas, *rdvas; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + dvas = ddp->ddp_flat.ddp_dva; + rdvas = rddp->ddp_flat.ddp_dva; + } else { + dvas = ddp->ddp_trad[p].ddp_dva; + rdvas = rddp->ddp_trad[p].ddp_dva; + } + + if (phys_birth == 0 || + phys_birth != ddt_phys_birth(rddp, v) || + memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP)) continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, - rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, - ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk), + NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + ZIO_DDT_CHILD_FLAGS(zio), NULL)); } zio_nowait(zio); @@ -1051,7 +1465,8 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) rdde_next = AVL_NEXT(t, rdde); avl_remove(&ddt->ddt_repair_tree, rdde); ddt_exit(ddt); - ddt_bp_create(ddt->ddt_checksum, 
&rdde->dde_key, NULL, &blk); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, + DDT_PHYS_NONE, &blk); dde = ddt_repair_start(ddt, &blk); ddt_repair_entry(ddt, dde, rdde, rio); ddt_repair_done(ddt, dde); @@ -1064,7 +1479,6 @@ static void ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) { dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; - ddt_phys_t *ddp = dde->dde_phys; ddt_key_t *ddk = &dde->dde_key; ddt_type_t otype = dde->dde_type; ddt_type_t ntype = DDT_TYPE_DEFAULT; @@ -1074,27 +1488,30 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ASSERT(dde->dde_flags & DDE_FLAG_LOADED); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - ASSERT3P(dde->dde_lead_zio[p], ==, NULL); - if (ddp->ddp_phys_birth == 0) { - ASSERT0(ddp->ddp_refcnt); + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ASSERT(dde->dde_io == NULL || + dde->dde_io->dde_lead_zio[p] == NULL); + ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); + + if (ddt_phys_birth(ddp, v) == 0) { + ASSERT0(phys_refcnt); continue; } - if (p == DDT_PHYS_DITTO) { + if (DDT_PHYS_IS_DITTO(ddt, p)) { /* * Note, we no longer create DDT-DITTO blocks, but we * don't want to leak any written by older software. */ - ddt_phys_free(ddt, ddk, ddp, txg); + ddt_phys_free(ddt, ddk, ddp, v, txg); continue; } - if (ddp->ddp_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, txg); - total_refcnt += ddp->ddp_refcnt; + if (phys_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, v, txg); + total_refcnt += phys_refcnt; } - /* We do not create new DDT-DITTO blocks. */ - ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth); if (total_refcnt > 1) nclass = DDT_CLASS_DUPLICATE; else @@ -1123,8 +1540,10 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) * traversing.) */ if (nclass < oclass) { + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, dde, tx); + ddt->ddt_checksum, ddt, &ddlwe, tx); } } } @@ -1147,25 +1566,44 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) DMU_POOL_DDT_STATS, tx); } + if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) + ddt_create_dir(ddt, tx); + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { ddt_sync_entry(ddt, dde, tx, txg); - ddt_free(dde); + ddt_free(ddt, dde); } + uint64_t count = 0; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { - uint64_t add, count = 0; + uint64_t add, tcount = 0; for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); VERIFY0(ddt_object_count(ddt, type, class, &add)); - count += add; + tcount += add; } } for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (count == 0 && ddt_object_exists(ddt, type, class)) + if (tcount == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } + count += tcount; + } + + if (count == 0) { + /* + * No entries left on the DDT, so reset the version for next + * time. This allows us to handle the feature being changed + * since the DDT was originally created. New entries should get + * whatever the feature currently demands. 
+ */ + if (ddt->ddt_version == DDT_VERSION_FDT) + ddt_destroy_dir(ddt, tx); + + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt->ddt_flags = 0; } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, @@ -1212,7 +1650,7 @@ ddt_sync(spa_t *spa, uint64_t txg) } int -ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) { do { do { @@ -1225,10 +1663,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) ddb->ddb_class)) { error = ddt_object_walk(ddt, ddb->ddb_type, ddb->ddb_class, - &ddb->ddb_cursor, dde); + &ddb->ddb_cursor, ddlwe); } - dde->dde_type = ddb->ddb_type; - dde->dde_class = ddb->ddb_class; if (error == 0) return (0); if (error != ENOENT) @@ -1262,7 +1698,7 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ddt = ddt_select(spa, bp); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + dde = ddt_lookup(ddt, bp); /* Can be NULL if the entry for this block was pruned. */ if (dde == NULL) { @@ -1272,11 +1708,10 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) } if (dde->dde_type < DDT_TYPES) { - ddt_phys_t *ddp; - ASSERT3S(dde->dde_class, <, DDT_CLASSES); - ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; + int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); /* * This entry already existed (dde_type is real), so it must @@ -1288,9 +1723,9 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) * likely further action is required to fill out the DDT entry, * and this is a place that is likely to be missed in testing. */ - ASSERT3U(ddp->ddp_refcnt, >, 0); + ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0); - ddt_phys_addref(ddp); + ddt_phys_addref(dde->dde_phys, v); result = B_TRUE; } else { /* diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 82b682019ae9..6da77bbca5cb 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -36,24 +36,29 @@ static void ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) { spa_t *spa = ddt->ddt_spa; - ddt_phys_t *ddp = dde->dde_phys; ddt_key_t *ddk = &dde->dde_key; uint64_t lsize = DDK_GET_LSIZE(ddk); uint64_t psize = DDK_GET_PSIZE(ddk); memset(dds, 0, sizeof (*dds)); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - uint64_t dsize = 0; - uint64_t refcnt = ddp->ddp_refcnt; + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + const ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); - if (ddp->ddp_phys_birth == 0) + if (ddt_phys_birth(ddp, v) == 0) continue; - int ndvas = DDK_GET_CRYPT(&dde->dde_key) ? - SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + int ndvas = ddt_phys_dva_count(ddp, v, + DDK_GET_CRYPT(&dde->dde_key)); + const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? + ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva; + + uint64_t dsize = 0; for (int d = 0; d < ndvas; d++) - dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + dsize += dva_get_dsize_sync(spa, &dvas[d]); + + uint64_t refcnt = ddt_phys_refcnt(ddp, v); dds->dds_blocks += 1; dds->dds_lsize += lsize; diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 7ce7461a2b25..4e01624f3684 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018 by Delphix. All rights reserved. + * Copyright (c) 2023, Klara Inc. 
*/ #include @@ -108,7 +109,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) static int ddt_zap_lookup(objset_t *os, uint64_t object, - const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize) + const ddt_key_t *ddk, void *phys, size_t psize) { uchar_t *cbuf; uint64_t one, csize; @@ -155,7 +156,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object) static int ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, - const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) + const void *phys, size_t psize, dmu_tx_t *tx) { const size_t cbuf_size = psize + 1; @@ -181,7 +182,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk, static int ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, - ddt_phys_t *phys, size_t psize) + void *phys, size_t psize) { zap_cursor_t zc; zap_attribute_t za; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 085cfd3c5691..daf1bd5d637b 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2929,11 +2929,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx) + ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { (void) tx; - const ddt_key_t *ddk = &dde->dde_key; - ddt_phys_t *ddp = dde->dde_phys; + const ddt_key_t *ddk = &ddlwe->ddlwe_key; blkptr_t bp; zbookmark_phys_t zb = { 0 }; @@ -2954,11 +2953,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, if (scn->scn_done_txg != 0) return; - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v); + + if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg) continue; - ddt_bp_create(checksum, ddk, ddp, &bp); + ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); @@ -3002,11 +3003,11 @@ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; - ddt_entry_t dde = {{{{0}}}}; + ddt_lightweight_entry_t ddlwe = {0}; int error; uint64_t n = 0; - while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { + while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) { ddt_t *ddt; if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) @@ -3021,7 +3022,7 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); - dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx); n++; if (dsl_scan_check_suspend(scn, NULL)) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 6d08d4bd1633..1f3acb9b921e 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3254,17 +3254,21 @@ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; + ddt_t *ddt; ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); - ddp = ddt_phys_select(dde, bp); - if (zio->io_error == 0) - ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + ddt = ddt_select(zio->io_spa, bp); - if (zio->io_error == 0 && dde->dde_repair_abd == NULL) - dde->dde_repair_abd = zio->io_abd; + if (zio->io_error == 0) { + ddt_phys_variant_t 
v = ddt_phys_select(ddt, dde, bp); + /* this phys variant doesn't need repair */ + ddt_phys_clear(dde->dde_phys, v); + } + + if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL) + dde->dde_io->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); @@ -3282,21 +3286,25 @@ zio_ddt_read_start(zio_t *zio) if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); - ddt_phys_t *ddp = dde->dde_phys; - ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); + ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp); + ddt_univ_phys_t *ddp = dde->dde_phys; blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; - if (ddp_self == NULL) + if (v_self == DDT_PHYS_NONE) return (zio); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) + /* issue I/O for the other copies */ + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0 || v == v_self) continue; - ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, - &blk); + + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, + ddp, v, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, @@ -3338,8 +3346,8 @@ zio_ddt_read_done(zio_t *zio) zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } - if (dde->dde_repair_abd != NULL) { - abd_copy(zio->io_abd, dde->dde_repair_abd, + if (dde->dde_io->dde_repair_abd != NULL) { + abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } @@ -3372,28 +3380,36 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) * loaded). 
*/ - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { - zio_t *lio = dde->dde_lead_zio[p]; + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + if (DDT_PHYS_IS_DITTO(ddt, p)) + continue; + + if (dde->dde_io == NULL) + continue; - if (lio != NULL && do_raw) { + zio_t *lio = dde->dde_io->dde_lead_zio[p]; + if (lio == NULL) + continue; + + if (do_raw) return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); - } else if (lio != NULL) { - return (lio->io_orig_size != zio->io_orig_size || - abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); - } + + return (lio->io_orig_size != zio->io_orig_size || + abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v); - if (ddp->ddp_phys_birth != 0 && do_raw) { + if (phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) @@ -3416,13 +3432,13 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) abd_free(tmpabd); ddt_enter(ddt); return (error != 0); - } else if (ddp->ddp_phys_birth != 0) { + } else if (phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); @@ -3450,50 +3466,87 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) } static void -zio_ddt_child_write_ready(zio_t *zio) +zio_ddt_child_write_done(zio_t *zio) { - int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; - zio_t *pio; - if (zio->io_error) - return; + zio_link_t *zl = NULL; + ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); + + int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + ddt_univ_phys_t *ddp = dde->dde_phys; ddt_enter(ddt); - ASSERT(dde->dde_lead_zio[p] == zio); + /* we're the lead, so once we're done there's no one else outstanding */ + if (dde->dde_io->dde_lead_zio[p] == zio) + dde->dde_io->dde_lead_zio[p] = NULL; - ddt_phys_fill(ddp, zio->io_bp); + ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys; - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(zio, &zl)) != NULL) - ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); + if (zio->io_error != 0) { + /* + * The write failed, so we're about to abort the entire IO + * chain. We need to revert the entry back to what it was at + * the last time it was successfully extended. + */ + ddt_phys_copy(ddp, orig, v); + ddt_phys_clear(orig, v); + + ddt_exit(ddt); + return; + } + + /* + * We've successfully added new DVAs to the entry. Clear the saved + * state or, if there's still outstanding IO, remember it so we can + * revert to a known good state if that IO fails. + */ + if (dde->dde_io->dde_lead_zio[p] == NULL) + ddt_phys_clear(orig, v); + else + ddt_phys_copy(orig, ddp, v); + + /* + * Add references for all dedup writes that were waiting on the + * physical one, skipping any other physical writes that are waiting. 
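+ * (Parents carrying ZIO_FLAG_DDT_CHILD are the other physical writes + * in the chain; each remaining parent is a logical dedup write and + * takes one reference here.)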
+ */ + zio_t *pio; + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) + ddt_phys_addref(ddp, v); + } ddt_exit(ddt); } static void -zio_ddt_child_write_done(zio_t *zio) +zio_ddt_child_write_ready(zio_t *zio) { - int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; + + zio_link_t *zl = NULL; + ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); + + int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (zio->io_error != 0) + return; ddt_enter(ddt); - ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_lead_zio[p] == zio); - dde->dde_lead_zio[p] = NULL; + ddt_phys_extend(dde->dde_phys, v, zio->io_bp); - if (zio->io_error == 0) { - zio_link_t *zl = NULL; - while (zio_walk_parents(zio, &zl) != NULL) - ddt_phys_addref(ddp); - } else { - ddt_phys_clear(ddp); + zio_t *pio; + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) + ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg); } ddt_exit(ddt); @@ -3506,11 +3559,8 @@ zio_ddt_write(zio_t *zio) blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; - int p = zp->zp_copies; - zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; - ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); @@ -3518,7 +3568,7 @@ zio_ddt_write(zio_t *zio) ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + dde = ddt_lookup(ddt, bp); if (dde == NULL) { /* DDT size is over its quota so no new entries */ zp->zp_dedup = B_FALSE; @@ -3528,7 +3578,6 @@ zio_ddt_write(zio_t *zio) ddt_exit(ddt); return (zio); } - ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* @@ -3553,28 +3602,226 @@ zio_ddt_write(zio_t *zio) return (zio); } - if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { - if (ddp->ddp_phys_birth != 0) - ddt_bp_fill(ddp, bp, txg); - if (dde->dde_lead_zio[p] != NULL) - zio_add_child(zio, dde->dde_lead_zio[p]); - else - ddt_phys_addref(ddp); - } else if (zio->io_bp_override) { - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); - ASSERT(BP_EQUAL(bp, zio->io_bp_override)); - ddt_phys_fill(ddp, bp); - ddt_phys_addref(ddp); + int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + ddt_univ_phys_t *ddp = dde->dde_phys; + + /* + * In the common cases, at this point we have a regular BP with no + * allocated DVAs, and the corresponding DDT entry for its checksum. + * Our goal is to fill the BP with enough DVAs to satisfy its copies= + * requirement. + * + * One of three things needs to happen to fulfill this: + * + * - if the DDT entry has enough DVAs to satisfy the BP, we just copy + * them out of the entry and return; + * + * - if the DDT entry has no DVAs (i.e. it's brand new), then we have to + * issue the write as normal so that DVAs can be allocated and the + * data lands on disk. We then copy the DVAs into the DDT entry on + * return. + * + * - if the DDT entry has some DVAs, but too few, we have to issue the + * write, adjusted to allocate fewer copies. When it returns, we + * add the new DVAs to the DDT entry, and update the BP to have the + * full amount it originally requested.
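+ * + * For example, a copies=3 write that finds an entry holding a single + * DVA issues a child write for two more copies; when it completes, the + * entry holds three DVAs and the BP is filled from all of them.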
+ * + * In all cases, if there's already a writing IO in flight, we need to + * defer the action until after the write is done. If our action is to + * write, we need to adjust our request for additional DVAs to match + * what will be in the DDT entry after it completes. In this way every + * IO can be guaranteed to receive enough DVAs simply by joining the + * end of the chain and letting the sequence play out. + */ + + /* + * Number of DVAs in the DDT entry. If the BP is encrypted we ignore + * the third one as normal. + */ + int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); + IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); + + /* Number of DVAs requested by the IO. */ + uint8_t need_dvas = zp->zp_copies; + + /* + * What we do next depends on whether or not there's IO outstanding that + * will update this entry. + */ + if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) { + /* + * No IO outstanding, so we only need to worry about ourselves. + */ + + /* + * Override BPs bring their own DVAs and their own problems. + */ + if (zio->io_bp_override) { + /* + * For a brand-new entry, all the work has been done + * for us, and we can just fill it out from the provided + * block and leave. + */ + if (have_dvas == 0) { + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); + ASSERT(BP_EQUAL(bp, zio->io_bp_override)); + ddt_phys_extend(ddp, v, bp); + ddt_phys_addref(ddp, v); + ddt_exit(ddt); + return (zio); + } + + /* + * If we already have this entry, then we want to treat + * it like a regular write. To do this we just wipe + * out the override block's DVAs and proceed like a regular write. + * + * Even if there are some DVAs in the entry, we still + * have to clear them out. We can't use them to fill + * out the dedup entry, as they are all referenced + * together by a bp already on disk, and will be freed + * as a group. + */ + BP_ZERO_DVAS(bp); + BP_SET_BIRTH(bp, 0, 0); + } + + /* + * If there are enough DVAs in the entry to service our request, + * then we can just use them as-is. + */ + if (have_dvas >= need_dvas) { + ddt_bp_fill(ddp, v, bp, txg); + ddt_phys_addref(ddp, v); + ddt_exit(ddt); + return (zio); + } + + /* + * Otherwise, we have to issue IO to fill the entry up to the + * amount we need. + */ + need_dvas -= have_dvas; } else { + /* + * There's a write in-flight. If there are already enough DVAs on + * the entry, then either there were already enough to start + * with, or the in-flight IO is between READY and DONE, and so + * has extended the entry with new DVAs. Either way, we don't + * need to do anything, we can just slot in behind it. + */ + + if (zio->io_bp_override) { + /* + * If there's a write out, then we're soon going to + * have our own copies of this block, so clear out the + * override block and treat it as a regular dedup + * write. See comment above. + */ + BP_ZERO_DVAS(bp); + BP_SET_BIRTH(bp, 0, 0); + } + + if (have_dvas >= need_dvas) { + /* + * A minor point: there might already be enough + * committed DVAs in the entry to service our request, + * but we don't know which are completed and which are + * allocated but not yet written. In this case, should + * the IO for the new DVAs fail, we will be on the end + * of the IO chain and will also receive an error, even + * though our request could have been serviced.
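+ * (Concretely: we may need two DVAs, and the entry may show two + * committed DVAs plus a third still being written; if that third + * write fails, we fail with it even though our two copies were fine.)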
+ * + * This is an extremely rare case, as it requires the + * original block to be copied with a request for a + * larger number of DVAs, then copied again requesting + * the same (or already fulfilled) number of DVAs while + * the first request is active, and then that first + * request errors. In return, the logic required to + * catch and handle it is complex. For now, I'm just + * not going to bother with it. + */ - /* + * We always fill the bp here as we may have arrived + * after the in-flight write has passed READY, and so + * missed out. + */ + ddt_bp_fill(ddp, v, bp, txg); + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); + ddt_exit(ddt); + return (zio); + } + + /* + * There's not enough in the entry yet, so we need to look at + * the write in-flight and see how many DVAs it will have once + * it completes. + * + * The in-flight write has potentially had its copies request + * reduced (if we're filling out an existing entry), so we need + * to reach in and get the original write to find out what it is + * expecting. + * + * Note that the parent of the lead zio will always have the + * highest zp_copies of any zio in the chain, because ones that + * can be serviced without additional IO are always added to + * the back of the chain. + */ + zio_link_t *zl = NULL; + zio_t *pio = + zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl); + ASSERT(pio); + uint8_t parent_dvas = pio->io_prop.zp_copies; + + if (parent_dvas >= need_dvas) { + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); + ddt_exit(ddt); + return (zio); + } + + /* + * Still not enough, so we will need to issue IO to get the + * shortfall. + */ + need_dvas -= parent_dvas; + } + + /* + * We need to write. We will create a new write with the copies + * property adjusted to match the number of DVAs we need to + * grow the DDT entry by to satisfy the request. + */ + zio_prop_t czp = *zp; + czp.zp_copies = need_dvas; + zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, + zio->io_orig_size, zio->io_orig_size, &czp, + zio_ddt_child_write_ready, NULL, + zio_ddt_child_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); + + /* + * We are the new lead zio, because our parent has the highest + * zp_copies that has been requested for this entry so far. + */ + ddt_alloc_entry_io(dde); + if (dde->dde_io->dde_lead_zio[p] == NULL) { + /* + * First time out, take a copy of the stable entry to revert + * to if there's an error (see zio_ddt_child_write_done()) + */ + ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v); + } else { + /* + * Make the existing chain our child, because it cannot + * complete until we have.
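+ * (This preserves the ordering noted above: the parent of the lead + * zio is always the write with the highest zp_copies seen so far.)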
+ */ + zio_add_child(cio, dde->dde_io->dde_lead_zio[p]); } + dde->dde_io->dde_lead_zio[p] = cio; ddt_exit(ddt); @@ -3591,18 +3838,17 @@ zio_ddt_free(zio_t *zio) spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); - ddt_entry_t *dde; - ddt_phys_t *ddp; + ddt_entry_t *dde = NULL; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); - freedde = dde = ddt_lookup(ddt, bp, B_TRUE); + freedde = dde = ddt_lookup(ddt, bp); if (dde) { - ddp = ddt_phys_select(dde, bp); - if (ddp) - ddt_phys_decref(ddp); + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + if (v != DDT_PHYS_NONE) + ddt_phys_decref(dde->dde_phys, v); } ddt_exit(ddt); diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index e12d5498ccda..c3bceababa38 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -48,6 +48,10 @@ static unsigned long zio_decompress_fail_fraction = 0; /* * Compression vectors. + * + * NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS. + * THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE + * PART OF THE ON-DISK FORMAT. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"inherit", 0, NULL, NULL, NULL}, diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 326eb2a44d37..ad131664698b 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -672,7 +672,9 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_quota'] +tests = ['dedup_legacy_create', 'dedup_fdt_create', 'dedup_fdt_import', + 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', + 'dedup_legacy_fdt_mixed', 'dedup_quota'] pre = post = tags = ['functional', 'dedup'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 9dcb097e2b38..bbeabc6dfb42 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1424,6 +1424,12 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/deadman/deadman_zio.ksh \ functional/dedup/cleanup.ksh \ functional/dedup/setup.ksh \ + functional/dedup/dedup_fdt_create.ksh \ + functional/dedup/dedup_fdt_import.ksh \ + functional/dedup/dedup_legacy_create.ksh \ + functional/dedup/dedup_legacy_import.ksh \ + functional/dedup/dedup_legacy_fdt_upgrade.ksh \ + functional/dedup/dedup_legacy_fdt_mixed.ksh \ functional/dedup/dedup_quota.ksh \ functional/delegate/cleanup.ksh \ functional/delegate/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index e8a94ce209bc..50c1b7a9d09e 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -109,5 +109,6 @@ if is_linux || is_freebsd; then "feature@block_cloning" "feature@vdev_zaps_v2" "feature@raidz_expansion" + "feature@fast_dedup" ) fi diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh new file mode 100755 index 000000000000..83c4d7c8e2aa --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh @@ -0,0 +1,99 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License.
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Simple test of dedup table operations (FDT) + +. $STF_SUITE/include/libtest.shlib + +log_assert "basic dedup (FDT) operations work" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with fast dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=enabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with only one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +# copy the file +log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must zpool sync + +# now four entries in the duplicate table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" + +# now two DDT ZAPs in the container object; DDT ZAPs aren't cleaned up until +# the entire logical table is destroyed +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 2 + +# remove the files +log_must rm -f /$TESTPOOL/file* +log_must zpool sync + +# feature should move back to enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; containing object destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 0 + +log_pass "basic dedup (FDT) operations work" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh new file mode 100755 index 000000000000..f0f20671b95d --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Ensure dedup retains version after import (FDT) + +. $STF_SUITE/include/libtest.shlib + +log_assert "dedup (FDT) retains version after import" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with fast dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=enabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with only one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +# export and import the pool +zpool export $TESTPOOL +zpool import $TESTPOOL + +# feature still active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# remove the file +log_must rm -f /$TESTPOOL/file1 +log_must zpool sync + +# feature should revert to enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; containing object destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 0 + +# create a new file +log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=128k count=4 +log_must zpool sync + +# feature should be active again +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with only one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +log_pass "dedup (FDT) retains version after 
import" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh new file mode 100755 index 000000000000..e3efcf5c8b36 --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh @@ -0,0 +1,95 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Simple test of dedup table operations (legacy) + +. $STF_SUITE/include/libtest.shlib + +log_assert "basic dedup (legacy) operations work" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. 
this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +# copy the file +log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must zpool sync + +# now four entries in the duplicate table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" + +# now two DDT ZAPs in the MOS; DDT ZAPs aren't cleaned up until the entire +# logical table is destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 2 + +# remove the files +log_must rm -f /$TESTPOOL/file* +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; all DDT ZAPs removed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0 + +log_pass "basic dedup (legacy) operations work" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh new file mode 100755 index 000000000000..049ccaae3dca --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Check that a legacy dedup table and a new FDT table can coexist on the same +# pool after the fast_dedup feature is enabled + +. $STF_SUITE/include/libtest.shlib + +log_assert "legacy and FDT dedup tables on the same pool can happily coexist" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled.
we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# create two datasets, enabling a different dedup algorithm on each +log_must zfs create -o dedup=skein $TESTPOOL/ds1 +log_must zfs create -o dedup=blake3 $TESTPOOL/ds2 + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-skein" +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-blake3" + +# create a file in the first dataset +log_must dd if=/dev/urandom of=/$TESTPOOL/ds1/file1 bs=128k count=4 +log_must zpool sync + +# should be four entries in the skein unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-skein-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-.*-zap- | wc -l) -eq 1 + +# enable the fast_dedup feature +log_must zpool set feature@fast_dedup=enabled $TESTPOOL + +# confirm the feature is now enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# create a file in the second dataset +log_must dd if=/dev/urandom of=/$TESTPOOL/ds2/file1 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# now also four entries in the blake3 unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-blake3-zap-unique: 4 entries'" + +# two entries in the MOS: the legacy skein DDT ZAP, and the containing dir for +# the blake3 FDT table +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-.*-zap- | wc -l) -eq 1 +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | wc -l) -eq 1 + +# containing object has one ZAP inside +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | awk '{ print $NF }') +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-.*-zap- | wc -l) -eq 1 + +log_pass "legacy and FDT dedup tables on the same pool can happily coexist" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh new file mode 100755 index 000000000000..d563fade88af --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh @@ -0,0 +1,122 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc.
+# + +# Check legacy dedup table continues to work after pool upgrade to fast_dedup, +# but if deleted and recreated, the new table is FDT + +. $STF_SUITE/include/libtest.shlib + +log_assert "legacy dedup tables work after upgrade; new dedup tables created as FDT" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +# enable the fast_dedup feature +log_must zpool set feature@fast_dedup=enabled $TESTPOOL + +# confirm the feature is now enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# copy the file +log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must zpool sync + +# feature should still be enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# now four entries in the duplicate table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" + +# now two DDT ZAPs in the MOS; DDT ZAPs aren't cleaned up until the entire +# logical table is destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 2 + +# remove the files +log_must rm -f /$TESTPOOL/file* +log_must zpool sync + +# feature should still be enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; all DDT ZAPs removed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0 + +# create a new file +log_must dd if=/dev/urandom of=/$TESTPOOL/file3 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +log_pass "legacy dedup tables work after upgrade; new dedup tables created as FDT" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh new file mode 100755 index 
000000000000..a7b667eaf882 --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh @@ -0,0 +1,104 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Ensure dedup retains version after import (legacy) + +. $STF_SUITE/include/libtest.shlib + +log_assert "dedup (legacy) retains version after import" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. 
this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +# export and import the pool +zpool export $TESTPOOL +zpool import $TESTPOOL + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# remove the file +log_must rm -f /$TESTPOOL/file1 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; all DDT ZAPs removed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0 + +# create a new file +log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +log_pass "dedup (legacy) retains version after import" diff --git a/tests/zfs-tests/tests/functional/dedup/setup.ksh b/tests/zfs-tests/tests/functional/dedup/setup.ksh index 3c0830401f81..a21238879faf 100755 --- a/tests/zfs-tests/tests/functional/dedup/setup.ksh +++ b/tests/zfs-tests/tests/functional/dedup/setup.ksh @@ -25,7 +25,3 @@ # . $STF_SUITE/include/libtest.shlib - -DISK=${DISKS%% *} - -default_setup $DISK
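A quick way to check which on-disk dedup format a pool ended up with, sketched from the zdb checks the tests above rely on (assumes a sha256 dedup table and that $TESTPOOL names the pool; illustrative only, not part of the patch):

#!/bin/ksh
# Legacy tables keep their ZAPs directly in the MOS root object (1);
# FDT tables keep a per-checksum containing object there instead.
if zdb -dddd $TESTPOOL 1 | grep -q 'DDT-sha256-zap-'; then
	echo "legacy dedup table: ZAPs stored in the MOS root"
elif zdb -dddd $TESTPOOL 1 | grep -q 'DDT-sha256'; then
	echo "fast dedup table: FDT containing object in the MOS root"
else
	echo "no sha256 dedup table"
fi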