From 596597bddeb88c38450f0eafe85d57486985e166 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Tue, 20 Jun 2023 11:09:48 +1000
Subject: [PATCH] ddt: add "flat phys" feature

Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, each value of the copies= parameter). Each of
these is tracked independently of the others, and has its own set of
DVAs. This leads to an (admittedly rare) situation where you can create
as many as six copies of the data, by changing the copies= parameter
between writes of the same data. This wastes not only storage on disk,
but also space in the stored DDT entries, since no more than three DVAs
are ever needed to cover every possible value of copies=.

This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, and those are then saved
into the entry too.

Because a single phys is no longer all-or-nothing, but can be in
transition from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it if an
error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.

Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each copies=
count gets a whole phys, which always has either zero DVAs or all the
DVAs it needs, with no in-between, so the old behaviour falls naturally
out of the new code.

Signed-off-by: Rob Norris
Co-authored-by: Don Brady
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
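[Review note, not part of the patch] To make the behaviour described above
concrete, here is a small standalone sketch of the difference between the two
layouts. All names (sim_phys_t, trad_dvas_to_write(), flat_dvas_to_write(),
SIM_DVAS_PER_BP, SIM_PHYS_MAX) are illustrative stand-ins, not real OpenZFS
identifiers; only the shape of the decision -- one phys per copies= value
versus a single extensible phys -- is intended to match the patch.

#include <stdio.h>

#define	SIM_DVAS_PER_BP	3
#define	SIM_PHYS_MAX	4		/* ditto slot + copies 1..3 */

typedef struct {
	int dva_valid[SIM_DVAS_PER_BP];	/* 1 if the DVA slot is populated */
	int refcnt;
} sim_phys_t;

/* Traditional layout: one phys per copies= value, always all-or-nothing. */
static int
trad_dvas_to_write(const sim_phys_t trad[SIM_PHYS_MAX], int copies)
{
	const sim_phys_t *p = &trad[copies];	/* slot index == copies */
	int have = 0;

	for (int d = 0; d < SIM_DVAS_PER_BP; d++)
		have += p->dva_valid[d];

	/* The slot is either fully populated or untouched. */
	return (have > 0 ? 0 : copies);
}

/* Flat layout: a single phys that is extended up to the requested copies. */
static int
flat_dvas_to_write(const sim_phys_t *flat, int copies)
{
	int have = 0;

	for (int d = 0; d < SIM_DVAS_PER_BP; d++)
		have += flat->dva_valid[d];

	/* Only the shortfall is written; existing DVAs are reused. */
	return (copies > have ? copies - have : 0);
}

int
main(void)
{
	sim_phys_t trad[SIM_PHYS_MAX] = {{{0}}};
	sim_phys_t flat = {{0}};

	/* An earlier copies=1 write left one DVA behind in both schemes. */
	trad[1].dva_valid[0] = 1;
	trad[1].refcnt = 1;
	flat.dva_valid[0] = 1;
	flat.refcnt = 1;

	/* The same block is now written again with copies=3. */
	printf("traditional: %d new DVAs (the old single-DVA copy remains)\n",
	    trad_dvas_to_write(trad, 3));
	printf("flat:        %d new DVAs\n", flat_dvas_to_write(&flat, 3));

	return (0);
}

Built with any C99 compiler, this prints that the traditional layout writes a
full new set of three DVAs (leaving the earlier single-copy block in place),
while the flat layout only writes the two-DVA shortfall.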
--- cmd/zdb/zdb.c | 34 ++-- include/sys/ddt.h | 122 ++++++++++--- include/sys/ddt_impl.h | 20 +-- include/sys/dsl_scan.h | 2 +- include/sys/spa.h | 7 +- module/zfs/ddt.c | 328 ++++++++++++++++++++++++++--------- module/zfs/ddt_stats.c | 20 ++- module/zfs/ddt_zap.c | 6 +- module/zfs/dsl_scan.c | 14 +- module/zfs/zio.c | 380 +++++++++++++++++++++++++++++++++-------- 10 files changed, 709 insertions(+), 224 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 74b7e774cc59..567d096a763f 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1922,14 +1922,16 @@ dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, blkptr_t blk; int p; - for (p = 0; p < ddlwe->ddlwe_nphys; p++) { - const ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p]; - if (ddp->ddp_phys_birth == 0) + for (p = 0; p < DDT_NPHYS(ddt); p++) { + const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0) continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu phys %d %s\n", - (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, + (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v), p, blkbuf); } } @@ -5798,9 +5800,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, if (dde == NULL) { refcnt = 0; } else { - ddt_phys_t *ddp = ddt_phys_select(ddt, dde, bp); - ddt_phys_decref(ddp); - refcnt = ddp->ddp_refcnt; + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + refcnt = ddt_phys_decref(dde->dde_phys, v); if (ddt_phys_total_refcnt(ddt, dde) == 0) ddt_remove(ddt, dde); } @@ -6139,18 +6140,21 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) VERIFY(ddt); uint64_t refcnt = 0; - for (int p = 0; p < ddlwe.ddlwe_nphys; p++) { - ddt_phys_t *ddp = &ddlwe.ddlwe_phys[p]; - if (ddp->ddp_phys_birth == 0) + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_univ_phys_t *ddp = &ddlwe.ddlwe_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0) continue; - refcnt += ddp->ddp_refcnt; + refcnt += ddt_phys_refcnt(ddp, v); + ddt_bp_create(ddb.ddb_checksum, - &ddlwe.ddlwe_key, ddp, &blk); + &ddlwe.ddlwe_key, ddp, v, &blk); if (DDT_PHYS_IS_DITTO(ddt, p)) { zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); } else { - zcb->zcb_dedup_asize += - BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); + zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) * + (ddt_phys_refcnt(ddp, v) - 1); zcb->zcb_dedup_blocks++; } } diff --git a/include/sys/ddt.h b/include/sys/ddt.h index af1f8444090e..a54fa71999ca 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -42,8 +42,8 @@ struct abd; /* * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). */ -/* No flags yet. */ -#define DDT_FLAG_MASK (0) +#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */ +#define DDT_FLAG_MASK (DDT_FLAG_FLAT) /* * DDT on-disk storage object types. Each one corresponds to specific @@ -126,21 +126,80 @@ typedef struct { * characteristics of the stored block, such as its location on disk (DVAs), * birth txg and ref count. * - * Note that an entry has an array of four ddt_phys_t, one for each number of - * DVAs (copies= property) and another for additional "ditto" copies. Most - * users of ddt_phys_t will handle indexing into or counting the phys they - * want. 
+ * The "traditional" entry has an array of four, one for each number of DVAs + * (copies= property) and another for additional "ditto" copies. Users of the + * traditional struct will specify the variant (index) of the one they want. + * + * The newer "flat" entry has only a single form that is specified using the + * DDT_PHYS_FLAT variant. + * + * Since the value size varies, use one of the size macros when interfacing + * with the ddt zap. */ -typedef struct { - dva_t ddp_dva[SPA_DVAS_PER_BP]; - uint64_t ddp_refcnt; - uint64_t ddp_phys_birth; -} ddt_phys_t; -#define DDT_PHYS_MAX (4) -#define DDT_NPHYS(ddt) ((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX) -#define DDT_PHYS_IS_DITTO(ddt, p) ((ddt) && p == 0) -#define DDT_PHYS_FOR_COPIES(ddt, p) ((ddt) ? (p) : (p)) +#define DDT_PHYS_MAX (4) + +/* + * Note - this can be used in a flexible array and allocated for + * a specific size (ddp_trad or ddp_flat). So be careful not to + * copy using "=" assignment but instead use ddt_phys_copy(). + */ +typedef union { + /* + * Traditional physical payload value for DDT zap (256 bytes) + */ + struct { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; + } ddp_trad[DDT_PHYS_MAX]; + + /* + * Flat physical payload value for DDT zap (72 bytes) + */ + struct { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; /* txg based from BP */ + uint64_t ddp_class_start; /* in realtime seconds */ + } ddp_flat; +} ddt_univ_phys_t; + +/* + * This enum denotes which variant of a ddt_univ_phys_t to target. For + * a traditional DDT entry, it represents the indexes into the ddp_trad + * array. Any consumer of a ddt_univ_phys_t needs to know which variant + * is being targeted. + * + * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, + * we maintain the ability to free existing dedup-ditto blocks. + */ + +typedef enum { + DDT_PHYS_DITTO = 0, + DDT_PHYS_SINGLE = 1, + DDT_PHYS_DOUBLE = 2, + DDT_PHYS_TRIPLE = 3, + DDT_PHYS_FLAT = 4, + DDT_PHYS_NONE = 5 +} ddt_phys_variant_t; + +#define DDT_PHYS_VARIANT(ddt, p) \ + (ASSERT((p) < DDT_PHYS_NONE), \ + ((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p))) + +#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad) +#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat) + +#define _DDT_PHYS_SWITCH(ddt, flat, trad) \ + (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? 
(flat) : (trad)) + +#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \ + DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE) + +#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX) +#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p) +#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0)) /* * A "live" entry, holding changes to an entry made this txg, and other data to @@ -159,6 +218,9 @@ typedef struct { /* copy of data after a repair read, to be rewritten */ abd_t *dde_repair_abd; + /* original phys contents before update, for error handling */ + ddt_univ_phys_t dde_orig_phys; + /* in-flight update IOs */ zio_t *dde_lead_zio[DDT_PHYS_MAX]; } ddt_entry_io_t; @@ -178,7 +240,7 @@ typedef struct { ddt_entry_io_t *dde_io; /* IO support, when required */ - ddt_phys_t dde_phys[]; /* physical data */ + ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */ } ddt_entry_t; /* @@ -189,8 +251,7 @@ typedef struct { ddt_key_t ddlwe_key; ddt_type_t ddlwe_type; ddt_class_t ddlwe_class; - uint8_t ddlwe_nphys; - ddt_phys_t ddlwe_phys[DDT_PHYS_MAX]; + ddt_univ_phys_t ddlwe_phys; } ddt_lightweight_entry_t; /* @@ -236,17 +297,26 @@ typedef struct { uint64_t ddb_cursor; } ddt_bookmark_t; -extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, - uint64_t txg); +extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + blkptr_t *bp, uint64_t txg); extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, - const ddt_phys_t *ddp, blkptr_t *bp); + const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp); -extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); -extern void ddt_phys_clear(ddt_phys_t *ddp); -extern void ddt_phys_addref(ddt_phys_t *ddp); -extern void ddt_phys_decref(ddt_phys_t *ddp); -extern ddt_phys_t *ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, +extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp); +extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, + ddt_phys_variant_t v); +extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); +extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, + const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); +extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + boolean_t encrypted); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index e88a046ab8ae..c4e681fb117b 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -42,14 +42,12 @@ extern "C" { #define DDT_DIR_FLAGS "flags" /* Fill a lightweight entry from a live entry. 
*/ -#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ - memset((ddlwe), 0, sizeof (*ddlwe)); \ - (ddlwe)->ddlwe_key = (dde)->dde_key; \ - (ddlwe)->ddlwe_type = (dde)->dde_type; \ - (ddlwe)->ddlwe_class = (dde)->dde_class; \ - (ddlwe)->ddlwe_nphys = DDT_NPHYS(ddt); \ - for (int p = 0; p < (ddlwe)->ddlwe_nphys; p++) \ - (ddlwe)->ddlwe_phys[p] = (dde)->dde_phys[p]; \ +#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (dde)->dde_key; \ + (ddlwe)->ddlwe_type = (dde)->dde_type; \ + (ddlwe)->ddlwe_class = (dde)->dde_class; \ + memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ } while (0) /* @@ -61,19 +59,19 @@ typedef struct { boolean_t prehash); int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); int (*ddt_op_lookup)(objset_t *os, uint64_t object, - const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + const ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_contains)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); int (*ddt_op_update)(objset_t *os, uint64_t object, - const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, + const ddt_key_t *ddk, const void *phys, size_t psize, dmu_tx_t *tx); int (*ddt_op_remove)(objset_t *os, uint64_t object, const ddt_key_t *ddk, dmu_tx_t *tx); int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, - ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); } ddt_ops_t; diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index b91d7f4be88f..63734dbc176f 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); + ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, diff --git a/include/sys/spa.h b/include/sys/spa.h index 12321c6562be..80bf75ab22ac 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -572,7 +572,7 @@ typedef struct blkptr { #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) -#define BP_ZERO(bp) \ +#define BP_ZERO_DVAS(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ @@ -580,6 +580,11 @@ typedef struct blkptr { (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ +} + +#define BP_ZERO(bp) \ +{ \ + BP_ZERO_DVAS(bp); \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 4699ad0fb3e1..08d1f6bf2cef 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -75,12 +75,19 @@ * fill the BP with the DVAs from the entry, increment the refcount and cause * the write IO to return immediately. * - * Each ddt_phys_t slot in the entry represents a separate dedup block for the - * same content/checksum. 
The slot is selected based on the zp_copies parameter - * the block is written with, that is, the number of DVAs in the block. The - * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto" - * feature. These are no longer written, and will be freed if encountered on - * old pools. + * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup + * block for the same content/checksum. The slot is selected based on the + * zp_copies parameter the block is written with, that is, the number of DVAs + * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for + * now-removed "dedupditto" feature. These are no longer written, and will be + * freed if encountered on old pools. + * + * If the "fast_dedup" feature is enabled, new dedup tables will be created + * with the "flat phys" option. In this mode, there is only one ddt_phys_t + * slot. If a write is issued for an entry that exists, but has fewer DVAs, + * then only as many new DVAs are allocated and written to make up the + * shortfall. The existing entry is then extended (ddt_phys_extend()) with the + * new DVAs. * * ## Lifetime of an entry * @@ -130,6 +137,16 @@ * from the alternate block. If the block is actually damaged, this will invoke * the pool's "self-healing" mechanism, and repair the block. * + * If the "fast_dedup" feature is enabled, the "flat phys" option will be in + * use, so there is only ever one ddt_phys_t slot. The repair process will + * still happen in this case, though it is unlikely to succeed as there will + * usually be no other equivalent blocks to fall back on (though there might + * be, if this was an early version of a dedup'd block that has since been + * extended). + * + * Note that this repair mechanism is in addition to and separate from the + * regular OpenZFS scrub and self-healing mechanisms. + * * ## Scanning (scrub/resilver) * * If dedup is active, the scrub machinery will walk the dedup table first, and @@ -162,10 +179,15 @@ c == ZIO_CHECKSUM_BLAKE3) static kmem_cache_t *ddt_cache; -static kmem_cache_t *ddt_entry_cache; -#define DDT_ENTRY_SIZE \ - (sizeof (ddt_entry_t) + sizeof (ddt_phys_t) * DDT_PHYS_MAX) +static kmem_cache_t *ddt_entry_flat_cache; +static kmem_cache_t *ddt_entry_trad_cache; + +#define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE) +#define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE) + +#define DDT_ENTRY_SIZE(ddt) \ + _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE) /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 
@@ -195,7 +217,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = { */ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_LEGACY] = 0, - [DDT_VERSION_FDT] = 0, + [DDT_VERSION_FDT] = DDT_FLAG_FLAT, }; /* Dummy version to signal that configure is still necessary */ @@ -346,7 +368,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], &dde->dde_key, - dde->dde_phys, sizeof (ddt_phys_t) * DDT_NPHYS(ddt))); + dde->dde_phys, DDT_PHYS_SIZE(ddt))); } static int @@ -388,8 +410,8 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, - sizeof (ddt_phys_t) * DDT_NPHYS(ddt), tx)); + ddt->ddt_object[type][class], &dde->dde_key, + dde->dde_phys, DDT_PHYS_SIZE(ddt), tx)); } static int @@ -410,11 +432,10 @@ ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os, ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key, - ddlwe->ddlwe_phys, sizeof (ddlwe->ddlwe_phys)); + &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); if (error == 0) { ddlwe->ddlwe_type = type; ddlwe->ddlwe_class = class; - ddlwe->ddlwe_nphys = DDT_NPHYS(ddt); return (0); } return (error); @@ -451,13 +472,25 @@ ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, } void -ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + blkptr_t *bp, uint64_t txg) { ASSERT3U(txg, !=, 0); + ASSERT3U(v, <, DDT_PHYS_NONE); + uint64_t phys_birth; + const dva_t *dvap; + + if (v == DDT_PHYS_FLAT) { + phys_birth = ddp->ddp_flat.ddp_phys_birth; + dvap = ddp->ddp_flat.ddp_dva; + } else { + phys_birth = ddp->ddp_trad[v].ddp_phys_birth; + dvap = ddp->ddp_trad[v].ddp_dva; + } for (int d = 0; d < SPA_DVAS_PER_BP; d++) - bp->blk_dva[d] = ddp->ddp_dva[d]; - BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); + bp->blk_dva[d] = dvap[d]; + BP_SET_BIRTH(bp, txg, phys_birth); } /* @@ -465,13 +498,13 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) * will be missing the salt / IV required to do a full decrypting read. */ void -ddt_bp_create(enum zio_checksum checksum, - const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp) { BP_ZERO(bp); if (ddp != NULL) - ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v)); bp->blk_cksum = ddk->ddk_cksum; @@ -502,42 +535,101 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) } void -ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp) { - ASSERT0(ddp->ddp_phys_birth); + ASSERT3U(v, <, DDT_PHYS_NONE); + int bp_ndvas = BP_GET_NDVAS(bp); + int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ? + SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + dva_t *dvas = (v == DDT_PHYS_FLAT) ? 
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; + + int s = 0, d = 0; + while (s < bp_ndvas && d < ddp_max_dvas) { + if (DVA_IS_VALID(&dvas[d])) { + d++; + continue; + } + dvas[d] = bp->blk_dva[s]; + s++; d++; + } - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_GET_BIRTH(bp); + /* + * If the caller offered us more DVAs than we can fit, something has + * gone wrong in their accounting. zio_ddt_write() should never ask for + * more than we need. + */ + ASSERT3U(s, ==, bp_ndvas); + + if (BP_IS_ENCRYPTED(bp)) + dvas[2] = bp->blk_dva[2]; + + if (ddt_phys_birth(ddp, v) == 0) { + if (v == DDT_PHYS_FLAT) + ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp); + else + ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp); + } } void -ddt_phys_clear(ddt_phys_t *ddp) +ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, + ddt_phys_variant_t v) { - memset(ddp, 0, sizeof (*ddp)); + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + dst->ddp_flat = src->ddp_flat; + else + dst->ddp_trad[v] = src->ddp_trad[v]; } void -ddt_phys_addref(ddt_phys_t *ddp) +ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { - ddp->ddp_refcnt++; + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE); + else + memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX); } void -ddt_phys_decref(ddt_phys_t *ddp) +ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { - if (ddp) { - ASSERT3U(ddp->ddp_refcnt, >, 0); - ddp->ddp_refcnt--; - } + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + ddp->ddp_flat.ddp_refcnt++; + else + ddp->ddp_trad[v].ddp_refcnt++; +} + +uint64_t +ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + uint64_t *refcntp; + + if (v == DDT_PHYS_FLAT) + refcntp = &ddp->ddp_flat.ddp_refcnt; + else + refcntp = &ddp->ddp_trad[v].ddp_refcnt; + + ASSERT3U(*refcntp, >, 0); + (*refcntp)--; + return (*refcntp); } static void -ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp, + ddt_phys_variant_t v, uint64_t txg) { blkptr_t blk; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); /* * We clear the dedup bit so that zio_free() will actually free the @@ -545,20 +637,67 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) */ BP_SET_DEDUP(&blk, 0); - ddt_phys_clear(ddp); + ddt_phys_clear(ddp, v); zio_free(ddt->ddt_spa, txg, &blk); } -ddt_phys_t * +uint64_t +ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + return (ddp->ddp_flat.ddp_phys_birth); + else + return (ddp->ddp_trad[v].ddp_phys_birth); +} + +int +ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + boolean_t encrypted) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + const dva_t *dvas = (v == DDT_PHYS_FLAT) ? 
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; + + return (DVA_IS_VALID(&dvas[0]) + + DVA_IS_VALID(&dvas[1]) + + DVA_IS_VALID(&dvas[2]) * !encrypted); +} + +ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) { - for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = (ddt_phys_t *)&dde->dde_phys[p]; - if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) - return (ddp); + const ddt_univ_phys_t *ddp = dde->dde_phys; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) && + BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) { + return (DDT_PHYS_FLAT); + } + } else /* traditional phys */ { + for (int p = 0; p < DDT_PHYS_MAX; p++) { + if (DVA_EQUAL(BP_IDENTITY(bp), + &ddp->ddp_trad[p].ddp_dva[0]) && + BP_GET_BIRTH(bp) == + ddp->ddp_trad[p].ddp_phys_birth) { + return (p); + } + } } - return (NULL); + return (DDT_PHYS_NONE); +} + +uint64_t +ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + return (ddp->ddp_flat.ddp_refcnt); + else + return (ddp->ddp_trad[v].ddp_refcnt); } uint64_t @@ -566,10 +705,11 @@ ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde) { uint64_t refcnt = 0; - for (int p = 0; p < DDT_NPHYS(ddt); p++) { - if (DDT_PHYS_IS_DITTO(ddt, p)) - continue; - refcnt += dde->dde_phys[p].ddp_refcnt; + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + refcnt = dde->dde_phys->ddp_flat.ddp_refcnt; + } else { + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt; } return (refcnt); @@ -599,24 +739,33 @@ ddt_init(void) { ddt_cache = kmem_cache_create("ddt_cache", sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - ddt_entry_cache = kmem_cache_create("ddt_entry_cache", - DDT_ENTRY_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache", + DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", + DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void ddt_fini(void) { - kmem_cache_destroy(ddt_entry_cache); + kmem_cache_destroy(ddt_entry_trad_cache); + kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); } static ddt_entry_t * -ddt_alloc(const ddt_key_t *ddk) +ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk) { ddt_entry_t *dde; - dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - memset(dde, 0, DDT_ENTRY_SIZE); + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP); + memset(dde, 0, DDT_ENTRY_FLAT_SIZE); + } else { + dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP); + memset(dde, 0, DDT_ENTRY_TRAD_SIZE); + } + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; @@ -647,7 +796,8 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde) } cv_destroy(&dde->dde_cv); - kmem_cache_free(ddt_entry_cache, dde); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_entry_flat_cache : ddt_entry_trad_cache, dde); } void @@ -797,7 +947,12 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) return (NULL); /* Time to make a new entry. 
*/ - dde = ddt_alloc(&search); + dde = ddt_alloc(ddt, &search); + + /* Record the time this class was created (used by ddt prune) */ + if (ddt->ddt_flags & DDT_FLAG_FLAT) + dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec(); + avl_insert(&ddt->ddt_tree, dde, where); /* @@ -1210,7 +1365,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) ddt_key_fill(&ddk, bp); - dde = ddt_alloc(&ddk); + dde = ddt_alloc(ddt, &ddk); ddt_alloc_entry_io(dde); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { @@ -1226,7 +1381,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) } } - memset(dde->dde_phys, 0, sizeof (ddt_phys_t) * DDT_NPHYS(ddt)); + memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt)); return (dde); } @@ -1269,13 +1424,26 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) ddt_repair_entry_done, rdde, rio->io_flags); for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - ddt_phys_t *rddp = &rdde->dde_phys[p]; - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth != rddp->ddp_phys_birth || - memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_univ_phys_t *rddp = rdde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(ddp, v); + const dva_t *dvas, *rdvas; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + dvas = ddp->ddp_flat.ddp_dva; + rdvas = rddp->ddp_flat.ddp_dva; + } else { + dvas = ddp->ddp_trad[p].ddp_dva; + rdvas = rddp->ddp_trad[p].ddp_dva; + } + + if (phys_birth == 0 || + phys_birth != ddt_phys_birth(rddp, v) || + memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP)) continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, @@ -1301,7 +1469,8 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) rdde_next = AVL_NEXT(t, rdde); avl_remove(&ddt->ddt_repair_tree, rdde); ddt_exit(ddt); - ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, + DDT_PHYS_NONE, &blk); dde = ddt_repair_start(ddt, &blk); ddt_repair_entry(ddt, dde, rdde, rio); ddt_repair_done(ddt, dde); @@ -1326,9 +1495,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) for (int p = 0; p < DDT_NPHYS(ddt); p++) { ASSERT(dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL); - ddt_phys_t *ddp = &dde->dde_phys[p]; - if (ddp->ddp_phys_birth == 0) { - ASSERT0(ddp->ddp_refcnt); + ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); + + if (ddt_phys_birth(ddp, v) == 0) { + ASSERT0(phys_refcnt); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { @@ -1336,12 +1508,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) * Note, we no longer create DDT-DITTO blocks, but we * don't want to leak any written by older software. 
*/ - ddt_phys_free(ddt, ddk, ddp, txg); + ddt_phys_free(ddt, ddk, ddp, v, txg); continue; } - if (ddp->ddp_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, txg); - total_refcnt += ddp->ddp_refcnt; + if (phys_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, v, txg); + total_refcnt += phys_refcnt; } if (total_refcnt > 1) @@ -1375,7 +1547,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, &ddlwe, tx); + ddt->ddt_checksum, ddt, &ddlwe, tx); } } } @@ -1540,12 +1712,10 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) } if (dde->dde_type < DDT_TYPES) { - ddt_phys_t *ddp; - ASSERT3S(dde->dde_class, <, DDT_CLASSES); int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); - ddp = &dde->dde_phys[p]; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); /* * This entry already existed (dde_type is real), so it must @@ -1557,9 +1727,9 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) * likely further action is required to fill out the DDT entry, * and this is a place that is likely to be missed in testing. */ - ASSERT3U(ddp->ddp_refcnt, >, 0); + ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0); - ddt_phys_addref(ddp); + ddt_phys_addref(dde->dde_phys, v); result = B_TRUE; } else { /* diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 5449eca3afb1..6da77bbca5cb 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -43,18 +43,22 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) memset(dds, 0, sizeof (*dds)); for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; + const ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); - uint64_t dsize = 0; - uint64_t refcnt = ddp->ddp_refcnt; - - if (ddp->ddp_phys_birth == 0) + if (ddt_phys_birth(ddp, v) == 0) continue; - int ndvas = DDK_GET_CRYPT(&dde->dde_key) ? - SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + int ndvas = ddt_phys_dva_count(ddp, v, + DDK_GET_CRYPT(&dde->dde_key)); + const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? 
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva; + + uint64_t dsize = 0; for (int d = 0; d < ndvas; d++) - dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + dsize += dva_get_dsize_sync(spa, &dvas[d]); + + uint64_t refcnt = ddt_phys_refcnt(ddp, v); dds->dds_blocks += 1; dds->dds_lsize += lsize; diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 8f1bbeeecd8d..4e01624f3684 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -109,7 +109,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) static int ddt_zap_lookup(objset_t *os, uint64_t object, - const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize) + const ddt_key_t *ddk, void *phys, size_t psize) { uchar_t *cbuf; uint64_t one, csize; @@ -156,7 +156,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object) static int ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, - const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) + const void *phys, size_t psize, dmu_tx_t *tx) { const size_t cbuf_size = psize + 1; @@ -182,7 +182,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk, static int ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, - ddt_phys_t *phys, size_t psize) + void *phys, size_t psize) { zap_cursor_t zc; zap_attribute_t za; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index dec0eb28dc5f..daf1bd5d637b 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2929,7 +2929,7 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) + ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { (void) tx; const ddt_key_t *ddk = &ddlwe->ddlwe_key; @@ -2953,13 +2953,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, if (scn->scn_done_txg != 0) return; - for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { - ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p]; + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v); - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg) continue; - ddt_bp_create(checksum, ddk, ddp, &bp); + ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); @@ -3022,7 +3022,7 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); - dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &ddlwe, tx); + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx); n++; if (dsl_scan_check_suspend(scn, NULL)) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 34ab27ca1cb9..a79bf1ff7629 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3240,14 +3240,16 @@ zio_ddt_child_read_done(zio_t *zio) blkptr_t *bp = zio->io_bp; ddt_t *ddt; ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddt = ddt_select(zio->io_spa, bp); - ddp = ddt_phys_select(ddt, dde, bp); - if (zio->io_error == 0) - ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + + if (zio->io_error == 0) { + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + /* this phys variant doesn't need repair */ + ddt_phys_clear(dde->dde_phys, v); + } if (zio->io_error == 0 && 
dde->dde_io->dde_repair_abd == NULL) dde->dde_io->dde_repair_abd = zio->io_abd; @@ -3268,21 +3270,25 @@ zio_ddt_read_start(zio_t *zio) if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); - ddt_phys_t *ddp_self = ddt_phys_select(ddt, dde, bp); + ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp); + ddt_univ_phys_t *ddp = dde->dde_phys; blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; - if (ddp_self == NULL) + if (v_self == DDT_PHYS_NONE) return (zio); + /* issue I/O for the other copies */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0 || v == v_self) continue; - ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, - &blk); + + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, + ddp, v, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, @@ -3362,30 +3368,32 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (DDT_PHYS_IS_DITTO(ddt, p)) continue; + if (dde->dde_io == NULL) + continue; + zio_t *lio = dde->dde_io->dde_lead_zio[p]; + if (lio == NULL) + continue; - if (lio != NULL && do_raw) { + if (do_raw) return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); - } else if (lio != NULL) { - return (lio->io_orig_size != zio->io_orig_size || - abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); - } + + return (lio->io_orig_size != zio->io_orig_size || + abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } for (int p = 0; p < DDT_NPHYS(ddt); p++) { - if (DDT_PHYS_IS_DITTO(ddt, p)) - continue; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v); - ddt_phys_t *ddp = &dde->dde_phys[p]; - - if (ddp->ddp_phys_birth != 0 && do_raw) { + if (phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) @@ -3408,13 +3416,13 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) abd_free(tmpabd); ddt_enter(ddt); return (error != 0); - } else if (ddp->ddp_phys_birth != 0) { + } else if (phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); @@ -3442,52 +3450,87 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) } static void -zio_ddt_child_write_ready(zio_t *zio) +zio_ddt_child_write_done(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; - zio_t *pio; - if (zio->io_error) - return; + zio_link_t *zl = NULL; + ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); - ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + ddt_univ_phys_t *ddp = dde->dde_phys; ddt_enter(ddt); - ASSERT(dde->dde_io->dde_lead_zio[p] == zio); + /* we're the lead, so once we're done there's no one else outstanding */ + if (dde->dde_io->dde_lead_zio[p] == zio) + dde->dde_io->dde_lead_zio[p] = NULL; - ddt_phys_fill(ddp, zio->io_bp); + 
ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys; - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(zio, &zl)) != NULL) - ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); + if (zio->io_error != 0) { + /* + * The write failed, so we're about to abort the entire IO + * chain. We need to revert the entry back to what it was at + * the last time it was successfully extended. + */ + ddt_phys_copy(ddp, orig, v); + ddt_phys_clear(orig, v); + + ddt_exit(ddt); + return; + } + + /* + * We've successfully added new DVAs to the entry. Clear the saved + * state or, if there's still outstanding IO, remember it so we can + * revert to a known good state if that IO fails. + */ + if (dde->dde_io->dde_lead_zio[p] == NULL) + ddt_phys_clear(orig, v); + else + ddt_phys_copy(orig, ddp, v); + + /* + * Add references for all dedup writes that were waiting on the + * physical one, skipping any other physical writes that are waiting. + */ + zio_t *pio; + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) + ddt_phys_addref(ddp, v); + } ddt_exit(ddt); } static void -zio_ddt_child_write_done(zio_t *zio) +zio_ddt_child_write_ready(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; + zio_link_t *zl = NULL; + ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); + int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); - ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (zio->io_error != 0) + return; ddt_enter(ddt); - ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_io->dde_lead_zio[p] == zio); - dde->dde_io->dde_lead_zio[p] = NULL; + ddt_phys_extend(dde->dde_phys, v, zio->io_bp); - if (zio->io_error == 0) { - zio_link_t *zl = NULL; - while (zio_walk_parents(zio, &zl) != NULL) - ddt_phys_addref(ddp); - } else { - ddt_phys_clear(ddp); + zio_t *pio; + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) + ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg); } ddt_exit(ddt); @@ -3500,7 +3543,6 @@ zio_ddt_write(zio_t *zio) blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; - zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; @@ -3521,9 +3563,6 @@ zio_ddt_write(zio_t *zio) return (zio); } - int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); - ddt_phys_t *ddp = &dde->dde_phys[p]; - if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum @@ -3547,31 +3586,227 @@ zio_ddt_write(zio_t *zio) return (zio); } - ddt_alloc_entry_io(dde); + int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + ddt_univ_phys_t *ddp = dde->dde_phys; - if (ddp->ddp_phys_birth != 0 || dde->dde_io->dde_lead_zio[p] != NULL) { - if (ddp->ddp_phys_birth != 0) - ddt_bp_fill(ddp, bp, txg); - if (dde->dde_io->dde_lead_zio[p] != NULL) - zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); - else - ddt_phys_addref(ddp); - } else if (zio->io_bp_override) { - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); - ASSERT(BP_EQUAL(bp, zio->io_bp_override)); - ddt_phys_fill(ddp, bp); - ddt_phys_addref(ddp); + /* + * In the common cases, at this point we have a regular BP with no + * allocated DVAs, and the corresponding DDT entry for its checksum. + * Our goal is to fill the BP with enough DVAs to satisfy its copies= + * requirement. 
+ * + * One of three things needs to happen to fulfill this: + * + * - if the DDT entry has enough DVAs to satisfy the BP, we just copy + * them out of the entry and return; + * + * - if the DDT entry has no DVAs (ie its brand new), then we have to + * issue the write as normal so that DVAs can be allocated and the + * data land on disk. We then copy the DVAs into the DDT entry on + * return. + * + * - if the DDT entry has some DVAs, but too few, we have to issue the + * write, adjusted to have allocate fewer copies. When it returns, we + * add the new DVAs to the DDT entry, and update the BP to have the + * full amount it originally requested. + * + * In all cases, if there's already a writing IO in flight, we need to + * defer the action until after the write is done. If our action is to + * write, we need to adjust our request for additional DVAs to match + * what will be in the DDT entry after it completes. In this way every + * IO can be guaranteed to recieve enough DVAs simply by joining the + * end of the chain and letting the sequence play out. + */ + + /* + * Number of DVAs in the DDT entry. If the BP is encrypted we ignore + * the third one as normal. + */ + int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); + IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); + + /* Number of DVAs requested bya the IO. */ + uint8_t need_dvas = zp->zp_copies; + + /* + * What we do next depends on whether or not there's IO outstanding that + * will update this entry. + */ + if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) { + /* + * No IO outstanding, so we only need to worry about ourselves. + */ + + /* + * Override BPs bring their own DVAs and their own problems. + */ + if (zio->io_bp_override) { + /* + * For a brand-new entry, all the work has been done + * for us, and we can just fill it out from the provided + * block and leave. + */ + if (have_dvas == 0) { + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); + ASSERT(BP_EQUAL(bp, zio->io_bp_override)); + ddt_phys_extend(ddp, v, bp); + ddt_phys_addref(ddp, v); + ddt_exit(ddt); + return (zio); + } + + /* + * If we already have this entry, then we want to treat + * it like a regular write. To do this we just wipe + * them out and proceed like a regular write. + * + * Even if there are some DVAs in the entry, we still + * have to clear them out. We can't use them to fill + * out the dedup entry, as they are all referenced + * together by a bp already on disk, and will be freed + * as a group. + */ + BP_ZERO_DVAS(bp); + BP_SET_BIRTH(bp, 0, 0); + } + + /* + * If there are enough DVAs in the entry to service our request, + * then we can just use them as-is. + */ + if (have_dvas >= need_dvas) { + ddt_bp_fill(ddp, v, bp, txg); + ddt_phys_addref(ddp, v); + ddt_exit(ddt); + return (zio); + } + + /* + * Otherwise, we have to issue IO to fill the entry up to the + * amount we need. + */ + need_dvas -= have_dvas; } else { - cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, - zio->io_orig_size, zio->io_orig_size, zp, - zio_ddt_child_write_ready, NULL, - zio_ddt_child_write_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + /* + * There's a write in-flight. If there's already enough DVAs on + * the entry, then either there were already enough to start + * with, or the in-flight IO is between READY and DONE, and so + * has extended the entry with new DVAs. Either way, we don't + * need to do anything, we can just slot in behind it. 
+ */ + + if (zio->io_bp_override) { + /* + * If there's a write out, then we're soon going to + * have our own copies of this block, so clear out the + * override block and treat it as a regular dedup + * write. See comment above. + */ + BP_ZERO_DVAS(bp); + BP_SET_BIRTH(bp, 0, 0); + } + + if (have_dvas >= need_dvas) { + /* + * A minor point: there might already be enough + * committed DVAs in the entry to service our request, + * but we don't know which are completed and which are + * allocated but not yet written. In this case, should + * the IO for the new DVAs fail, we will be on the end + * of the IO chain and will also recieve an error, even + * though our request could have been serviced. + * + * This is an extremely rare case, as it requires the + * original block to be copied with a request for a + * larger number of DVAs, then copied again requesting + * the same (or already fulfilled) number of DVAs while + * the first request is active, and then that first + * request errors. In return, the logic required to + * catch and handle it is complex. For now, I'm just + * not going to bother with it. + */ + + /* + * We always fill the bp here as we may have arrived + * after the in-flight write has passed READY, and so + * missed out. + */ + ddt_bp_fill(ddp, v, bp, txg); + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); + ddt_exit(ddt); + return (zio); + } + + /* + * There's not enough in the entry yet, so we need to look at + * the write in-flight and see how many DVAs it will have once + * it completes. + * + * The in-flight write has potentially had its copies request + * reduced (if we're filling out an existing entry), so we need + * to reach in and get the original write to find out what it is + * expecting. + * + * Note that the parent of the lead zio will always have the + * highest zp_copies of any zio in the chain, because ones that + * can be serviced without additional IO are always added to + * the back of the chain. + */ + zio_link_t *zl = NULL; + zio_t *pio = + zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl); + ASSERT(pio); + uint8_t parent_dvas = pio->io_prop.zp_copies; + + if (parent_dvas >= need_dvas) { + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); + ddt_exit(ddt); + return (zio); + } - zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); - dde->dde_io->dde_lead_zio[p] = cio; + /* + * Still not enough, so we will need to issue to get the + * shortfall. + */ + need_dvas -= parent_dvas; } + /* + * We need to write. We will create a new write with the copies + * property adjusted to match the number of DVAs we need to need to + * grow the DDT entry by to satisfy the request. + */ + zio_prop_t czp = *zp; + czp.zp_copies = need_dvas; + zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, + zio->io_orig_size, zio->io_orig_size, &czp, + zio_ddt_child_write_ready, NULL, + zio_ddt_child_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); + + /* + * We are the new lead zio, because our parent has the highest + * zp_copies that has been requested for this entry so far. + */ + ddt_alloc_entry_io(dde); + if (dde->dde_io->dde_lead_zio[p] == NULL) { + /* + * First time out, take a copy of the stable entry to revert + * to if there's an error (see zio_ddt_child_write_done()) + */ + ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v); + } else { + /* + * Make the existing chain our child, because it cannot + * complete until we have. 
+ */ + zio_add_child(cio, dde->dde_io->dde_lead_zio[p]); + } + dde->dde_io->dde_lead_zio[p] = cio; + ddt_exit(ddt); zio_nowait(cio); @@ -3587,8 +3822,7 @@ zio_ddt_free(zio_t *zio) spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); - ddt_entry_t *dde; - ddt_phys_t *ddp; + ddt_entry_t *dde = NULL; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); @@ -3596,9 +3830,9 @@ zio_ddt_free(zio_t *zio) ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { - ddp = ddt_phys_select(ddt, dde, bp); - if (ddp) - ddt_phys_decref(ddp); + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + if (v != DDT_PHYS_NONE) + ddt_phys_decref(dde->dde_phys, v); } ddt_exit(ddt);
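[Review note, not part of the patch] The core of the new write path is the
extend step: ddt_phys_extend() slots freshly allocated DVAs into the first
free positions of the single flat phys, and records the birth txg only on the
first fill. The standalone model below mirrors that merge logic using
simplified stand-in types (sim_dva_t, sim_bp_t, sim_flat_phys_t); the
encrypted-BP special case handled by the real function is omitted.

#include <assert.h>
#include <stdio.h>

#define	SIM_DVAS_PER_BP	3

typedef struct { unsigned long long word; } sim_dva_t;	/* 0 == invalid */

typedef struct {
	sim_dva_t dva[SIM_DVAS_PER_BP];
	unsigned long long phys_birth;
} sim_flat_phys_t;

typedef struct {
	sim_dva_t dva[SIM_DVAS_PER_BP];
	int ndvas;
	unsigned long long birth;
} sim_bp_t;

static void
sim_phys_extend(sim_flat_phys_t *ddp, const sim_bp_t *bp)
{
	int s = 0, d = 0;

	/* Copy each new DVA into the first free slot of the entry. */
	while (s < bp->ndvas && d < SIM_DVAS_PER_BP) {
		if (ddp->dva[d].word != 0) {
			d++;			/* slot already populated */
			continue;
		}
		ddp->dva[d++] = bp->dva[s++];
	}
	assert(s == bp->ndvas);		/* the caller never over-asks */

	/* The birth txg is only set the first time the entry is filled. */
	if (ddp->phys_birth == 0)
		ddp->phys_birth = bp->birth;
}

int
main(void)
{
	/* Entry already holds one DVA from an earlier copies=1 write. */
	sim_flat_phys_t phys = { .dva = {{101}}, .phys_birth = 50 };

	/* A copies=3 write came in; only two extra DVAs were allocated. */
	sim_bp_t bp = { .dva = {{201}, {202}}, .ndvas = 2, .birth = 60 };

	sim_phys_extend(&phys, &bp);

	for (int d = 0; d < SIM_DVAS_PER_BP; d++)
		printf("dva[%d] = %llu\n", d, phys.dva[d].word);
	printf("phys_birth = %llu\n", phys.phys_birth);	/* stays 50 */

	return (0);
}

Running this shows the two new DVAs landing in the free slots after the
original one, with the original birth txg preserved, which is the state the
restructured zio_ddt_write() relies on when later writers reuse the entry.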