Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast Dedup: “flat” DDT entry format #15893

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
359 changes: 226 additions & 133 deletions cmd/zdb/zdb.c

Large diffs are not rendered by default.

163 changes: 129 additions & 34 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ extern "C" {

struct abd;

/*
* DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
*/
#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */
#define DDT_FLAG_MASK (DDT_FLAG_FLAT)

/*
* DDT on-disk storage object types. Each one corresponds to specific
* implementation, see ddt_ops_t. The value itself is not stored on disk.
Expand Down Expand Up @@ -120,30 +126,80 @@ typedef struct {
* characteristics of the stored block, such as its location on disk (DVAs),
* birth txg and ref count.
*
* Note that an entry has an array of four ddt_phys_t, one for each number of
* DVAs (copies= property) and another for additional "ditto" copies. Most
* users of ddt_phys_t will handle indexing into or counting the phys they
* want.
* The "traditional" entry has an array of four, one for each number of DVAs
* (copies= property) and another for additional "ditto" copies. Users of the
* traditional struct will specify the variant (index) of the one they want.
*
* The newer "flat" entry has only a single form that is specified using the
* DDT_PHYS_FLAT variant.
*
* Since the value size varies, use one of the size macros when interfacing
* with the ddt zap.
*/
typedef struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddt_phys_t;

#define DDT_PHYS_MAX (4)

/*
* Named indexes into the ddt_phys_t array in each entry.
* Note - this can be used in a flexible array and allocated for
* a specific size (ddp_trad or ddp_flat). So be careful not to
* copy using "=" assignment but instead use ddt_phys_copy().
*/
typedef union {
/*
* Traditional physical payload value for DDT zap (256 bytes)
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddp_trad[DDT_PHYS_MAX];

/*
* Flat physical payload value for DDT zap (72 bytes)
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth; /* txg based from BP */
uint64_t ddp_class_start; /* in realtime seconds */
} ddp_flat;
} ddt_univ_phys_t;

/*
* This enum denotes which variant of a ddt_univ_phys_t to target. For
* a traditional DDT entry, it represents the indexes into the ddp_trad
* array. Any consumer of a ddt_univ_phys_t needs to know which variant
* is being targeted.
*
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
* we maintain the ability to free existing dedup-ditto blocks.
*/
enum ddt_phys_type {

typedef enum {
DDT_PHYS_DITTO = 0,
DDT_PHYS_SINGLE = 1,
DDT_PHYS_DOUBLE = 2,
DDT_PHYS_TRIPLE = 3,
DDT_PHYS_TYPES
};
DDT_PHYS_FLAT = 4,
DDT_PHYS_NONE = 5
} ddt_phys_variant_t;

#define DDT_PHYS_VARIANT(ddt, p) \
(ASSERT((p) < DDT_PHYS_NONE), \
allanjude marked this conversation as resolved.
Show resolved Hide resolved
((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))

#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat)

#define _DDT_PHYS_SWITCH(ddt, flat, trad) \
(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))

#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \
DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)

#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)
#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p)
#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0))

/*
* A "live" entry, holding changes to an entry made this txg, and other data to
Expand All @@ -154,16 +210,25 @@ enum ddt_phys_type {
#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */

/*
* Additional data to support entry update or repair. This is fixed size
* because its relatively rarely used.
*/
typedef struct {
/* key must be first for ddt_key_compare */
ddt_key_t dde_key; /* ddt_tree key */
ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */
/* copy of data after a repair read, to be rewritten */
abd_t *dde_repair_abd;

/* original phys contents before update, for error handling */
ddt_univ_phys_t dde_orig_phys;

/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_TYPES];
zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t;

/* copy of data after a repair read, to be rewritten */
struct abd *dde_repair_abd;
typedef struct {
/* key must be first for ddt_key_compare */
ddt_key_t dde_key; /* ddt_tree key */
avl_node_t dde_node; /* ddt_tree_node */

/* storage type and class the entry was loaded from */
ddt_type_t dde_type;
Expand All @@ -173,9 +238,22 @@ typedef struct {
kcondvar_t dde_cv; /* signaled when load completes */
uint64_t dde_waiters; /* count of waiters on dde_cv */

avl_node_t dde_node; /* ddt_tree node */
ddt_entry_io_t *dde_io; /* IO support, when required */

ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */
} ddt_entry_t;

/*
* A lightweight entry is for short-lived or transient uses, like iterating or
* inspecting, when you don't care where it came from.
*/
typedef struct {
ddt_key_t ddlwe_key;
ddt_type_t ddlwe_type;
ddt_class_t ddlwe_class;
ddt_univ_phys_t ddlwe_phys;
} ddt_lightweight_entry_t;

/*
* In-core DDT object. This covers all entries and stats for a the whole pool
* for a given checksum type.
Expand All @@ -185,11 +263,15 @@ typedef struct {

avl_tree_t ddt_tree; /* "live" (changed) entries this txg */

avl_tree_t ddt_repair_tree; /* entries being repaired */
avl_tree_t ddt_repair_tree; /* entries being repaired */

enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */

enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */
uint64_t ddt_dir_object; /* MOS dir holding ddt objects */
uint64_t ddt_version; /* DDT version */
uint64_t ddt_flags; /* FDT option flags */

/* per-type/per-class entry store objects */
uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
Expand All @@ -215,16 +297,26 @@ typedef struct {
uint64_t ddb_cursor;
} ddt_bookmark_t;

extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
uint64_t txg);
extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
blkptr_t *bp, uint64_t txg);
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_phys_t *ddp, blkptr_t *bp);
const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);

extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
extern void ddt_phys_clear(ddt_phys_t *ddp);
extern void ddt_phys_addref(ddt_phys_t *ddp);
extern void ddt_phys_decref(ddt_phys_t *ddp);
extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
const blkptr_t *bp);
extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
ddt_phys_variant_t v);
extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
boolean_t encrypted);

extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
Expand All @@ -243,14 +335,16 @@ extern void ddt_enter(ddt_t *ddt);
extern void ddt_exit(ddt_t *ddt);
extern void ddt_init(void);
extern void ddt_fini(void);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
extern void ddt_prefetch_all(spa_t *spa);

extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
const blkptr_t *bp);

extern void ddt_alloc_entry_io(ddt_entry_t *dde);

extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);

Expand All @@ -260,7 +354,8 @@ extern void ddt_create(spa_t *spa);
extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
ddt_lightweight_entry_t *ddlwe);

extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);

Expand Down
27 changes: 22 additions & 5 deletions include/sys/ddt_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,23 @@
extern "C" {
#endif

/* DDT version numbers */
#define DDT_VERSION_LEGACY (0)
#define DDT_VERSION_FDT (1)

/* Names of interesting objects in the DDT root dir */
#define DDT_DIR_VERSION "version"
#define DDT_DIR_FLAGS "flags"

/* Fill a lightweight entry from a live entry. */
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (dde)->dde_key; \
(ddlwe)->ddlwe_type = (dde)->dde_type; \
(ddlwe)->ddlwe_class = (dde)->dde_class; \
memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)

/*
* Ops vector to access a specific DDT object type.
*/
Expand All @@ -42,19 +59,19 @@ typedef struct {
boolean_t prehash);
int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
int (*ddt_op_lookup)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
const ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_contains)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
int (*ddt_op_update)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize,
const ddt_key_t *ddk, const void *phys, size_t psize,
dmu_tx_t *tx);
int (*ddt_op_remove)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, dmu_tx_t *tx);
int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
} ddt_ops_t;

Expand All @@ -74,7 +91,7 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
*/
#define DDT_NAMELEN 32

extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde);

extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);

Expand All @@ -83,7 +100,7 @@ extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
char *name);
extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
uint64_t *walk, ddt_entry_t *dde);
uint64_t *walk, ddt_lightweight_entry_t *ddlwe);
extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
uint64_t *count);
extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz,
Expand Down
1 change: 1 addition & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ typedef struct dmu_buf {
#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
#define DMU_POOL_DDT "DDT-%s-%s-%s"
#define DMU_POOL_DDT_STATS "DDT-statistics"
#define DMU_POOL_DDT_DIR "DDT-%s"
#define DMU_POOL_CREATION_VERSION "creation_version"
#define DMU_POOL_SCAN "scan"
#define DMU_POOL_ERRORSCRUB "error_scrub"
Expand Down
2 changes: 1 addition & 1 deletion include/sys/dsl_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx);
ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
Expand Down
7 changes: 6 additions & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -572,14 +572,19 @@ typedef struct blkptr {
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
BP_GET_PSIZE(bp))

#define BP_ZERO(bp) \
#define BP_ZERO_DVAS(bp) \
{ \
(bp)->blk_dva[0].dva_word[0] = 0; \
(bp)->blk_dva[0].dva_word[1] = 0; \
(bp)->blk_dva[1].dva_word[0] = 0; \
(bp)->blk_dva[1].dva_word[1] = 0; \
(bp)->blk_dva[2].dva_word[0] = 0; \
(bp)->blk_dva[2].dva_word[1] = 0; \
}

#define BP_ZERO(bp) \
{ \
BP_ZERO_DVAS(bp); \
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
Expand Down
1 change: 1 addition & 0 deletions include/zfeature_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ typedef enum spa_feature {
SPA_FEATURE_AVZ_V2,
SPA_FEATURE_REDACTION_LIST_SPILL,
SPA_FEATURE_RAIDZ_EXPANSION,
SPA_FEATURE_FAST_DEDUP,
SPA_FEATURES
} spa_feature_t;

Expand Down
11 changes: 6 additions & 5 deletions lib/libzfs/libzfs.abi
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,7 @@
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2296' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2352' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
Expand Down Expand Up @@ -6006,7 +6006,8 @@
<enumerator name='SPA_FEATURE_AVZ_V2' value='38'/>
<enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/>
<enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/>
<enumerator name='SPA_FEATURES' value='41'/>
<enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/>
<enumerator name='SPA_FEATURES' value='42'/>
</enum-decl>
<typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
<qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/>
Expand Down Expand Up @@ -9131,8 +9132,8 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18368' id='b93e4d14'>
<subrange length='41' type-id='7359adad' id='cb834f44'/>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18816' id='b937914f'>
<subrange length='42' type-id='7359adad' id='cb7c937f'/>
</array-type-def>
<enum-decl name='zfeature_flags' id='6db816a4'>
<underlying-type type-id='9cac1fee'/>
Expand Down Expand Up @@ -9209,7 +9210,7 @@
<pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
<qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
<pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
<var-decl name='spa_feature_table' type-id='b93e4d14' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='spa_feature_table' type-id='b937914f' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
<function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
Expand Down
Loading
Loading