Skip to content

Commit

Permalink
ddt: add support for prefetching tables into the ARC
Browse files Browse the repository at this point in the history
This change adds a new `zpool prefetch -t ddt $pool` command which
causes a pool's DDT to be loaded into the ARC. The primary goal is to
remove the need to "warm" a pool's cache before deduplication stops
slowing write performance. It may also provide a way to reload portions
of a DDT if they have been flushed due to inactivity.

Sponsored-by: iXsystems, Inc.
Sponsored-by: Catalogics, Inc.
Sponsored-by: Klara, Inc.
Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Allan Jude <[email protected]>
Signed-off-by: Will Andrews <[email protected]>
Signed-off-by: Fred Weigel <[email protected]>
Signed-off-by: Rob Norris <[email protected]>
Signed-off-by: Don Brady <[email protected]>
Co-authored-by: Will Andrews <[email protected]>
Co-authored-by: Don Brady <[email protected]>
Closes #15890
  • Loading branch information
allanjude authored Jul 26, 2024
1 parent 2ed1aeb commit 62e7d3c
Show file tree
Hide file tree
Showing 37 changed files with 1,057 additions and 42 deletions.
4 changes: 2 additions & 2 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1985,8 +1985,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
name,
(u_longlong_t)count,
(u_longlong_t)(dspace / count),
(u_longlong_t)(mspace / count));
(u_longlong_t)dspace,
(u_longlong_t)mspace);

if (dump_opt['D'] < 3)
return;
Expand Down
112 changes: 104 additions & 8 deletions cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, loli10K <[email protected]>
* Copyright (c) 2021, Colm Buckley <[email protected]>
* Copyright (c) 2021, Klara Inc.
* Copyright (c) 2021, 2023, Klara Inc.
* Copyright [2021] Hewlett Packard Enterprise Development LP
*/

Expand Down Expand Up @@ -90,6 +90,7 @@ static int zpool_do_remove(int, char **);
static int zpool_do_labelclear(int, char **);

static int zpool_do_checkpoint(int, char **);
static int zpool_do_prefetch(int, char **);

static int zpool_do_list(int, char **);
static int zpool_do_iostat(int, char **);
Expand Down Expand Up @@ -176,6 +177,7 @@ typedef enum {
HELP_LIST,
HELP_OFFLINE,
HELP_ONLINE,
HELP_PREFETCH,
HELP_REPLACE,
HELP_REMOVE,
HELP_INITIALIZE,
Expand Down Expand Up @@ -307,6 +309,7 @@ static zpool_command_t command_table[] = {
{ "labelclear", zpool_do_labelclear, HELP_LABELCLEAR },
{ NULL },
{ "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT },
{ "prefetch", zpool_do_prefetch, HELP_PREFETCH },
{ NULL },
{ "list", zpool_do_list, HELP_LIST },
{ "iostat", zpool_do_iostat, HELP_IOSTAT },
Expand Down Expand Up @@ -398,6 +401,9 @@ get_usage(zpool_help_t idx)
return (gettext("\tlist [-gHLpPv] [-o property[,...]] "
"[-T d|u] [pool] ... \n"
"\t [interval [count]]\n"));
case HELP_PREFETCH:
return (gettext("\tprefetch -t <type> [<type opts>] <pool>\n"
"\t -t ddt <pool>\n"));
case HELP_OFFLINE:
return (gettext("\toffline [--power]|[[-f][-t]] <pool> "
"<device> ...\n"));
Expand Down Expand Up @@ -3827,6 +3833,72 @@ zpool_do_checkpoint(int argc, char **argv)

#define CHECKPOINT_OPT 1024

/*
 * zpool prefetch -t <type> [<type opts>] <pool>
 *
 * Prefetches a particular type of data in the specified pool.  The only
 * type accepted here is "ddt".
 *
 * Returns 1 if the pool cannot be opened, otherwise the value returned by
 * zpool_prefetch().  Argument errors are reported through usage().
 */
int
zpool_do_prefetch(int argc, char **argv)
{
	int c;
	char *poolname;
	char *typestr = NULL;
	zpool_prefetch_type_t type;
	zpool_handle_t *zhp;
	int err = 0;

	/*
	 * The leading ':' in the option string makes getopt() return ':'
	 * when an option argument is missing; without it the ':' case
	 * below can never be reached and a bare "-t" is misreported as an
	 * invalid option.
	 */
	while ((c = getopt(argc, argv, ":t:")) != -1) {
		switch (c) {
		case 't':
			typestr = optarg;
			break;
		case ':':
			(void) fprintf(stderr, gettext("missing argument for "
			    "'%c' option\n"), optopt);
			usage(B_FALSE);
			break;
		case '?':
			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
			    optopt);
			usage(B_FALSE);
			break;
		}
	}
	argc -= optind;
	argv += optind;

	if (argc < 1) {
		(void) fprintf(stderr, gettext("missing pool name argument\n"));
		usage(B_FALSE);
	}

	if (argc > 1) {
		(void) fprintf(stderr, gettext("too many arguments\n"));
		usage(B_FALSE);
	}

	poolname = argv[0];

	/*
	 * -t is mandatory.  typestr is still NULL when it was not given, and
	 * handing NULL to strcmp() is undefined behaviour, so test for that
	 * before comparing.
	 * NOTE(review): as before, this code relies on usage() not returning
	 * (otherwise 'type' would be used uninitialized) -- confirm.
	 */
	if (typestr == NULL) {
		(void) fprintf(stderr, gettext("missing prefetch type\n"));
		usage(B_FALSE);
	} else if (strcmp(typestr, "ddt") == 0) {
		type = ZPOOL_PREFETCH_DDT;
	} else {
		(void) fprintf(stderr, gettext("unsupported prefetch type\n"));
		usage(B_FALSE);
	}

	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
		return (1);

	err = zpool_prefetch(zhp, type);

	zpool_close(zhp);

	return (err);
}

/*
* zpool import [-d dir] [-D]
* import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l]
Expand Down Expand Up @@ -6446,6 +6518,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, const char *str,
case ZPOOL_PROP_EXPANDSZ:
case ZPOOL_PROP_CHECKPOINT:
case ZPOOL_PROP_DEDUPRATIO:
case ZPOOL_PROP_DEDUPCACHED:
if (value == 0)
(void) strlcpy(propval, "-", sizeof (propval));
else
Expand Down Expand Up @@ -8792,13 +8865,17 @@ print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache,
}

static void
print_dedup_stats(nvlist_t *config)
print_dedup_stats(zpool_handle_t *zhp, nvlist_t *config, boolean_t literal)
{
ddt_histogram_t *ddh;
ddt_stat_t *dds;
ddt_object_t *ddo;
uint_t c;
char dspace[6], mspace[6];
/* Extra space provided for literal display */
char dspace[32], mspace[32], cspace[32];
uint64_t cspace_prop;
enum zfs_nicenum_format format;
zprop_source_t src;

/*
* If the pool was faulted then we may not have been able to
Expand All @@ -8816,12 +8893,26 @@ print_dedup_stats(nvlist_t *config)
return;
}

zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace));
zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace));
(void) printf("DDT entries %llu, size %s on disk, %s in core\n",
/*
* Squash cached size into in-core size to handle race.
* Only include cached size if it is available.
*/
cspace_prop = zpool_get_prop_int(zhp, ZPOOL_PROP_DEDUPCACHED, &src);
cspace_prop = MIN(cspace_prop, ddo->ddo_mspace);
format = literal ? ZFS_NICENUM_RAW : ZFS_NICENUM_1024;
zfs_nicenum_format(cspace_prop, cspace, sizeof (cspace), format);
zfs_nicenum_format(ddo->ddo_dspace, dspace, sizeof (dspace), format);
zfs_nicenum_format(ddo->ddo_mspace, mspace, sizeof (mspace), format);
(void) printf("DDT entries %llu, size %s on disk, %s in core",
(u_longlong_t)ddo->ddo_count,
dspace,
mspace);
if (src != ZPROP_SRC_DEFAULT) {
(void) printf(", %s cached (%.02f%%)",
cspace,
(double)cspace_prop / (double)ddo->ddo_mspace * 100.0);
}
(void) printf("\n");

verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS,
(uint64_t **)&dds, &c) == 0);
Expand Down Expand Up @@ -8857,6 +8948,10 @@ status_callback(zpool_handle_t *zhp, void *data)
uint_t c;
vdev_stat_t *vs;

/* If dedup stats were requested, also fetch dedupcached. */
if (cbp->cb_dedup_stats > 1)
zpool_add_propname(zhp, ZPOOL_DEDUPCACHED_PROP_NAME);

config = zpool_get_config(zhp, NULL);
reason = zpool_get_status(zhp, &msgid, &errata);

Expand Down Expand Up @@ -9338,7 +9433,7 @@ status_callback(zpool_handle_t *zhp, void *data)
}

if (cbp->cb_dedup_stats)
print_dedup_stats(config);
print_dedup_stats(zhp, config, cbp->cb_literal);
} else {
(void) printf(gettext("config: The configuration cannot be "
"determined.\n"));
Expand Down Expand Up @@ -9412,7 +9507,8 @@ zpool_do_status(int argc, char **argv)
cmd = optarg;
break;
case 'D':
cb.cb_dedup_stats = B_TRUE;
if (++cb.cb_dedup_stats > 2)
cb.cb_dedup_stats = 2;
break;
case 'e':
cb.cb_print_unhealthy = B_TRUE;
Expand Down
18 changes: 18 additions & 0 deletions cmd/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2023, Klara, Inc.
*/

/*
Expand Down Expand Up @@ -444,6 +445,7 @@ ztest_func_t ztest_blake3;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
ztest_func_t ztest_pool_prefetch_ddt;

static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
Expand Down Expand Up @@ -499,6 +501,7 @@ static ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
};

#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
Expand Down Expand Up @@ -6993,6 +6996,21 @@ ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id)
}
}

/*
 * ztest operation: open the pool under test and ask for all of its dedup
 * tables to be prefetched via ddt_prefetch_all().
 */
void
ztest_pool_prefetch_ddt(ztest_ds_t *zd, uint64_t id)
{
	spa_t *pool;

	/* Neither argument is used by this test. */
	(void) zd;
	(void) id;

	/* ztest_name_lock is held as a reader for as long as the pool is open. */
	(void) pthread_rwlock_rdlock(&ztest_name_lock);

	/* Failing to open the test pool is fatal. */
	VERIFY0(spa_open(ztest_opts.zo_pool, &pool, FTAG));
	ddt_prefetch_all(pool);
	spa_close(pool, FTAG);

	(void) pthread_rwlock_unlock(&ztest_name_lock);
}

static int
ztest_set_global_vars(void)
{
Expand Down
1 change: 1 addition & 0 deletions contrib/debian/openzfs-zfsutils.install
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ usr/share/man/man8/zpool-labelclear.8
usr/share/man/man8/zpool-list.8
usr/share/man/man8/zpool-offline.8
usr/share/man/man8/zpool-online.8
usr/share/man/man8/zpool-prefetch.8
usr/share/man/man8/zpool-reguid.8
usr/share/man/man8/zpool-remove.8
usr/share/man/man8/zpool-reopen.8
Expand Down
3 changes: 3 additions & 0 deletions include/libzfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,7 @@ _LIBZFS_H nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
_LIBZFS_H nvlist_t *zpool_get_features(zpool_handle_t *);
_LIBZFS_H int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
_LIBZFS_H int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
_LIBZFS_H void zpool_add_propname(zpool_handle_t *, const char *);

/*
* Import and export functions
Expand Down Expand Up @@ -504,6 +505,8 @@ _LIBZFS_H int zpool_checkpoint(zpool_handle_t *);
_LIBZFS_H int zpool_discard_checkpoint(zpool_handle_t *);
_LIBZFS_H boolean_t zpool_is_draid_spare(const char *);

_LIBZFS_H int zpool_prefetch(zpool_handle_t *, zpool_prefetch_type_t);

/*
* Basic handle manipulations. These functions do not create or destroy the
* underlying datasets, only the references to them.
Expand Down
3 changes: 3 additions & 0 deletions include/libzfs_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ _LIBZFS_CORE_H int lzc_pool_checkpoint_discard(const char *);
_LIBZFS_CORE_H int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *);
_LIBZFS_CORE_H int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t,
boolean_t *);

_LIBZFS_CORE_H int lzc_pool_prefetch(const char *, zpool_prefetch_type_t);

_LIBZFS_CORE_H int lzc_wait_fs(const char *, zfs_wait_activity_t, boolean_t *);

_LIBZFS_CORE_H int lzc_set_bootenv(const char *, const nvlist_t *);
Expand Down
11 changes: 11 additions & 0 deletions include/sys/arc.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,16 @@ typedef struct arc_buf_info {
enum zio_compress abi_l2arc_compress;
} arc_buf_info_t;

/*
 * Flags returned by arc_cached(); they describe which part of the ARC
 * the block is cached in. Each flag is a distinct bit, so the values
 * can be OR-ed together and tested individually.
 */
#define ARC_CACHED_EMBEDDED (1U << 0)
#define ARC_CACHED_IN_L1 (1U << 1)
#define ARC_CACHED_IN_MRU (1U << 2)
#define ARC_CACHED_IN_MFU (1U << 3)
#define ARC_CACHED_IN_L2 (1U << 4)

void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
boolean_t arc_is_metadata(arc_buf_t *buf);
Expand Down Expand Up @@ -310,6 +320,7 @@ zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv);
void arc_remove_prune_callback(arc_prune_t *p);
void arc_freed(spa_t *spa, const blkptr_t *bp);
int arc_cached(spa_t *spa, const blkptr_t *bp);

void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
Expand Down
4 changes: 3 additions & 1 deletion include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,15 +236,17 @@ extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);

extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
extern int ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize);

extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
extern void ddt_enter(ddt_t *ddt);
extern void ddt_exit(ddt_t *ddt);
extern void ddt_init(void);
extern void ddt_fini(void);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
extern void ddt_prefetch_all(spa_t *spa);

extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
const blkptr_t *bp);
Expand Down
1 change: 1 addition & 0 deletions include/sys/ddt_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ typedef struct {
const ddt_key_t *ddk);
void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
int (*ddt_op_update)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize,
dmu_tx_t *tx);
Expand Down
8 changes: 8 additions & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,12 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx);

/*
* Get an estimated cache size for an object. Caller must expect races.
*/
int dmu_object_cached_size(objset_t *os, uint64_t object,
uint64_t *l1sz, uint64_t *l2sz);

void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
int compressed_size, int byteorder, dmu_tx_t *tx);
Expand Down Expand Up @@ -903,6 +909,8 @@ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
uint64_t len, enum zio_priority pri);
void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);
int dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size);

typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */
Expand Down
Loading

0 comments on commit 62e7d3c

Please sign in to comment.