From 7024ec8db3e8f8baf3fa7584ae530efaa3bceba6 Mon Sep 17 00:00:00 2001 From: Fred Weigel Date: Wed, 10 May 2023 17:36:18 +0000 Subject: [PATCH] ddt: add support for prefetching tables into the ARC This change adds a new `zpool prefetch -t ddt $pool` command which causes a pool's DDT to be loaded into the ARC. The primary goal is to remove the need to "warm" a pool's cache before deduplication stops slowing write performance. It may also provide a way to reload portions of a DDT if they have been flushed due to inactivity. Co-authored-by: Will Andrews Co-authored-by: Allan Jude Co-authored-by: Don Brady Signed-off-by: Allan Jude Signed-off-by: Will Andrews Signed-off-by: Fred Weigel Signed-off-by: Rob Norris Sponsored-by: iXsystems, Inc. Sponsored-by: Catalogics, Inc. Sponsored-by: Klara, Inc. --- cmd/zdb/zdb.c | 4 +- cmd/zpool/zpool_main.c | 134 ++++++++++++++- cmd/ztest.c | 18 ++ include/libzfs.h | 3 + include/libzfs_core.h | 3 + include/sys/arc.h | 11 ++ include/sys/ddt.h | 4 +- include/sys/ddt_impl.h | 3 +- include/sys/dmu.h | 8 + include/sys/fs/zfs.h | 13 ++ include/sys/spa.h | 2 + include/sys/zap.h | 1 + lib/libzfs/libzfs_impl.h | 3 + lib/libzfs/libzfs_pool.c | 46 ++++++ lib/libzfs_core/libzfs_core.c | 18 ++ man/Makefile.am | 1 + man/man7/zpoolprops.7 | 4 + man/man8/zpool-prefetch.8 | 46 ++++++ man/man8/zpool-status.8 | 4 +- man/man8/zpool.8 | 5 +- module/zcommon/zpool_prop.c | 3 + module/zfs/arc.c | 53 +++++- module/zfs/ddt.c | 34 +++- module/zfs/ddt_stats.c | 29 ++++ module/zfs/ddt_zap.c | 7 + module/zfs/dmu.c | 156 +++++++++++++++++- module/zfs/spa.c | 58 ++++++- module/zfs/zap_micro.c | 17 ++ module/zfs/zfs_ioctl.c | 84 ++++++++-- tests/runfiles/common.run | 4 + tests/zfs-tests/tests/Makefile.am | 4 + .../cli_root/zpool_prefetch/cleanup.ksh | 30 ++++ .../cli_root/zpool_prefetch/setup.ksh | 32 ++++ .../zpool_prefetch/zpool_prefetch.cfg | 26 +++ .../zpool_prefetch/zpool_prefetch_001_pos.ksh | 128 ++++++++++++++ 35 files changed, 962 insertions(+), 34 deletions(-) create mode 100644 man/man8/zpool-prefetch.8 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 4880c8048726..5fde5f6316b9 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1933,8 +1933,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", name, (u_longlong_t)count, - (u_longlong_t)(dspace / count), - (u_longlong_t)(mspace / count)); + (u_longlong_t)dspace, + (u_longlong_t)mspace); if (dump_opt['D'] < 3) return; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 0783271f4734..9a2f77da15e6 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -32,7 +32,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K * Copyright (c) 2021, Colm Buckley - * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2023, Klara Inc. 
 * Copyright [2021] Hewlett Packard Enterprise Development LP
 */

@@ -87,6 +87,7 @@ static int zpool_do_remove(int, char **);
 static int zpool_do_labelclear(int, char **);
 
 static int zpool_do_checkpoint(int, char **);
+static int zpool_do_prefetch(int, char **);
 
 static int zpool_do_list(int, char **);
 static int zpool_do_iostat(int, char **);
@@ -166,6 +167,7 @@ typedef enum {
 	HELP_LIST,
 	HELP_OFFLINE,
 	HELP_ONLINE,
+	HELP_PREFETCH,
 	HELP_REPLACE,
 	HELP_REMOVE,
 	HELP_INITIALIZE,
@@ -297,6 +299,7 @@ static zpool_command_t command_table[] = {
 	{ "labelclear",	zpool_do_labelclear,	HELP_LABELCLEAR	},
 	{ NULL },
 	{ "checkpoint",	zpool_do_checkpoint,	HELP_CHECKPOINT	},
+	{ "prefetch",	zpool_do_prefetch,	HELP_PREFETCH	},
 	{ NULL },
 	{ "list",	zpool_do_list,		HELP_LIST	},
 	{ "iostat",	zpool_do_iostat,	HELP_IOSTAT	},
@@ -388,6 +391,9 @@ get_usage(zpool_help_t idx)
 		return (gettext("\tlist [-gHLpPv] [-o property[,...]] "
 		    "[-T d|u] [pool] ... \n"
 		    "\t    [interval [count]]\n"));
+	case HELP_PREFETCH:
+		return (gettext("\tprefetch -t <type> [<type opts>] <pool>\n"
+		    "\t    -t ddt <pool>\n"));
 	case HELP_OFFLINE:
 		return (gettext("\toffline [--power]|[[-f][-t]] <pool> "
 		    "<device> ...\n"));
@@ -3653,6 +3659,81 @@ zpool_do_checkpoint(int argc, char **argv)
 
 #define	CHECKPOINT_OPT	1024
 
+enum zpool_prefetch_type {
+	ZPOOL_PREFETCH_TYPE_DDT,
+};
+
+/*
+ * zpool prefetch <type> [<type opts>] <pool>
+ *
+ * Prefetches a particular type of data in the specified pool.
+ */
+int
+zpool_do_prefetch(int argc, char **argv)
+{
+	int c;
+	char *poolname;
+	char *typestr = NULL;
+	enum zpool_prefetch_type type;
+	zpool_handle_t *zhp;
+	int err = 0;
+
+	while ((c = getopt(argc, argv, "t:")) != -1) {
+		switch (c) {
+		case 't':
+			typestr = optarg;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	argc--;
+	argv++;
+
+	if (typestr != NULL && strcmp(typestr, "ddt") == 0) {
+		type = ZPOOL_PREFETCH_TYPE_DDT;
+	} else {
+		(void) fprintf(stderr, gettext("unsupported prefetch type\n"));
+		usage(B_FALSE);
+	}
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	switch (type) {
+	case ZPOOL_PREFETCH_TYPE_DDT:
+		err = zpool_prefetch_ddt(zhp);
+		break;
+	default:
+		break;
+	}
+	zpool_close(zhp);
+
+	return (err);
+}
+
 /*
  * zpool import [-d dir] [-D]
  *       import [-o mntopts] [-o prop=value] ...
[-R root] [-D] [-l] @@ -6275,6 +6356,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, const char *str, case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: + case ZPOOL_PROP_DEDUPCACHED: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else @@ -8621,13 +8703,17 @@ print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, } static void -print_dedup_stats(nvlist_t *config) +print_dedup_stats(zpool_handle_t *zhp, nvlist_t *config, boolean_t literal) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; - char dspace[6], mspace[6]; + /* Extra space provided for literal display */ + char dspace[32], mspace[32], cspace[32]; + uint64_t cspace_prop; + enum zfs_nicenum_format format; + zprop_source_t src; /* * If the pool was faulted then we may not have been able to @@ -8645,12 +8731,26 @@ print_dedup_stats(nvlist_t *config) return; } - zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); - zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); - (void) printf("DDT entries %llu, size %s on disk, %s in core\n", + /* + * Squash cached size into in-core size to handle race. + * Only include cached size if it is available. + */ + cspace_prop = MIN(zpool_get_prop_int(zhp, ZPOOL_PROP_DEDUPCACHED, &src), + ddo->ddo_mspace); + format = literal ? ZFS_NICENUM_RAW : ZFS_NICENUM_1024; + zfs_nicenum_format(cspace_prop, cspace, sizeof (cspace), format); + zfs_nicenum_format(ddo->ddo_dspace, dspace, sizeof (dspace), format); + zfs_nicenum_format(ddo->ddo_mspace, mspace, sizeof (mspace), format); + (void) printf("DDT entries %llu, size %s on disk, %s in core", (u_longlong_t)ddo->ddo_count, dspace, mspace); + if (src != ZPROP_SRC_DEFAULT) { + (void) printf(", %s cached (%.02f%%)", + cspace, + (double)cspace_prop / (double)ddo->ddo_mspace * 100.0); + } + (void) printf("\n"); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); @@ -8686,6 +8786,10 @@ status_callback(zpool_handle_t *zhp, void *data) uint_t c; vdev_stat_t *vs; + /* If dedup stats were requested, also fetch dedupcached. 
*/ + if (cbp->cb_dedup_stats > 1) + zpool_add_propname(zhp, ZPOOL_DEDUPCACHED_PROP_NAME); + config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); @@ -9167,7 +9271,7 @@ status_callback(zpool_handle_t *zhp, void *data) } if (cbp->cb_dedup_stats) - print_dedup_stats(config); + print_dedup_stats(zhp, config, cbp->cb_literal); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); @@ -9268,7 +9372,8 @@ zpool_do_status(int argc, char **argv) cb.cb_explain = B_TRUE; break; case 'D': - cb.cb_dedup_stats = B_TRUE; + if (++cb.cb_dedup_stats > 2) + cb.cb_dedup_stats = 2; break; case 't': cb.cb_print_vdev_trim = B_TRUE; @@ -10953,6 +11058,19 @@ zpool_do_set(int argc, char **argv) cb.cb_type = ZFS_TYPE_VDEV; } + /* argv[1], when supplied, is vdev name */ + if (argc == 2) { + if (!are_vdevs_in_pool(1, argv + 1, argv[0], &cb.cb_vdevs)) { + (void) fprintf(stderr, gettext( + "cannot find '%s' in '%s': device not in pool\n"), + argv[1], argv[0]); + return (EINVAL); + } + cb.cb_vdevs.cb_names = argv + 1; + cb.cb_vdevs.cb_names_count = 1; + cb.cb_type = ZFS_TYPE_VDEV; + } + error = for_each_pool(1, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, set_callback, &cb); diff --git a/cmd/ztest.c b/cmd/ztest.c index f834ad227e4c..d03d03ee2703 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -26,6 +26,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2023, Klara, Inc. */ /* @@ -446,6 +447,7 @@ ztest_func_t ztest_blake3; ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; +ztest_func_t ztest_pool_prefetch_ddt; static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -501,6 +503,7 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), + ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -6999,6 +7002,21 @@ ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) } } +void +ztest_pool_prefetch_ddt(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + spa_t *spa; + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + + ddt_prefetch_all(spa); + + spa_close(spa, FTAG); + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + static int ztest_set_global_vars(void) { diff --git a/include/libzfs.h b/include/libzfs.h index 4f06b5d3c24c..9f16922cdead 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -457,6 +457,7 @@ _LIBZFS_H nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); _LIBZFS_H nvlist_t *zpool_get_features(zpool_handle_t *); _LIBZFS_H int zpool_refresh_stats(zpool_handle_t *, boolean_t *); _LIBZFS_H int zpool_get_errlog(zpool_handle_t *, nvlist_t **); +_LIBZFS_H void zpool_add_propname(zpool_handle_t *, const char *); /* * Import and export functions @@ -503,6 +504,8 @@ _LIBZFS_H int zpool_checkpoint(zpool_handle_t *); _LIBZFS_H int zpool_discard_checkpoint(zpool_handle_t *); _LIBZFS_H boolean_t zpool_is_draid_spare(const char *); +_LIBZFS_H int zpool_prefetch_ddt(zpool_handle_t *); + /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. 
diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 867c18b9c226..3bc56087d345 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -147,6 +147,9 @@ _LIBZFS_CORE_H int lzc_pool_checkpoint_discard(const char *); _LIBZFS_CORE_H int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *); _LIBZFS_CORE_H int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t, boolean_t *); + +_LIBZFS_CORE_H int lzc_pool_prefetch_ddt(const char *); + _LIBZFS_CORE_H int lzc_wait_fs(const char *, zfs_wait_activity_t, boolean_t *); _LIBZFS_CORE_H int lzc_set_bootenv(const char *, const nvlist_t *); diff --git a/include/sys/arc.h b/include/sys/arc.h index 05307aab99e3..c92b3eee618c 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -250,6 +250,16 @@ typedef struct arc_buf_info { enum zio_compress abi_l2arc_compress; } arc_buf_info_t; +/* + * Flags returned by arc_cached; describes which part of the arc + * the block is cached in. + */ +#define ARC_CACHED_EMBEDDED (1U << 0) +#define ARC_CACHED_IN_L1 (1U << 1) +#define ARC_CACHED_IN_MRU (1U << 2) +#define ARC_CACHED_IN_MFU (1U << 3) +#define ARC_CACHED_IN_L2 (1U << 4) + void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); boolean_t arc_is_metadata(arc_buf_t *buf); @@ -310,6 +320,7 @@ zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); void arc_remove_prune_callback(arc_prune_t *p); void arc_freed(spa_t *spa, const blkptr_t *bp); +int arc_cached(spa_t *spa, const blkptr_t *bp); void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); diff --git a/include/sys/ddt.h b/include/sys/ddt.h index e0129eda5cf5..66d59cebacde 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -236,6 +236,7 @@ extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); extern uint64_t ddt_get_dedup_dspace(spa_t *spa); extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); +extern int ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize); extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); extern void ddt_enter(ddt_t *ddt); @@ -243,8 +244,9 @@ extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); extern void ddt_fini(void); extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); -extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); +extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); +extern void ddt_prefetch_all(spa_t *spa); extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index d6693658885b..7072cfb4ecae 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -47,6 +47,7 @@ typedef struct { const ddt_key_t *ddk); void (*ddt_op_prefetch)(objset_t *os, uint64_t object, const ddt_key_t *ddk); + void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); int (*ddt_op_update)(objset_t *os, uint64_t object, const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx); @@ -66,7 +67,7 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); * outside of the DDT implementation proper, and if you do, consider moving * them up. 
*/ -#define DDT_NAMELEN 110 +#define DDT_NAMELEN 118 extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 921f51f27a20..8174b21f63dd 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -505,6 +505,12 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); +/* + * Get an estimated cache size for an object. Caller must expect races. + */ +int dmu_object_cached_size(objset_t *os, uint64_t object, + uint64_t *l1sz, uint64_t *l2sz); + void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); @@ -903,6 +909,8 @@ extern uint_t zfs_max_recordsize; void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); +int dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, uint32_t flags); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 9bf9ca1e9dbe..959db8ab1b4b 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -260,6 +260,7 @@ typedef enum { ZPOOL_PROP_BCLONERATIO, ZPOOL_PROP_DEDUP_TABLE_SIZE, ZPOOL_PROP_DEDUP_TABLE_QUOTA, + ZPOOL_PROP_DEDUPCACHED, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -1508,6 +1509,7 @@ typedef enum zfs_ioc { ZFS_IOC_VDEV_GET_PROPS, /* 0x5a55 */ ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ + ZFS_IOC_POOL_PREFETCH_DDT, /* 0x5a58 */ /* * Per-platform (Optional) - 8/128 numbers reserved. @@ -1676,6 +1678,17 @@ typedef enum { */ #define ZPOOL_HIDDEN_ARGS "hidden_args" +/* + * The following is used when invoking ZFS_IOC_POOL_GET_PROPS. + */ +#define ZPOOL_GET_PROPS_NAMES "get_props_names" + +/* + * Opt-in property names used with ZPOOL_GET_PROPS_NAMES. + * For example, properties that are hidden or expensive to compute. + */ +#define ZPOOL_DEDUPCACHED_PROP_NAME "dedupcached" + /* * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. 
*/ diff --git a/include/sys/spa.h b/include/sys/spa.h index cada3c841037..9ca3f62b78e7 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1189,6 +1189,8 @@ extern void spa_boot_init(void); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern int spa_prop_get_nvlist(spa_t *spa, char **props, + unsigned int n_props, nvlist_t **outnvl); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); diff --git a/include/sys/zap.h b/include/sys/zap.h index 308a7c7284d7..3d1f971074f9 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -225,6 +225,7 @@ int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name); +int zap_prefetch_object(objset_t *os, uint64_t zapobj); int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints); diff --git a/lib/libzfs/libzfs_impl.h b/lib/libzfs/libzfs_impl.h index ef0359f45ea0..e98ede51e4ba 100644 --- a/lib/libzfs/libzfs_impl.h +++ b/lib/libzfs/libzfs_impl.h @@ -94,12 +94,15 @@ struct zfs_handle { * snapshots of volumes. */ #define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME) +#define ZHP_MAX_PROPNAMES 4 struct zpool_handle { libzfs_handle_t *zpool_hdl; zpool_handle_t *zpool_next; char zpool_name[ZFS_MAX_DATASET_NAME_LEN]; int zpool_state; + unsigned int zpool_n_propnames; + const char *zpool_propnames[ZHP_MAX_PROPNAMES]; size_t zpool_config_size; nvlist_t *zpool_config; nvlist_t *zpool_old_config; diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 36464080cc14..0a397c47219d 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -79,6 +79,13 @@ zpool_get_all_props(zpool_handle_t *zhp) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if (zhp->zpool_n_propnames > 0) { + nvlist_t *innvl = fnvlist_alloc(); + fnvlist_add_string_array(innvl, ZPOOL_GET_PROPS_NAMES, + zhp->zpool_propnames, zhp->zpool_n_propnames); + zcmd_write_src_nvlist(hdl, &zc, innvl); + } + zcmd_alloc_dst_nvlist(hdl, &zc, 0); while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { @@ -329,6 +336,15 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, break; case PROP_TYPE_NUMBER: + /* + * ZPOOL_PROP_DEDUPCACHED can be fetched by name only using + * the ZPOOL_GET_PROPS_NAMES mechanism + */ + if (prop == ZPOOL_PROP_DEDUPCACHED) { + zpool_add_propname(zhp, ZPOOL_DEDUPCACHED_PROP_NAME); + zpool_get_all_props(zhp); + } + intval = zpool_get_prop_int(zhp, prop, &src); switch (prop) { @@ -361,6 +377,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, case ZPOOL_PROP_BCLONESAVED: case ZPOOL_PROP_BCLONEUSED: case ZPOOL_PROP_DEDUP_TABLE_SIZE: + case ZPOOL_PROP_DEDUPCACHED: if (literal) (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); @@ -1738,6 +1755,27 @@ zpool_discard_checkpoint(zpool_handle_t *zhp) return (0); } +/* + * Load the DDT table for the given pool. 
+ */ +int +zpool_prefetch_ddt(zpool_handle_t *zhp) +{ + libzfs_handle_t *hdl = zhp->zpool_hdl; + char msg[1024]; + int error; + + error = lzc_pool_prefetch_ddt(zhp->zpool_name); + if (error != 0) { + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot prefetch ddt in '%s'"), zhp->zpool_name); + (void) zpool_standard_error(hdl, error, msg); + return (-1); + } + + return (0); +} + /* * Add the given vdevs to the pool. The caller must have already performed the * necessary verification to ensure that the vdev specification is well-formed. @@ -4398,6 +4436,14 @@ zbookmark_mem_compare(const void *a, const void *b) return (memcmp(a, b, sizeof (zbookmark_phys_t))); } +void +zpool_add_propname(zpool_handle_t *zhp, const char *propname) +{ + assert(zhp->zpool_n_propnames < ZHP_MAX_PROPNAMES); + zhp->zpool_propnames[zhp->zpool_n_propnames] = propname; + zhp->zpool_n_propnames++; +} + /* * Retrieve the persistent error log, uniquify the members, and return to the * caller. diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 01d803e21db0..663f2623cb86 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -1623,6 +1623,24 @@ lzc_pool_checkpoint_discard(const char *pool) return (error); } +/* + * Load the DDT table for the specified pool. + */ +int +lzc_pool_prefetch_ddt(const char *pool) +{ + int error; + nvlist_t *result = NULL; + nvlist_t *args = fnvlist_alloc(); + + error = lzc_ioctl(ZFS_IOC_POOL_PREFETCH_DDT, pool, args, &result); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} + /* * Executes a read-only channel program. * diff --git a/man/Makefile.am b/man/Makefile.am index 45156571eec3..2c495cded7c5 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -84,6 +84,7 @@ dist_man_MANS = \ %D%/man8/zpool-list.8 \ %D%/man8/zpool-offline.8 \ %D%/man8/zpool-online.8 \ + %D%/man8/zpool-prefetch.8 \ %D%/man8/zpool-reguid.8 \ %D%/man8/zpool-remove.8 \ %D%/man8/zpool-reopen.8 \ diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index 45ca5087f31b..6304f93a15ad 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ -73,6 +73,10 @@ The amount of storage used by cloned blocks. Percentage of pool space used. This property can also be referred to by its shortened column name, .Sy cap . +.It Sy dedupcached +Total size of the deduplication table currently loaded into the ARC. +See +.Xr zpool-prefetch 8 . .It Sy dedup_table_size Total on-disk size of the deduplication table. .It Sy expandsize diff --git a/man/man8/zpool-prefetch.8 b/man/man8/zpool-prefetch.8 new file mode 100644 index 000000000000..57445bd4a655 --- /dev/null +++ b/man/man8/zpool-prefetch.8 @@ -0,0 +1,46 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2023, Klara Inc. +.\" +.Dd February 14, 2024 +.Dt ZPOOL-PREFETCH 8 +.Os +. +.Sh NAME +.Nm zpool-prefetch +.Nd Loads specific types of data for the given pool +.Sh SYNOPSIS +.Nm zpool +.Cm prefetch +.Fl t Ar type +.Ar pool +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm zpool +.Cm prefetch +.Fl t Li ddt +.Ar pool +.Xc +Prefetch data of a specific type for the given pool; specifically the DDT, +which will improve write I/O performance when the DDT is resident in the ARC. +.El diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 24ad6e643cae..7f7e0b5b2328 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd February 14, 2024 .Dt ZPOOL-STATUS 8 .Os . @@ -96,6 +96,8 @@ Display a histogram of deduplication statistics, showing the allocated and referenced .Pq logically referenced in the pool block counts and sizes by reference count. +If repeated, (-DD), also shows statistics on how much of the DDT is resident +in the ARC. .It Fl s Display the number of leaf vdev slow I/O operations. This is the number of I/O operations that didn't complete in diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index fe44e15cabe1..2b966b72bf4c 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd February 14, 2024 .Dt ZPOOL 8 .Os . @@ -168,6 +168,8 @@ specified. . .Ss Maintenance .Bl -tag -width Ds +.It Xr zpool-prefetch 8 +Prefetches specific types of pool data. .It Xr zpool-scrub 8 Begins a scrub or resumes a paused scrub. .It Xr zpool-checkpoint 8 @@ -598,6 +600,7 @@ don't wait. .Xr zpool-list 8 , .Xr zpool-offline 8 , .Xr zpool-online 8 , +.Xr zpool-prefetch 8 , .Xr zpool-reguid 8 , .Xr zpool-remove 8 , .Xr zpool-reopen 8 , diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 173dd5d40ae4..d5ad3f8047c5 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -183,6 +183,9 @@ zpool_prop_init(void) zprop_register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto", PROP_TYPE_NUMBER, PROP_DEFAULT, ZFS_TYPE_POOL, "DEDUPDITTO", B_FALSE, sfeatures); + zprop_register_hidden(ZPOOL_PROP_DEDUPCACHED, + ZPOOL_DEDUPCACHED_PROP_NAME, PROP_TYPE_NUMBER, PROP_READONLY, + ZFS_TYPE_POOL, "DEDUPCACHED", B_FALSE, sfeatures); zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 3bcffb3c7ede..58e44a2426ff 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -26,7 +26,7 @@ * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * @@ -5441,6 +5441,57 @@ arc_read_done(zio_t *zio) } } +/* + * Lookup the block at the specified DVA (in bp), and return the manner in + * which the block is cached. A zero return indicates not cached. 
+ */
+int
+arc_cached(spa_t *spa, const blkptr_t *bp)
+{
+	arc_buf_hdr_t *hdr = NULL;
+	kmutex_t *hash_lock = NULL;
+	uint64_t guid = spa_load_guid(spa);
+	int flags = 0;
+
+	if (BP_IS_EMBEDDED(bp))
+		return (ARC_CACHED_EMBEDDED);
+
+	hdr = buf_hash_find(guid, bp, &hash_lock);
+	if (hdr == NULL)
+		return (0);
+
+	if (HDR_HAS_L1HDR(hdr)) {
+		arc_state_t *state = hdr->b_l1hdr.b_state;
+		/*
+		 * We switch to ensure that any future arc_state_type_t
+		 * changes are handled. This is just a shift to promote
+		 * more compile-time checking.
+		 */
+		switch (state->arcs_state) {
+		case ARC_STATE_ANON:
+			break;
+		case ARC_STATE_MRU:
+			flags |= ARC_CACHED_IN_MRU | ARC_CACHED_IN_L1;
+			break;
+		case ARC_STATE_MFU:
+			flags |= ARC_CACHED_IN_MFU | ARC_CACHED_IN_L1;
+			break;
+		case ARC_STATE_UNCACHED:
+			/* The header is still in L1, probably not for long */
+			flags |= ARC_CACHED_IN_L1;
+			break;
+		default:
+			break;
+		}
+	}
+	if (HDR_HAS_L2HDR(hdr))
+		flags |= ARC_CACHED_IN_L2;
+
+	mutex_exit(hash_lock);
+
+	return (flags);
+}
+
 /*
  * "Read" the block at the specified DVA (in bp) via the
  * cache. If the block is found in the cache, invoke the provided
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 5c5af14754c4..bbb49fdcbd20 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2022 by Pawel Jakub Dawidek
- * Copyright (c) 2023, Klara Inc.
+ * Copyright (c) 2019, 2023, Klara Inc.
  */
 
 #include 
@@ -340,6 +340,16 @@ ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
 	    ddt->ddt_object[type][class], ddk);
 }
 
+static void
+ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
+{
+	if (!ddt_object_exists(ddt, type, class))
+		return;
+
+	ddt_ops[type]->ddt_op_prefetch_all(ddt->ddt_os,
+	    ddt->ddt_object[type][class]);
+}
+
 static int
 ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
     ddt_entry_t *dde, dmu_tx_t *tx)
@@ -637,6 +647,28 @@ ddt_over_quota(spa_t *spa)
 	return (B_FALSE);
 }
 
+void
+ddt_prefetch_all(spa_t *spa)
+{
+	/*
+	 * Load all DDT entries for each type/class combination. This is
+	 * intended to perform a prefetch on all such blocks. For the same
+	 * reason that ddt_prefetch isn't locked, this is also not locked.
+ */ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { + ddt_object_prefetch_all(ddt, type, class); + } + } + } +} + ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 22806245db2d..f8e0745ad242 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -250,3 +250,32 @@ ddt_get_pool_dedup_ratio(spa_t *spa) return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); } + +int +ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize) +{ + uint64_t l1sz, l1tot, l2sz, l2tot; + int err = 0; + + l1tot = l2tot = 0; + *psize = 0; + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { + err = dmu_object_cached_size(ddt->ddt_os, + ddt->ddt_object[type][class], &l1sz, &l2sz); + if (err != 0) + return (err); + l1tot += l1sz; + l2tot += l2sz; + } + } + } + + *psize = l1tot + l2tot; + return (err); +} diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 741554de3c60..7ce7461a2b25 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -147,6 +147,12 @@ ddt_zap_prefetch(objset_t *os, uint64_t object, const ddt_key_t *ddk) (void) zap_prefetch_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS); } +static void +ddt_zap_prefetch_all(objset_t *os, uint64_t object) +{ + (void) zap_prefetch_object(os, object); +} + static int ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) @@ -231,6 +237,7 @@ const ddt_ops_t ddt_zap_ops = { ddt_zap_lookup, ddt_zap_contains, ddt_zap_prefetch, + ddt_zap_prefetch_all, ddt_zap_update, ddt_zap_remove, ddt_zap_walk, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index d82211e6d4c7..e3b6b8662169 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -26,7 +26,7 @@ * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek @@ -698,7 +698,7 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. If the range - * it too long, prefetch the first dmu_prefetch_max bytes as requested, while + * is too long, prefetch the first dmu_prefetch_max bytes as requested, while * for the rest only a higher level, also fitting within dmu_prefetch_max. It * should primarily help random reads, since for long sequential reads there is * a speculative prefetcher. 
@@ -766,6 +766,50 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 	dnode_rele(dn, FTAG);
 }
 
+/* Synchronously read the given range into the ARC, in dmu_prefetch_max-sized chunks; returns EINTR if interrupted by a signal. */
+int
+dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    uint32_t flags)
+{
+	dnode_t *dn;
+	dmu_buf_t **dbp;
+	int numbufs, err = 0;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err != 0)
+		return (err);
+
+	while (size > 0) {
+		uint64_t mylen = MIN(size, dmu_prefetch_max);
+
+		/*
+		 * NB: we could do this block-at-a-time, but it's nice
+		 * to be reading in parallel.
+		 *
+		 * XXX -- can we use dbuf_prefetch_impl() to avoid filling
+		 * the dbuf cache with decompressed ddt data which we don't
+		 * need right now?
+		 */
+		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
+		    TRUE, FTAG, &numbufs, &dbp, flags);
+		if (err)
+			break;
+
+		dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+		offset += mylen;
+		size -= mylen;
+
+		if (issig(JUSTLOOKING) && issig(FORREAL)) {
+			err = SET_ERROR(EINTR);
+			break;
+		}
+	}
+
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
 /*
  * Issue prefetch I/Os for the given object's dnode.
  */
@@ -1433,6 +1477,114 @@ dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
 }
 #endif /* _KERNEL */
 
+static void
+dbuf_cached_bps(spa_t *spa, blkptr_t *bps, uint_t nbps,
+    uint64_t *l1sz, uint64_t *l2sz)
+{
+	int cached_flags;
+
+	if (bps == NULL)
+		return;
+
+	for (size_t blk_off = 0; blk_off < nbps; blk_off++) {
+		blkptr_t *bp = &bps[blk_off];
+		uint64_t *u64p;
+
+		if (BP_IS_HOLE(bp))
+			continue;
+
+		cached_flags = arc_cached(spa, bp);
+		if (cached_flags == 0)
+			continue;
+
+		u64p = (cached_flags & ARC_CACHED_IN_L2) ? l2sz : l1sz;
+		*u64p += BP_GET_LSIZE(bp);
+	}
+}
+
+/*
+ * Estimate DMU object cached size.
+ */
+int
+dmu_object_cached_size(objset_t *os, uint64_t object,
+    uint64_t *l1sz, uint64_t *l2sz)
+{
+	dnode_t *dn;
+	dmu_object_info_t doi;
+	int level = 1;
+	int err = 0;
+
+	*l1sz = *l2sz = 0;
+
+	if (dnode_hold(os, object, FTAG, &dn) != 0)
+		return (0);
+
+	dmu_object_info_from_dnode(dn, &doi);
+
+	for (uint64_t off = 0; off < doi.doi_max_offset;
+	    off += dmu_prefetch_max) {
+		/* dbuf_read doesn't prefetch L1 blocks. */
+		dmu_prefetch(os, object, level, off,
+		    dmu_prefetch_max, ZIO_PRIORITY_NOW);
+	}
+
+	/*
+	 * Hold all valid L1 blocks, asking ARC the status of each BP
+	 * contained in each such L1 block.
+	 */
+	uint_t nbps = bp_span_in_blocks(dn->dn_indblkshift, level);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	for (uint64_t off = 0; off < doi.doi_max_offset;
+	    off += doi.doi_metadata_block_size) {
+		dmu_buf_impl_t *db = NULL;
+
+		/* Skip L1's that are not present (i.e., sparse) */
+		if (dnode_next_offset(dn, DNODE_FIND_HAVELOCK, &off, level,
+		    1, 0) != 0) {
+			break;
+		}
+
+		/*
+		 * If we get an i/o error here, the L1 can't be read,
+		 * and nothing under it could be cached, so we just
+		 * continue. Ignoring the error from dbuf_hold_impl
+		 * or from dbuf_read is then a reasonable choice.
+		 */
+		err = dbuf_hold_impl(dn, level, off >> dn->dn_indblkshift,
+		    B_TRUE, B_FALSE, FTAG, &db);
+		if (err != 0) {
+			/*
+			 * Ignore the error and continue.
+			 */
+			err = 0;
+			continue;
+		}
+
+		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+		if (err == 0) {
+			dbuf_cached_bps(dmu_objset_spa(os), db->db.db_data,
+			    nbps, l1sz, l2sz);
+		}
+		/*
+		 * Any read error may be ignored; reset and continue.
+		 */
+		err = 0;
+		dbuf_rele(db, FTAG);
+
+		if (issig(JUSTLOOKING) && issig(FORREAL)) {
+			/*
+			 * On interrupt, get out and bubble up EINTR.
+			 */
+			err = SET_ERROR(EINTR);
+			break;
+		}
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
 /*
  * Allocate a loaned anonymous arc buffer.
  */
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 33c9ab5b1664..1ecd074858ac 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -34,6 +34,7 @@
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2021, Colm Buckley
  * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2023, Klara Inc.
 */

/*
@@ -331,6 +332,55 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
 	nvlist_free(propval);
 }
 
+static int
+spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl)
+{
+	zpool_prop_t prop = zpool_name_to_prop(propname);
+	zprop_source_t src = ZPROP_SRC_NONE;
+	uint64_t intval;
+	int err;
+
+	/*
+	 * NB: Not all property lookups via this API require the spa
+	 * props lock; cases that need it must grab it explicitly here.
+	 */
+	switch (prop) {
+	case ZPOOL_PROP_DEDUPCACHED:
+		err = ddt_get_pool_dedup_cached(spa, &intval);
+		if (err != 0)
+			return (SET_ERROR(err));
+		break;
+	default:
+		return (SET_ERROR(EINVAL));
+	}
+
+	spa_prop_add_list(outnvl, prop, NULL, intval, src);
+
+	return (0);
+}
+
+int
+spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props,
+    nvlist_t **outnvl)
+{
+	int err = 0;
+
+	if (props == NULL)
+		return (0);
+
+	if (*outnvl == NULL) {
+		err = nvlist_alloc(outnvl, NV_UNIQUE_NAME, KM_SLEEP);
+		if (err)
+			return (err);
+	}
+
+	for (unsigned int i = 0; i < n_props && err == 0; i++) {
+		err = spa_prop_add(spa, props[i], *outnvl);
+	}
+
+	return (err);
+}
+
 /*
  * Add a user property (source=src, propname=propval) to an nvlist.
*/ @@ -497,9 +547,11 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) dsl_pool_t *dp; int err; - err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); - if (err) - return (err); + if (*nvp == NULL) { + err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); + if (err) + return (err); + } dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 085d9cd8b4b6..318a433f4042 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -1074,6 +1074,22 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) return (err); } +int +zap_prefetch_object(objset_t *os, uint64_t zapobj) +{ + int error; + dmu_object_info_t doi; + + error = dmu_object_info(os, zapobj, &doi); + if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) + error = SET_ERROR(EINVAL); + if (error == 0) { + dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset, + DMU_READ_PREFETCH); + } + return (error); +} + int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) @@ -1701,6 +1717,7 @@ EXPORT_SYMBOL(zap_lookup_uint64); EXPORT_SYMBOL(zap_contains); EXPORT_SYMBOL(zap_prefetch); EXPORT_SYMBOL(zap_prefetch_uint64); +EXPORT_SYMBOL(zap_prefetch_object); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index b2b06881bdd4..a240e1f6341b 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -38,7 +38,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. - * Copyright (c) 2019, 2021, Klara Inc. + * Copyright (c) 2019, 2023, Klara Inc. * Copyright (c) 2019, Allan Jude */ @@ -3008,34 +3008,51 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) return (error); } +/* + * innvl: { + * "get_props_names": [ "prop1", "prop2", ..., "propN" ] + * } + */ + +static const zfs_ioc_key_t zfs_keys_get_props[] = { + { ZPOOL_GET_PROPS_NAMES, DATA_TYPE_STRING_ARRAY, ZK_OPTIONAL }, +}; + static int -zfs_ioc_pool_get_props(zfs_cmd_t *zc) +zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { + nvlist_t *nvp = outnvl; spa_t *spa; + char **props = NULL; + unsigned int n_props = 0; int error; - nvlist_t *nvp = NULL; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + if (nvlist_lookup_string_array(innvl, ZPOOL_GET_PROPS_NAMES, + &props, &n_props) != 0) { + props = NULL; + } + + if ((error = spa_open(pool, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. 
*/ mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(zc->zc_name)) != NULL) + if ((spa = spa_lookup(pool)) != NULL) { error = spa_prop_get(spa, &nvp); + if (error == 0 && props != NULL) + error = spa_prop_get_nvlist(spa, props, n_props, + &nvp); + } mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); + if (error == 0 && props != NULL) + error = spa_prop_get_nvlist(spa, props, n_props, &nvp); spa_close(spa, FTAG); } - if (error == 0 && zc->zc_nvlist_dst != 0) - error = put_nvlist(zc, nvp); - else - error = SET_ERROR(EFAULT); - - nvlist_free(nvp); return (error); } @@ -4030,6 +4047,38 @@ zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, return (spa_checkpoint_discard(poolname)); } +/* + * innvl: unused + * outnvl: empty + */ +static const zfs_ioc_key_t zfs_keys_pool_prefetch_ddt[] = { + /* no nvl keys */ +}; + +static int +zfs_ioc_pool_prefetch_ddt(const char *poolname, nvlist_t *innvl, + nvlist_t *outnvl) +{ + int error; + spa_t *spa; + + (void) innvl, (void) outnvl; + error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + hrtime_t start_time = gethrtime(); + + ddt_prefetch_all(spa); + + zfs_dbgmsg("pool '%s': loaded ddt into ARC in %llu ms", spa->spa_name, + (u_longlong_t)((gethrtime() - start_time) / 1000000)); + + spa_close(spa, FTAG); + + return (error); +} + /* * inputs: * zc_name name of dataset to destroy @@ -7235,6 +7284,12 @@ zfs_ioctl_init(void) zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); + zfs_ioctl_register("zpool_prefetch_ddt", + ZFS_IOC_POOL_PREFETCH_DDT, zfs_ioc_pool_prefetch_ddt, + zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, + zfs_keys_pool_prefetch_ddt, ARRAY_SIZE(zfs_keys_pool_prefetch_ddt)); + zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, @@ -7280,6 +7335,11 @@ zfs_ioctl_init(void) POOL_CHECK_NONE, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); + zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS, + zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, + POOL_CHECK_NONE, B_FALSE, B_FALSE, + zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, @@ -7335,8 +7395,6 @@ zfs_ioctl_init(void) zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); - zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, - zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 34adb9119363..ee411077dd96 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -202,6 +202,10 @@ tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_verbose'] tags = ['functional', 'cli_root', 'zfs_create'] +[tests/functional/cli_root/zpool_prefetch] +tests = ['zpool_prefetch_001_pos'] +tags = ['functional', 'cli_root', 'zpool_prefetch'] + [tests/functional/cli_root/zfs_destroy] tests = ['zfs_clone_livelist_condense_and_disable', 'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup', diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 
1883bffdf2fa..a1881a09516c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -192,6 +192,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/cli_root/zpool_import/zpool_import.kshlib \ functional/cli_root/zpool_initialize/zpool_initialize.kshlib \ functional/cli_root/zpool_labelclear/labelclear.cfg \ + functional/cli_root/zpool_prefetch/zpool_prefetch.cfg \ functional/cli_root/zpool_remove/zpool_remove.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.shlib \ @@ -1168,6 +1169,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_online/setup.ksh \ functional/cli_root/zpool_online/zpool_online_001_pos.ksh \ functional/cli_root/zpool_online/zpool_online_002_neg.ksh \ + functional/cli_root/zpool_prefetch/cleanup.ksh \ + functional/cli_root/zpool_prefetch/setup.ksh \ + functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh \ functional/cli_root/zpool_remove/cleanup.ksh \ functional/cli_root/zpool_remove/setup.ksh \ functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/cleanup.ksh new file mode 100755 index 000000000000..79cd6e9f908e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/cleanup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/setup.ksh new file mode 100755 index 000000000000..6a9af3bc28c3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+DISK=${DISKS%% *}
+
+default_setup $DISK
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch.cfg
new file mode 100644
index 000000000000..70da58df084b
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch.cfg
@@ -0,0 +1,26 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh
new file mode 100755
index 000000000000..a96a38ff178a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh
@@ -0,0 +1,128 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019, 2023 by Klara Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+#	'zpool prefetch -t ddt <pool>' can successfully load a pool's DDT on demand.
+#
+# STRATEGY:
+#	1. Build up a storage pool with a deduplicated dataset.
+#	2. Export the pool.
+#	3. Import the pool, and use zpool prefetch -t ddt to load its table.
+#	4. Verify the DDT was loaded successfully using the DDT cache stats.
+#
+
+verify_runnable "both"
+
+log_assert "'zpool prefetch -t ddt <pool>' can successfully load the DDT for a pool."
+
+function getddtstats
+{
+	typeset -n gds=$1
+	typeset pool=$2
+
+	out=$(zpool status -DDp $pool | awk '/^ dedup: / {print $6 " " $9 " " $12}')
+	log_note "status -DDp output: ${out}"
+
+	gds.ondisk=$(echo $out | cut -d" " -f1)
+	gds.incore=$(echo $out | cut -d" " -f2)
+	gds.cached=$(echo $out | cut -d" " -f3)
+
+	# In case of missing data, reset to 0. This should normally be due
+	# to a pool without any DDT.
+	[ -z "${gds.ondisk}" ] && gds.ondisk="0"
+	[ -z "${gds.incore}" ] && gds.incore="0"
+	[ -z "${gds.cached}" ] && gds.cached="0"
+
+	return 0
+}
+
+# Confirm that nothing happens on a standard pool config.
+typeset -A before
+log_must getddtstats before $TESTPOOL
+log_note "before stats: ${before}"
+log_must test "${before.ondisk}" -eq "0"
+log_must test "${before.incore}" -eq "0"
+log_must test "${before.cached}" -eq "0"
+log_must zpool prefetch -t ddt $TESTPOOL
+
+# Build up the deduplicated dataset. This consists of creating enough files
+# to generate a reasonable size DDT for testing purposes.
+
+DATASET=$TESTPOOL/ddt
+log_must zfs create -o dedup=on $DATASET
+MNTPOINT=$(get_prop mountpoint $TESTPOOL/ddt)
+
+log_note "Generating dataset ..."
+typeset -i i=0
+while (( i < 16384 )); do
+	echo -n $i > $MNTPOINT/f.$i
+
+	# Create some copies of the original mainly for the purpose of
+	# having duplicate entries. About half will have no copies, while
+	# the remainder will have an equal distribution of 1-4 copies,
+	# depending on the number put into the original.
+	typeset -i j
+	((j = i % 8))
+	while (( j < 4 )); do
+		cp $MNTPOINT/f.$i $MNTPOINT/f.$i.$j
+		((j += 1))
+	done
+	((i += 1))
+done
+log_note "Dataset generation completed."
+
+typeset -A generated
+log_must getddtstats generated $TESTPOOL
+log_note "generated stats: ${generated}"
+log_must test "${generated.ondisk}" -ge "1048576"
+log_must test "${generated.incore}" -ge "1048576"
+log_must test "${generated.cached}" -ge "1048576"
+log_must zpool prefetch -t ddt $TESTPOOL
+
+# Do an export/import series to flush the DDT dataset cache.
+typeset -A reimport
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+log_must getddtstats reimport $TESTPOOL
+log_note "reimport stats: ${reimport}"
+log_must test "${reimport.ondisk}" -ge "1048576"
+log_must test "${reimport.incore}" -ge "1048576"
+# On reimport, only the first block or two should be cached.
+log_must test "${reimport.cached}" -le "65536"
+
+# Finally, reload it and check again.
+typeset -A reloaded
+log_must zpool prefetch -t ddt $TESTPOOL
+log_must getddtstats reloaded $TESTPOOL
+log_note "reloaded stats: ${reloaded}"
+log_must test "${reloaded.ondisk}" -ge "1048576"
+log_must test "${reloaded.incore}" -ge "1048576"
+log_must test "${reloaded.cached}" -eq "${reloaded.incore}"
+
+log_pass "'zpool prefetch -t ddt <pool>' success."
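
Usage sketch (illustrative, not part of the patch): on a dedup-enabled pool,
the new command and the extended `zpool status -DD` output added above can be
exercised as shown below. The pool name `tank` and the sizes in the sample
output are assumptions for the example, not results from a real run.

    # Load the pool's DDT into the ARC, then check how much of it is cached.
    zpool prefetch -t ddt tank
    zpool status -DD tank
    #  ...
    #  dedup: DDT entries 1048576, size 1.10G on disk, 1.29G in core, 1.29G cached (100.00%)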