Skip to content

Commit

Permalink
ZIL: Improve next log block size prediction.
Browse files Browse the repository at this point in the history
Detect single-threaded workloads by checking that the previous block
is fully written and flushed.  This makes the size prediction logic
much more precise and lets us skip commit delays, since we can give
up on write aggregation in that case.

Since single-threaded workloads are no longer delayed, increase
zfs_commit_timeout_pct from 5 to 10%.  Parallel workloads should
care less about it, and it should provide more aggregation.

Remove the zil_min_commit_timeout tunable, since very fast ZILs
should detect most workloads as single-threaded.  And when they do
not, not delaying writes wastes the extra block space allocated for
aggregation.

Track history in the context of bursts, not individual log blocks.
This avoids blowing away all the history with a single large burst of
many blocks, and at the same time allows optimizations covering
multiple blocks in a burst and even the predicted following burst.
For each burst, account its optimal block size and minimal first
block size.  Use those statistics from the last 8 bursts to predict
the first block size of the next burst.

Remove the predefined set of block sizes.  Allocate any size we see
fit, as long as it is a multiple of 4KB, as required by ZIL now.
With compression enabled by default, ZFS already writes pretty random
block sizes, so this should not surprise the space allocator any more.

Reduce max_waste_space from 12 to 6% and max_copied_data from 63KB
to 8KB.  This allows prediction to be more precise on large bursts,
improves space efficiency and reduces extra memory copying.

Signed-off-by:	Alexander Motin <[email protected]>
Sponsored by:	iXsystems, Inc.
  • Loading branch information
amotin committed May 27, 2023
1 parent d3e0138 commit 1b3816c
Show file tree
Hide file tree
Showing 4 changed files with 261 additions and 142 deletions.
14 changes: 10 additions & 4 deletions include/os/linux/zfs/sys/trace_zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@
__field(uint64_t, zl_parse_lr_seq) \
__field(uint64_t, zl_parse_blk_count) \
__field(uint64_t, zl_parse_lr_count) \
__field(uint64_t, zl_cur_used) \
__field(uint64_t, zl_cur_size) \
__field(uint64_t, zl_cur_left) \
__field(uint64_t, zl_cur_max) \
__field(clock_t, zl_replay_time) \
__field(uint64_t, zl_replay_blks)

Expand All @@ -72,7 +74,9 @@
__entry->zl_parse_lr_seq = zilog->zl_parse_lr_seq; \
__entry->zl_parse_blk_count = zilog->zl_parse_blk_count;\
__entry->zl_parse_lr_count = zilog->zl_parse_lr_count; \
__entry->zl_cur_used = zilog->zl_cur_used; \
__entry->zl_cur_size = zilog->zl_cur_size; \
__entry->zl_cur_left = zilog->zl_cur_left; \
__entry->zl_cur_max = zilog->zl_cur_max; \
__entry->zl_replay_time = zilog->zl_replay_time; \
__entry->zl_replay_blks = zilog->zl_replay_blks;

Expand All @@ -82,7 +86,8 @@
"replay %u stop_sync %u logbias %u sync %u " \
"parse_error %u parse_blk_seq %llu parse_lr_seq %llu " \
"parse_blk_count %llu parse_lr_count %llu " \
"cur_used %llu replay_time %lu replay_blks %llu }"
/* zl_cur_size is recorded as uint64_t, so it is printed with %llu. */ \
"cur_size %llu cur_left %llu cur_max %llu replay_time %lu " \
"replay_blks %llu }"

#define ZILOG_TP_PRINTK_ARGS \
__entry->zl_lr_seq, __entry->zl_commit_lr_seq, \
Expand All @@ -92,7 +97,8 @@
__entry->zl_stop_sync, __entry->zl_logbias, __entry->zl_sync, \
__entry->zl_parse_error, __entry->zl_parse_blk_seq, \
__entry->zl_parse_lr_seq, __entry->zl_parse_blk_count, \
__entry->zl_parse_lr_count, __entry->zl_cur_used, \
__entry->zl_parse_lr_count, __entry->zl_cur_size, \
__entry->zl_cur_left, __entry->zl_cur_max, \
__entry->zl_replay_time, __entry->zl_replay_blks

#define ITX_TP_STRUCT_ENTRY \
Expand Down
12 changes: 8 additions & 4 deletions include/sys/zil_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ typedef struct zil_vdev_node {
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;

#define ZIL_PREV_BLKS 16
#define ZIL_BURSTS 8

/*
* Stable storage intent log management structure. One per dataset.
Expand Down Expand Up @@ -202,14 +202,18 @@ struct zilog {
uint64_t zl_parse_lr_count; /* number of log records parsed */
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
list_t zl_itx_commit_list; /* itx list to be committed */
uint64_t zl_cur_used; /* current commit log size used */
uint64_t zl_cur_size; /* current burst full size */
uint64_t zl_cur_left; /* current burst remaining size */
uint64_t zl_cur_max; /* biggest record in current burst */
list_t zl_lwb_list; /* in-flight log write list */
avl_tree_t zl_bp_tree; /* track bps during log parse */
clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */
zil_header_t zl_old_header; /* debugging aid */
uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
uint_t zl_prev_rotor; /* rotor for zl_prev[] */
uint_t zl_parallel; /* workload is multi-threaded */
uint_t zl_prev_rotor; /* rotor for zl_prev_* */
uint_t zl_prev_opt[ZIL_BURSTS]; /* optimal block size */
uint_t zl_prev_min[ZIL_BURSTS]; /* minimal first block size */
txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */
uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */

Expand Down
16 changes: 7 additions & 9 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.Dd January 10, 2023
.Dd May 26, 2023
.Dt ZFS 4
.Os
.
Expand Down Expand Up @@ -780,7 +780,7 @@ Note that this should not be set below the ZED thresholds
(currently 10 checksums over 10 seconds)
or else the daemon may not trigger any action.
.
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
This controls the amount of time that a ZIL block (lwb) will remain "open"
when it isn't "full", and it has a thread waiting for it to be committed to
stable storage.
Expand Down Expand Up @@ -2153,12 +2153,10 @@ On very fragmented pools, lowering this
.Pq typically to Sy 36 KiB
can improve performance.
.
.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
This sets the minimum delay in nanoseconds that the ZIL cares to delay
block commit, waiting for more records.
If ZIL writes are too fast, the kernel may not be able to sleep for so short
an interval, increasing log latency above that allowed by
.Sy zfs_commit_timeout_pct .
.It Sy zil_maxcopied Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint
This sets the maximum number of write bytes logged via WR_COPIED.
It tunes a tradeoff between additional memory copy and possibly worse log
space efficiency vs additional range lock/unlock.
.
.It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
Disable the cache flush commands that are normally sent to disk by
Expand All @@ -2170,7 +2168,7 @@ if a volatile out-of-order write cache is enabled.
Disable intent logging replay.
Can be disabled for recovery from corrupted ZIL.
.
.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768 KiB Pc Pq u64
.It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64
Limit SLOG write size per commit executed with synchronous priority.
Any writes above that will be executed with lower (asynchronous) priority
to limit potential SLOG device abuse by single active ZIL writer.
Expand Down
Loading

0 comments on commit 1b3816c

Please sign in to comment.