ZIO: add "vdev tracing" facility; use it for ZIL flushing
A problem with zio_flush() is that it issues a flush ZIO to a top-level
vdev, which then recursively issues child flush ZIOs until the real leaf
devices are flushed. As usual, an error in a child ZIO results in the
parent ZIO erroring too, so if a leaf device has failed, its flush ZIO
will fail, and so will the entire flush operation.

This didn't matter when we used to ignore flush errors, but now that we
propagate them, the flush error flows up into the ZIL write ZIO. This
causes the ZIL to believe its write failed, and fall back to a full txg
wait. This still provides correct behaviour for zil_commit() callers (eg
fsync()) but it ruins performance.

We cannot simply skip flushing failed vdevs, because the associated
write may have succeeded before the vdev failed, which would give the
appearance that the write is fully flushed when it is not. Neither can we
issue a "syncing write" to the device (eg SCSI FUA), as this also
degrades performance.

The answer is that we must bind writes and flushes together in a way
such that we only flush the physical devices that we wrote to.

This adds a "vdev tracing" facility to ZIOs, zio_vdev_trace. When
enabled on a ZIO with ZIO_FLAG_VDEV_TRACE, upon successful completion
(in the _done handler) zio->io_vdev_trace_tree will contain a set of
zio_vdev_trace_t records, each describing a vdev that was involved in
the successful completion of the ZIO.
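
For illustration, here is a minimal sketch of how such trace records
might be maintained. The zio.c side of this commit is not rendered on
this page, so the comparator and helper below are hypothetical names,
modeled on the zil_lwb_vdev_compare pattern this commit removes and
keyed on vdev guid per the zio_vdev_trace_t definition in zio.h:

    typedef struct zio_vdev_trace {
            uint64_t zvt_guid;      /* guid of a vdev this zio touched */
            avl_node_t zvt_node;    /* AVL tree linkage */
    } zio_vdev_trace_t;

    static int
    zio_vdev_trace_compare(const void *x1, const void *x2)
    {
            const uint64_t g1 = ((const zio_vdev_trace_t *)x1)->zvt_guid;
            const uint64_t g2 = ((const zio_vdev_trace_t *)x2)->zvt_guid;

            return (TREE_CMP(g1, g2));
    }

    static void
    zio_vdev_trace_add(avl_tree_t *t, uint64_t guid)
    {
            zio_vdev_trace_t *zvt, search;
            avl_index_t where;

            search.zvt_guid = guid;
            if (avl_find(t, &search, &where) != NULL)
                    return;         /* each vdev is recorded at most once */
            zvt = kmem_alloc(sizeof (*zvt), KM_SLEEP);
            zvt->zvt_guid = guid;
            avl_insert(t, zvt, where);
    }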

A companion function, zio_vdev_trace_flush(), is included; it issues a
flush ZIO to each vdev recorded on the given trace tree.
zil_lwb_write_done() is updated to use this to bind ZIL writes and
flushes together.
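
As a sketch of what that might look like (again, the zio.c
implementation is not rendered on this page, so this is an assumption;
in particular, resolving each traced guid back to a vdev via
spa_lookup_by_guid() is a guess at the mechanism):

    void
    zio_vdev_trace_flush(zio_t *pio, avl_tree_t *t)
    {
            spa_t *spa = pio->io_spa;
            zio_vdev_trace_t *zvt;

            for (zvt = avl_first(t); zvt != NULL; zvt = AVL_NEXT(t, zvt)) {
                    /* Resolve the traced guid to a vdev; skip it if gone. */
                    vdev_t *vd =
                        spa_lookup_by_guid(spa, zvt->zvt_guid, B_FALSE);
                    if (vd != NULL)
                            zio_flush(pio, vd, B_TRUE);
            }
    }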

The tracing facility is similar in many ways to the "deferred flushing"
facility inside the ZIL, to the point where it can replace it. Now, if
the flush should be deferred, the trace records from the writing ZIO are
captured and combined with any captured from previous writes. When it's
finally time to issue the flush, we issue it to the entire accumulated
set of traced vdevs.
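
Condensed from the zil_lwb_write_done() changes below (locking and
asserts elided; "defer" stands in for the check that
list_is_empty(&lwb->lwb_waiters)), the accumulate-or-flush decision is:

    if (defer && nlwb != NULL) {
            /* Pass this write's vdevs, plus any deferred to us, onward. */
            zio_vdev_trace_copy(&zio->io_vdev_trace_tree,
                &nlwb->lwb_vdev_tree);
            zio_vdev_trace_move(&lwb->lwb_vdev_tree, &nlwb->lwb_vdev_tree);
    } else {
            /* Flush everything accumulated so far, then clean up. */
            zio_vdev_trace_copy(&zio->io_vdev_trace_tree,
                &lwb->lwb_vdev_tree);
            zio_vdev_trace_flush(lwb->lwb_root_zio, &lwb->lwb_vdev_tree);
            zio_vdev_trace_empty(&lwb->lwb_vdev_tree);
    }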

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <[email protected]>
robn committed Aug 17, 2024
1 parent 8a09462 commit e35088c
Showing 6 changed files with 243 additions and 118 deletions.
2 changes: 1 addition & 1 deletion include/sys/zil.h
@@ -604,7 +604,7 @@ extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
extern int zil_suspend(const char *osname, void **cookiep);
extern void zil_resume(void *cookie);

extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp);
extern void zil_lwb_add_flush(struct lwb *lwb, zio_t *zio);
extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg);
extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);

9 changes: 0 additions & 9 deletions include/sys/zil_impl.h
@@ -172,15 +172,6 @@ typedef struct itx_async_node {
avl_node_t ia_node; /* AVL tree linkage */
} itx_async_node_t;

/*
* Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
* we've touched so we know which ones need a write cache flush at the end.
*/
typedef struct zil_vdev_node {
uint64_t zv_vdev; /* vdev to be flushed */
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;

#define ZIL_BURSTS 8

/*
22 changes: 18 additions & 4 deletions include/sys/zio.h
@@ -180,16 +180,17 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_SCRUB (1ULL << 4)
#define ZIO_FLAG_SCAN_THREAD (1ULL << 5)
#define ZIO_FLAG_PHYSICAL (1ULL << 6)
#define ZIO_FLAG_VDEV_TRACE (1ULL << 7)

#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)

/*
* Flags inherited by ddt, gang, and vdev children.
*/
#define ZIO_FLAG_CANFAIL (1ULL << 7) /* must be first for INHERIT */
#define ZIO_FLAG_SPECULATIVE (1ULL << 8)
#define ZIO_FLAG_CONFIG_WRITER (1ULL << 9)
#define ZIO_FLAG_DONT_RETRY (1ULL << 10)
#define ZIO_FLAG_CANFAIL (1ULL << 8) /* must be first for INHERIT */
#define ZIO_FLAG_SPECULATIVE (1ULL << 9)
#define ZIO_FLAG_CONFIG_WRITER (1ULL << 10)
#define ZIO_FLAG_DONT_RETRY (1ULL << 11)
#define ZIO_FLAG_NODATA (1ULL << 12)
#define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13)
#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14)
@@ -445,6 +446,11 @@ enum zio_qstate {
ZIO_QS_ACTIVE,
};

typedef struct zio_vdev_trace {
uint64_t zvt_guid;
avl_node_t zvt_node;
} zio_vdev_trace_t;

struct zio {
/* Core information about this I/O */
zbookmark_phys_t io_bookmark;
@@ -513,6 +519,7 @@ struct zio {
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
avl_tree_t io_vdev_trace_tree;
uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;
@@ -636,6 +643,13 @@ extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
extern void zio_vdev_io_redone(zio_t *zio);

extern void zio_vdev_trace_init(avl_tree_t *t);
extern void zio_vdev_trace_fini(avl_tree_t *t);
extern void zio_vdev_trace_copy(avl_tree_t *src, avl_tree_t *dst);
extern void zio_vdev_trace_move(avl_tree_t *src, avl_tree_t *dst);
extern void zio_vdev_trace_flush(zio_t *zio, avl_tree_t *t);
extern void zio_vdev_trace_empty(avl_tree_t *t);

extern void zio_change_priority(zio_t *pio, zio_priority_t priority);

extern void zio_checksum_verified(zio_t *zio);
19 changes: 10 additions & 9 deletions module/zfs/dmu.c
@@ -1806,12 +1806,11 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
zgd_t *zgd = dsa->dsa_zgd;

/*
* Record the vdev(s) backing this blkptr so they can be flushed after
* the writes for the lwb have completed.
* Capture the trace records for this zio so the vdevs can be flushed
* after the writes for the lwb have completed.
*/
if (zio->io_error == 0) {
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
}
if (zio->io_error == 0)
zil_lwb_add_flush(zgd->zgd_lwb, zio);

mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
@@ -1865,10 +1864,10 @@ dmu_sync_late_arrival_done(zio_t *zio)

if (zio->io_error == 0) {
/*
* Record the vdev(s) backing this blkptr so they can be
* Capture the trace records for this zio so the vdevs can be
* flushed after the writes for the lwb have completed.
*/
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
zil_lwb_add_flush(zgd->zgd_lwb, zio);

if (!BP_IS_HOLE(bp)) {
blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
@@ -1955,7 +1954,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
dsa, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, zb));

return (0);
}
@@ -2122,7 +2122,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
&zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE,
&zb));

return (0);
}
162 changes: 67 additions & 95 deletions module/zfs/zil.c
@@ -798,15 +798,6 @@ zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
}
}

static int
zil_lwb_vdev_compare(const void *x1, const void *x2)
{
const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

return (TREE_CMP(v1, v2));
}

/*
* Allocate a new lwb. We may already have a block pointer for it, in which
* case we get size and version from there. Or we may not yet, in which case
@@ -1369,68 +1360,19 @@ zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
}

void
zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
zil_lwb_add_flush(struct lwb *lwb, zio_t *zio)
{
avl_tree_t *t = &lwb->lwb_vdev_tree;
avl_index_t where;
zil_vdev_node_t *zv, zvsearch;
int ndvas = BP_GET_NDVAS(bp);
int i;

ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);

if (zil_nocacheflush)
return;

mutex_enter(&lwb->lwb_vdev_lock);
for (i = 0; i < ndvas; i++) {
zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
if (avl_find(t, &zvsearch, &where) == NULL) {
zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
zv->zv_vdev = zvsearch.zv_vdev;
avl_insert(t, zv, where);
}
}
zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree);
mutex_exit(&lwb->lwb_vdev_lock);
}

static void
zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
{
avl_tree_t *src = &lwb->lwb_vdev_tree;
avl_tree_t *dst = &nlwb->lwb_vdev_tree;
void *cookie = NULL;
zil_vdev_node_t *zv;

ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);

/*
* While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
* not need the protection of lwb_vdev_lock (it will only be modified
* while holding zilog->zl_lock) as its writes and those of its
* children have all completed. The younger 'nlwb' may be waiting on
* future writes to additional vdevs.
*/
mutex_enter(&nlwb->lwb_vdev_lock);
/*
* Tear down the 'lwb' vdev tree, ensuring that entries which do not
* exist in 'nlwb' are moved to it, freeing any would-be duplicates.
*/
while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
avl_index_t where;

if (avl_find(dst, zv, &where) == NULL) {
avl_insert(dst, zv, where);
} else {
kmem_free(zv, sizeof (*zv));
}
}
mutex_exit(&nlwb->lwb_vdev_lock);
}

void
zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
{
@@ -1573,9 +1515,6 @@ zil_lwb_write_done(zio_t *zio)
lwb_t *lwb = zio->io_private;
spa_t *spa = zio->io_spa;
zilog_t *zilog = lwb->lwb_zilog;
avl_tree_t *t = &lwb->lwb_vdev_tree;
void *cookie = NULL;
zil_vdev_node_t *zv;
lwb_t *nlwb;

ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
@@ -1601,13 +1540,9 @@ zil_lwb_write_done(zio_t *zio)
nlwb = NULL;
mutex_exit(&zilog->zl_lock);

if (avl_numnodes(t) == 0)
return;

/*
* If there was an IO error, we're not going to call zio_flush()
* on these vdevs, so we simply empty the tree and free the
* nodes. We avoid calling zio_flush() since there isn't any
* If there was an IO error, there's no point continuing. We
* avoid calling zio_flush() since there isn't any
* good reason for doing so, after the lwb block failed to be
* written out.
*
@@ -1617,38 +1552,78 @@ zil_lwb_write_done(zio_t *zio)
* we expect any error seen here to have been propagated to
* that function).
*/
if (zio->io_error != 0) {
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
kmem_free(zv, sizeof (*zv));
if (zio->io_error != 0 || zil_nocacheflush) {
/*
* Trace records may have been passed on to us from a
* previously-deferred lwb. Since we're not going to use them,
* we clean them up now.
*/
zio_vdev_trace_empty(&lwb->lwb_vdev_tree);
return;
}

/*
* If this lwb does not have any threads waiting for it to complete, we
* want to defer issuing the flush command to the vdevs written to by
* "this" lwb, and instead rely on the "next" lwb to handle the flush
* command for those vdevs. Thus, we merge the vdev tree of "this" lwb
* with the vdev tree of the "next" lwb in the list, and assume the
* "next" lwb will handle flushing the vdevs (or deferring the flush(s)
* again).
* This zio was issued with ZIO_FLAG_VDEV_TRACE, and so its
* io_vdev_trace_tree has the set of zio_vdev_trace_t records for vdevs
* that were written to by "this" lwb.
*
* This is a useful performance optimization, especially for workloads
* To complete the commit, we need to issue a flush to those vdevs. If
* this lwb does not have any threads waiting for it to complete, we
* want to defer issuing that flush, and instead rely on the "next" lwb
* to handle the flush command for those vdevs.
*
* (This is a useful performance optimization, especially for workloads
* with lots of async write activity and few sync write and/or fsync
* activity, as it has the potential to coalesce multiple flush
* commands to a vdev into one.
* commands to a vdev into one).
*
* However, if this lwb does have threads waiting for it, we must issue
* the flush now.
*
* Regardless of whether or not we issue the flush now or defer it to
* the "next" lwb, our lwb_vdev_tree may also have entries on it that
* were deferred to us.
*
* So, our job here is to assemble a single tree of vdev trace records
* including all those that were written by this zio, plus any that
* were deferred to us, and either issue the flush now, or pass them on
* to the "next" lwb.
*
* Because the zio owns the entries on its trace tree, and it is no
* longer available to us after this function returns, we make our
* own copies instead of just stealing them.
*/
if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
zil_lwb_flush_defer(lwb, nlwb);
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);

/*
* 'lwb' is at a point in its lifetime where lwb_vdev_tree does
* not need the protection of lwb_vdev_lock (it will only be
* modified while holding zilog->zl_lock), as its writes and
* those of its children have all completed. The younger 'nlwb',
* however, may be waiting on future writes to additional vdevs.
*/
mutex_enter(&nlwb->lwb_vdev_lock);
zio_vdev_trace_copy(&zio->io_vdev_trace_tree,
&nlwb->lwb_vdev_tree);

/*
* Also move any trace records from this lwb to the next.
*/
zio_vdev_trace_move(&lwb->lwb_vdev_tree, &nlwb->lwb_vdev_tree);
mutex_exit(&nlwb->lwb_vdev_lock);
return;
}

while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
if (vd != NULL)
zio_flush(lwb->lwb_root_zio, vd, B_TRUE);
kmem_free(zv, sizeof (*zv));
}
/*
* Fill out the trace tree with records, then issue the flush,
* delivering errors to zil_lwb_flush_vdevs_done().
*/
zio_vdev_trace_copy(&zio->io_vdev_trace_tree, &lwb->lwb_vdev_tree);
zio_vdev_trace_flush(lwb->lwb_root_zio, &lwb->lwb_vdev_tree);
zio_vdev_trace_empty(&lwb->lwb_vdev_tree);
}

/*
@@ -1931,8 +1906,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
&lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
lwb, prio, ZIO_FLAG_CANFAIL, &zb);
zil_lwb_add_block(lwb, &lwb->lwb_blk);
lwb, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_VDEV_TRACE, &zb);

if (lwb->lwb_slim) {
/* For Slim ZIL only write what is used. */
@@ -3773,8 +3747,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag)
list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
offsetof(zil_commit_waiter_t, zcw_node));
avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
zio_vdev_trace_init(&lwb->lwb_vdev_tree);
mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
@@ -3785,7 +3758,7 @@ zil_lwb_dest(void *vbuf, void *unused)
(void) unused;
lwb_t *lwb = vbuf;
mutex_destroy(&lwb->lwb_vdev_lock);
avl_destroy(&lwb->lwb_vdev_tree);
zio_vdev_trace_fini(&lwb->lwb_vdev_tree);
list_destroy(&lwb->lwb_waiters);
list_destroy(&lwb->lwb_itxs);
}
@@ -4373,7 +4346,6 @@ EXPORT_SYMBOL(zil_sync);
EXPORT_SYMBOL(zil_clean);
EXPORT_SYMBOL(zil_suspend);
EXPORT_SYMBOL(zil_resume);
EXPORT_SYMBOL(zil_lwb_add_block);
EXPORT_SYMBOL(zil_bp_tree_add);
EXPORT_SYMBOL(zil_set_sync);
EXPORT_SYMBOL(zil_set_logbias);
