From 883cd30f908cb5c5a2df673ec85ebacac7933d11 Mon Sep 17 00:00:00 2001
From: Alexander Stetsenko
Date: Sat, 24 Aug 2024 18:33:18 +0300
Subject: [PATCH] Implement parallel dbuf eviction

In the previous code, dbuf_evict_thread() would call dbuf_evict_one()
in a loop while dbuf_cache_above_lowater(). dbuf_evict_one() would
select a random sublist from the dbuf cache, then walk it from the
tail forward, attempting to acquire the lock on each object until it
succeeded, then evict that object and return. As the name suggests,
it would evict only a single object from the cache. However, evicting
one object is not likely to bring us below the desired low water mark,
so dbuf_evict_one() will be called again, where it will loop over the
same busy objects again until it finds one it can evict.

This has been replaced with dbuf_evict_many(), which takes a specific
sublist as a parameter, as well as a desired amount of data to evict.
It then walks the sublist from the tail forward, evicting what it can
until the number of bytes evicted satisfies the input parameter or the
head of the sublist is reached.

The dbuf_evict_thread now runs in parallel as well, allowing it to
keep up with demand more easily. For the dbuf cache, if the single
thread was not able to keep up, ZFS would shift the work of evicting
some items to each incoming I/O thread. While that is still the case,
it should be seen much less often now that dbuf_evict is more
efficient and no longer limited to a single thread.

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Co-authored-by: Allan Jude
Co-authored-by: Mateusz Piotrowski
Signed-off-by: Alexander Stetsenko
Signed-off-by: Allan Jude
Signed-off-by: Mateusz Piotrowski
---
 man/man4/zfs.4    |  13 +++-
 module/zfs/dbuf.c | 189 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 187 insertions(+), 15 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 20bb95c1aeea..32eca143dfbe 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -16,7 +16,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd June 27, 2024
+.Dd August 28, 2024
 .Dt ZFS 4
 .Os
 .
@@ -73,6 +73,17 @@ When set to
 .Sy 0
 the array is dynamically sized based on total system memory.
 .
+.It Sy dbuf_evict_parallel Ns = Ns Sy 0 Pq uint
+When set to 1, ZFS will use up to
+.Sy dbuf_evict_threads
+threads to evict dbuf data in parallel, improving the responsiveness
+of ZFS to memory pressure.
+.
+.It Sy dbuf_evict_threads Ns = Ns Sy 0 Pq uint
+Sets the maximum number of dbuf eviction threads to be used.
+When set to 0, ZFS uses one-eighth of the available CPUs,
+with a minimum of 2 and a maximum of 16.
+.
 .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint
 dnode slots allocated in a single operation as a power of 2.
 The default value minimizes lock contention for the bulk operation performed.
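
An illustrative sketch follows (not part of the patch): a small userland
model of the tail-walk described in the commit message, where list, node,
and the "busy" flag are hypothetical stand-ins for the multilist, the dbuf,
and a failed mutex_tryenter(). The real dbuf_evict_many() additionally
inserts a marker dbuf so it can drop and re-take the sublist lock around
each eviction; the sketch omits that locking.

#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *prev, *next;
	size_t size;
	int busy;	/* models a dbuf whose mutex cannot be acquired */
};

struct list {
	struct node *head, *tail;
};

static void
list_remove(struct list *l, struct node *n)
{
	if (n->prev != NULL)
		n->prev->next = n->next;
	else
		l->head = n->next;
	if (n->next != NULL)
		n->next->prev = n->prev;
	else
		l->tail = n->prev;
	n->prev = n->next = NULL;
}

/*
 * Walk from the tail (oldest entry) toward the head, skipping busy
 * entries, until 'bytes' have been evicted or the head is reached.
 */
static size_t
evict_many(struct list *l, size_t bytes)
{
	size_t evicted = 0;

	for (struct node *n = l->tail; n != NULL && evicted < bytes; ) {
		struct node *prev = n->prev;

		if (!n->busy) {			/* mutex_tryenter() succeeded */
			list_remove(l, n);	/* models evicting the dbuf */
			evicted += n->size;
			free(n);
		}
		n = prev;
	}
	return (evicted);
}

int
main(void)
{
	struct list l = { NULL, NULL };

	/* Build a small cache: four 8 KiB entries, one of them busy. */
	for (int i = 0; i < 4; i++) {
		struct node *n = calloc(1, sizeof (*n));
		n->size = 8192;
		n->busy = (i == 1);
		n->next = l.head;
		if (l.head != NULL)
			l.head->prev = n;
		l.head = n;
		if (l.tail == NULL)
			l.tail = n;
	}

	/* Ask for 16 KiB; the busy entry is skipped, two others are evicted. */
	printf("evicted %zu bytes\n", evict_many(&l, 16384));
	return (0);
}
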
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 099883ba2652..f2d6d27efe1b 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
  */
 static kmem_cache_t *dbuf_kmem_cache;
 static taskq_t *dbu_evict_taskq;
+static taskq_t *dbuf_evict_taskq;
 
 static kthread_t *dbuf_cache_evict_thread;
 static kmutex_t dbuf_evict_lock;
@@ -237,6 +238,20 @@ static uint_t dbuf_metadata_cache_shift = 6;
 
 /* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
 static uint_t dbuf_mutex_cache_shift = 0;
 
+/*
+ * Number of dbuf_evict threads
+ */
+static uint_t dbuf_evict_threads = 0;
+
+/*
+ * The minimum number of bytes we can evict at once is a block size.
+ * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define	DBUF_MIN_EVICT_PERTASK_SHIFT	(SPA_MAXBLOCKSHIFT)
+
+static uint_t dbuf_evict_parallel = 0;
+
 static unsigned long dbuf_cache_target_bytes(void);
 static unsigned long dbuf_metadata_cache_target_bytes(void);
@@ -762,26 +777,47 @@ dbuf_cache_above_lowater(void)
 }
 
 /*
- * Evict the oldest eligible dbuf from the dbuf cache.
+ * Evict the oldest eligible dbufs from the dbuf cache.
+ * Use the multilist sublist (mls) with the provided index #idx.
  */
 static void
-dbuf_evict_one(void)
+dbuf_evict_many(uint64_t bytes, unsigned int idx)
 {
-	int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
+	int64_t evicted = 0;
+	dmu_buf_impl_t *marker = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
+	marker->db_objset = NULL;
+
+	ASSERT3U(idx, <, multilist_get_num_sublists(
+	    &dbuf_caches[DB_DBUF_CACHE].cache));
+
 	multilist_sublist_t *mls = multilist_sublist_lock_idx(
 	    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
 	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
-	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
-		db = multilist_sublist_prev(mls, db);
-	}
+	multilist_sublist_insert_after(mls, db, marker);
+
+	while (db != NULL && evicted < bytes) {
+		int skip = 0;
+		while (db != NULL && (db->db_objset == NULL ||
+		    mutex_tryenter(&db->db_mtx) == 0)) {
+			db = multilist_sublist_prev(mls, db);
+			if (skip == 0)
+				skip = 1;
+		}
 
-	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
-	    multilist_sublist_t *, mls);
+		if (db == NULL)
+			break;
+
+		if (skip) {
+			multilist_sublist_remove(mls, marker);
+			multilist_sublist_insert_before(mls, db, marker);
+		}
+
+		DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+		    multilist_sublist_t *, mls);
 
-	if (db != NULL) {
 		multilist_sublist_remove(mls, db);
 		multilist_sublist_unlock(mls);
 		uint64_t size = db->db.db_size;
@@ -797,9 +833,121 @@
 		db->db_caching_status = DB_NO_CACHE;
 		dbuf_destroy(db);
 		DBUF_STAT_BUMP(cache_total_evicts);
-	} else {
-		multilist_sublist_unlock(mls);
+		evicted += size + usize;
+
+		mls = multilist_sublist_lock_idx(
+		    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
+		db = multilist_sublist_prev(mls, marker);
 	}
+
+	multilist_sublist_remove(mls, marker);
+	multilist_sublist_unlock(mls);
+	kmem_cache_free(dbuf_kmem_cache, marker);
+}
+
+typedef struct evict_arg {
+	taskq_ent_t tqe;
+	unsigned idx;
+	uint64_t bytes;
+} evict_arg_t;
+
+static void
+dbuf_evict_task(void *arg)
+{
+	evict_arg_t *eva = arg;
+	dbuf_evict_many(eva->bytes, eva->idx);
+}
+
+static void
+dbuf_evict(void)
+{
+	int64_t bytes = (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) -
+	    dbuf_cache_lowater_bytes());
+
+	if (bytes <= 0)
+		return;
+
+	unsigned idx = multilist_get_random_index(
+	    &dbuf_caches[DB_DBUF_CACHE].cache);
+
+	if (!dbuf_evict_parallel)
+		return (dbuf_evict_many(bytes, idx));
+
+	/*
+	 * Go to the parallel eviction.
+	 */
+	unsigned int num_sublists = multilist_get_num_sublists(
+	    &dbuf_caches[DB_DBUF_CACHE].cache);
+	evict_arg_t *evarg = kmem_zalloc(sizeof (*evarg) * num_sublists,
+	    KM_SLEEP);
+	/*
+	 * How we scale
+	 *
+	 * Example 1, # of chunks less than # of tasks.
+	 * We have:
+	 * - 4 tasks
+	 * - 3 chunks
+	 * - 3 full col
+	 * - 0 low cols.
+	 *
+	 * The first low col index is 3.
+	 * The tasks #0-#2 evict 1 chunk each.
+	 *
+	 *   0 | 1 | 2 | 3 |
+	 * +===+===+===+===+
+	 * | x | x | x |   |
+	 * +---+---+---+---+
+	 *
+	 * Example 2, # of chunks more than # of tasks.
+	 * We have:
+	 * - 4 tasks
+	 * - 9 chunks
+	 * - 1 full col
+	 * - 3 low cols
+	 *
+	 * The first low col index is 1.
+	 * The task #0 evicts 3 chunks, the others evict 2 chunks each.
+	 *
+	 *   0 | 1 | 2 | 3 |
+	 * +===+===+===+===+
+	 * | x | x | x | x |
+	 * +---+---+---+---+
+	 * | x | x | x | x |
+	 * +---+---+---+---+
+	 * | x |   |   |   |
+	 * +---+---+---+---+
+	 */
+
+	/*
+	 * Compute number of tasks to run (n), first low col index (k),
+	 * normal and low bytes per task.
+	 */
+	uint64_t nchunks = ((bytes - 1) >> DBUF_MIN_EVICT_PERTASK_SHIFT) + 1;
+	unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
+	uint64_t fullrows = nchunks / n;
+	unsigned lastrowcols = nchunks % n;
+	unsigned k = (lastrowcols ? lastrowcols : n);
+
+	uint64_t bytes_pertask_low = fullrows << DBUF_MIN_EVICT_PERTASK_SHIFT;
+	uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
+	    (1 << DBUF_MIN_EVICT_PERTASK_SHIFT) : 0);
+
+	for (unsigned i = 0; i < n; i++) {
+		uint64_t evict = i < k ? bytes_pertask : bytes_pertask_low;
+
+		evarg[i].idx = idx;
+		evarg[i].bytes = evict;
+
+		taskq_dispatch_ent(dbuf_evict_taskq, dbuf_evict_task,
+		    &evarg[i], 0, &evarg[i].tqe);
+
+		/* wrap idx */
+		if (++idx >= num_sublists)
+			idx = 0;
+	}
+
+	taskq_wait(dbuf_evict_taskq);
+	kmem_free(evarg, sizeof (*evarg) * num_sublists);
 }
 
 /*
@@ -833,7 +981,7 @@ dbuf_evict_thread(void *unused)
 		 * minimize lock contention.
 		 */
 		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
-			dbuf_evict_one();
+			dbuf_evict();
 		}
 
 		mutex_enter(&dbuf_evict_lock);
@@ -860,7 +1008,7 @@ dbuf_evict_notify(uint64_t size)
 	 */
 	if (size > dbuf_cache_target_bytes()) {
 		if (size > dbuf_cache_hiwater_bytes())
-			dbuf_evict_one();
+			dbuf_evict();
 		cv_signal(&dbuf_evict_cv);
 	}
 }
@@ -965,11 +1113,16 @@ dbuf_init(void)
 
 	dbuf_stats_init(h);
 
+	if (dbuf_evict_threads == 0)
+		dbuf_evict_threads = MAX(2, MIN(16, max_ncpus >> 3));
 	/*
	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
	 * configuration is not required.
	 */
 	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
+	dbuf_evict_taskq = taskq_create("dbuf_evict",
+	    MIN(dbuf_evict_threads, max_ncpus), defclsyspri,
+	    MIN(dbuf_evict_threads, max_ncpus), max_ncpus, TASKQ_PREPOPULATE);
 
 	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
 		multilist_create(&dbuf_caches[dcs].cache,
@@ -1035,6 +1188,8 @@ dbuf_fini(void)
 
 	kmem_cache_destroy(dbuf_kmem_cache);
 	taskq_destroy(dbu_evict_taskq);
+	taskq_wait(dbuf_evict_taskq);
+	taskq_destroy(dbuf_evict_taskq);
 
 	mutex_enter(&dbuf_evict_lock);
 	dbuf_evict_thread_exit = B_TRUE;
@@ -3963,7 +4118,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag)
  * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
  *      ^                                                |
  *      |                                                |
- *      +-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *      +-----dbuf_destroy()<--dbuf_evict()<------------+
  *
  */
 void
@@ -5282,3 +5437,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
	"Set size of dbuf cache mutex array as log2 shift.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_parallel, UINT, ZMOD_RW,
+	"Evict from the dbuf cache in parallel using a taskq");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_threads, UINT, ZMOD_RW,
+	"Maximum number of dbuf_evict threads");
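
A second illustrative sketch (not part of the patch): the work-splitting
arithmetic from dbuf_evict(), lifted into a standalone program so the two
examples in the "How we scale" comment can be reproduced by hand.
EVICT_PERTASK_SHIFT is a stand-in for DBUF_MIN_EVICT_PERTASK_SHIFT and
assumes SPA_MAXBLOCKSHIFT of 24, i.e. 16 MiB chunks.

#include <stdio.h>
#include <stdint.h>

#define	EVICT_PERTASK_SHIFT	24	/* one chunk = 16 MiB */

static void
split(uint64_t bytes, unsigned num_sublists)
{
	/* Same arithmetic as dbuf_evict() in the patch. */
	uint64_t nchunks = ((bytes - 1) >> EVICT_PERTASK_SHIFT) + 1;
	unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
	uint64_t fullrows = nchunks / n;
	unsigned lastrowcols = nchunks % n;
	unsigned k = (lastrowcols ? lastrowcols : n);

	uint64_t bytes_pertask_low = fullrows << EVICT_PERTASK_SHIFT;
	uint64_t bytes_pertask = bytes_pertask_low +
	    (lastrowcols ? (1 << EVICT_PERTASK_SHIFT) : 0);

	printf("%llu bytes over %u sublists: %u tasks, tasks 0..%u evict "
	    "%llu bytes each, the rest evict %llu bytes each\n",
	    (unsigned long long)bytes, num_sublists, n, k - 1,
	    (unsigned long long)bytes_pertask,
	    (unsigned long long)bytes_pertask_low);
}

int
main(void)
{
	/* Example 1 from the comment: 3 chunks spread over 4 tasks. */
	split(3ULL << EVICT_PERTASK_SHIFT, 4);
	/* Example 2 from the comment: 9 chunks spread over 4 tasks. */
	split(9ULL << EVICT_PERTASK_SHIFT, 4);
	return (0);
}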