diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 20bb95c1aeea..1906ec159c47 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -16,7 +16,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd June 27, 2024
+.Dd August 28, 2024
 .Dt ZFS 4
 .Os
 .
@@ -73,6 +73,16 @@ When set to
 .Sy 0
 the array is dynamically sized based on total system memory.
 .
+.It Sy dbuf_evict_parallel Ns = Ns Sy 0 Pq uint
+When set to 1, ZFS will use up to
+.Sy dbuf_evict_threads
+threads to evict dbuf data in parallel, improving the responsiveness
+of ZFS to memory pressure.
+.
+.It Sy dbuf_evict_threads Ns = Ns Sy 0 Pq uint
+Sets the maximum number of dbuf eviction threads to be used.
+When set to 0, ZFS uses half the available CPUs or 16, whichever is less.
+.
 .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint
 dnode slots allocated in a single operation as a power of 2.
 The default value minimizes lock contention for the bulk operation performed.
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 099883ba2652..cda31cac5aef 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
  */
 static kmem_cache_t *dbuf_kmem_cache;
 static taskq_t *dbu_evict_taskq;
+static taskq_t *dbuf_evict_taskq;
 
 static kthread_t *dbuf_cache_evict_thread;
 static kmutex_t dbuf_evict_lock;
@@ -237,6 +238,20 @@ static uint_t dbuf_metadata_cache_shift = 6;
 
 /* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
 static uint_t dbuf_mutex_cache_shift = 0;
 
+/*
+ * Number of dbuf_evict threads
+ */
+static uint_t dbuf_evict_threads = 0;
+
+/*
+ * The minimum number of bytes we can evict at once is a block size,
+ * so SPA_MAXBLOCKSIZE is a reasonable minimum value per eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define	DBUF_MIN_EVICT_PERTASK_SHIFT	(SPA_MAXBLOCKSHIFT)
+
+static uint_t dbuf_evict_parallel = 0;
+
 static unsigned long dbuf_cache_target_bytes(void);
 static unsigned long dbuf_metadata_cache_target_bytes(void);
 
@@ -762,26 +777,47 @@ dbuf_cache_above_lowater(void)
 }
 
 /*
- * Evict the oldest eligible dbuf from the dbuf cache.
+ * Evict the oldest eligible dbufs from the dbuf cache.
+ * Use the multilist sublist (mls) with the provided index #idx.
  */
 static void
-dbuf_evict_one(void)
+dbuf_evict_many(uint64_t bytes, unsigned int idx)
 {
-	int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
+	int64_t evicted = 0;
+	dmu_buf_impl_t *marker = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
+	marker->db_objset = NULL;
+
+	ASSERT3U(idx, <, multilist_get_num_sublists(
+	    &dbuf_caches[DB_DBUF_CACHE].cache));
+
 	multilist_sublist_t *mls = multilist_sublist_lock_idx(
 	    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
 	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
-	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
-		db = multilist_sublist_prev(mls, db);
-	}
+	multilist_sublist_insert_after(mls, db, marker);
+
+	while (db != NULL && evicted < bytes) {
+		int skip = 0;
+		while (db != NULL && (db->db_objset == NULL ||
+		    mutex_tryenter(&db->db_mtx) == 0)) {
+			db = multilist_sublist_prev(mls, db);
+			if (skip == 0)
+				skip = 1;
+		}
 
-	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
-	    multilist_sublist_t *, mls);
+		if (db == NULL)
+			break;
+
+		if (skip) {
+			multilist_sublist_remove(mls, marker);
+			multilist_sublist_insert_before(mls, db, marker);
+		}
+
+		DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+		    multilist_sublist_t *, mls);
 
-	if (db != NULL) {
 		multilist_sublist_remove(mls, db);
 		multilist_sublist_unlock(mls);
 		uint64_t size = db->db.db_size;
@@ -797,9 +833,121 @@ dbuf_evict_one(void)
 		db->db_caching_status = DB_NO_CACHE;
 		dbuf_destroy(db);
 		DBUF_STAT_BUMP(cache_total_evicts);
-	} else {
-		multilist_sublist_unlock(mls);
+		evicted += size + usize;
+
+		mls = multilist_sublist_lock_idx(
+		    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
+		db = multilist_sublist_prev(mls, marker);
 	}
+
+	multilist_sublist_remove(mls, marker);
+	multilist_sublist_unlock(mls);
+	kmem_cache_free(dbuf_kmem_cache, marker);
+}
+
+typedef struct evict_arg {
+	taskq_ent_t tqe;
+	unsigned idx;
+	uint64_t bytes;
+} evict_arg_t;
+
+static void
+dbuf_evict_task(void *arg)
+{
+	evict_arg_t *eva = arg;
+	dbuf_evict_many(eva->bytes, eva->idx);
+}
+
+static void
+dbuf_evict(void)
+{
+	int64_t bytes = (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) -
+	    dbuf_cache_lowater_bytes());
+
+	if (bytes <= 0)
+		return;
+
+	unsigned idx = multilist_get_random_index(
+	    &dbuf_caches[DB_DBUF_CACHE].cache);
+
+	if (!dbuf_evict_parallel)
+		return (dbuf_evict_many(bytes, idx));
+
+	/*
+	 * Proceed with parallel eviction.
+	 */
+	unsigned int num_sublists = multilist_get_num_sublists(
+	    &dbuf_caches[DB_DBUF_CACHE].cache);
+	evict_arg_t *evarg = kmem_zalloc(sizeof (*evarg) * num_sublists,
+	    KM_SLEEP);
+	/*
+	 * How we scale
+	 *
+	 * Example 1, # of chunks less than # of tasks.
+	 * We have:
+	 * - 4 tasks
+	 * - 3 chunks
+	 * - 3 full cols
+	 * - 0 low cols
+	 *
+	 * The first low col index is 3.
+	 * Tasks #0-#2 evict 1 chunk each.
+	 *
+	 *   0 | 1 | 2 | 3 |
+	 * +===+===+===+===+
+	 * | x | x | x |   |
+	 * +---+---+---+---+
+	 *
+	 * Example 2, # of chunks more than # of tasks.
+	 * We have:
+	 * - 4 tasks
+	 * - 9 chunks
+	 * - 1 full col
+	 * - 3 low cols
+	 *
+	 * The first low col index is 1.
+	 * Task #0 evicts 3 chunks; the others evict 2 chunks each.
+	 *
+	 *   0 | 1 | 2 | 3 |
+	 * +===+===+===+===+
+	 * | x | x | x | x |
+	 * +---+---+---+---+
+	 * | x | x | x | x |
+	 * +---+---+---+---+
+	 * | x |   |   |   |
+	 * +---+---+---+---+
+	 */
+
+	/*
+	 * Compute number of tasks to run (n), first low col index (k),
+	 * normal and low bytes per task.
+	 */
+	uint64_t nchunks = ((bytes - 1) >> DBUF_MIN_EVICT_PERTASK_SHIFT) + 1;
+	unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
+	uint64_t fullrows = nchunks / n;
+	unsigned lastrowcols = nchunks % n;
+	unsigned k = (lastrowcols ? lastrowcols : n);
+
+	uint64_t bytes_pertask_low = fullrows << DBUF_MIN_EVICT_PERTASK_SHIFT;
+	uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
+	    (1 << DBUF_MIN_EVICT_PERTASK_SHIFT) : 0);
+
+	for (unsigned i = 0; i < n; i++) {
+		uint64_t evict = i < k ? bytes_pertask : bytes_pertask_low;
+
+		evarg[i].idx = idx;
+		evarg[i].bytes = evict;
+
+		taskq_dispatch_ent(dbuf_evict_taskq, dbuf_evict_task,
+		    &evarg[i], 0, &evarg[i].tqe);
+
+		/* wrap idx */
+		if (++idx >= num_sublists)
+			idx = 0;
+	}
+
+	taskq_wait(dbuf_evict_taskq);
+	kmem_free(evarg, sizeof (*evarg) * num_sublists);
 }
 
 /*
@@ -833,7 +981,7 @@ dbuf_evict_thread(void *unused)
 		 * minimize lock contention.
 		 */
 		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
-			dbuf_evict_one();
+			dbuf_evict();
 		}
 
 		mutex_enter(&dbuf_evict_lock);
@@ -860,7 +1008,7 @@ dbuf_evict_notify(uint64_t size)
 	 */
 	if (size > dbuf_cache_target_bytes()) {
 		if (size > dbuf_cache_hiwater_bytes())
-			dbuf_evict_one();
+			dbuf_evict();
 		cv_signal(&dbuf_evict_cv);
 	}
 }
@@ -965,11 +1113,16 @@ dbuf_init(void)
 
 	dbuf_stats_init(h);
 
+	if (dbuf_evict_threads == 0)
+		dbuf_evict_threads = MIN(16, max_ncpus >> 1);
 	/*
 	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
 	 * configuration is not required.
 	 */
 	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
+	dbuf_evict_taskq = taskq_create("dbuf_evict",
+	    MIN(dbuf_evict_threads, max_ncpus), defclsyspri,
+	    MIN(dbuf_evict_threads, max_ncpus), max_ncpus, TASKQ_PREPOPULATE);
 
 	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
 		multilist_create(&dbuf_caches[dcs].cache,
@@ -1035,6 +1188,8 @@ dbuf_fini(void)
 
 	kmem_cache_destroy(dbuf_kmem_cache);
 	taskq_destroy(dbu_evict_taskq);
+	taskq_wait(dbuf_evict_taskq);
+	taskq_destroy(dbuf_evict_taskq);
 
 	mutex_enter(&dbuf_evict_lock);
 	dbuf_evict_thread_exit = B_TRUE;
@@ -3963,7 +4118,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag)
  * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
  *      ^                                               |
  *      |                                               |
- *      +-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *      +-----dbuf_destroy()<--dbuf_evict()<------------+
  *
 */
 void
@@ -5282,3 +5437,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
	"Set size of dbuf cache mutex array as log2 shift.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_parallel, UINT, ZMOD_RW,
+	"Evict from the dbuf cache in parallel using a taskq");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_threads, UINT, ZMOD_RW,
+	"Maximum number of dbuf_evict threads");
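
Reviewer note, not part of the patch: the chunk-splitting arithmetic in dbuf_evict() can be sanity-checked in isolation. The sketch below is a minimal user-space program (assumptions: a hard-coded shift of 24 standing in for SPA_MAXBLOCKSHIFT, and the hypothetical helper split() introduced only for this illustration) that reproduces the nchunks/fullrows/lastrowcols/k computation and prints the per-task byte split for the two cases in the "How we scale" comment.

/*
 * Standalone check of the per-task split used by dbuf_evict().
 * The shift of 24 (16 MiB) stands in for SPA_MAXBLOCKSHIFT.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define	MIN_EVICT_PERTASK_SHIFT	24

static void
split(uint64_t bytes, unsigned num_sublists)
{
	/* Same computation as in dbuf_evict() above. */
	uint64_t nchunks = ((bytes - 1) >> MIN_EVICT_PERTASK_SHIFT) + 1;
	unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
	uint64_t fullrows = nchunks / n;
	unsigned lastrowcols = nchunks % n;
	unsigned k = (lastrowcols ? lastrowcols : n);

	uint64_t bytes_pertask_low = fullrows << MIN_EVICT_PERTASK_SHIFT;
	uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
	    (1ULL << MIN_EVICT_PERTASK_SHIFT) : 0);

	printf("%" PRIu64 " bytes over %u task(s): %u x %" PRIu64
	    " bytes, %u x %" PRIu64 " bytes\n",
	    bytes, n, k, bytes_pertask, n - k, bytes_pertask_low);
}

int
main(void)
{
	/* Example 1 from the comment: 3 chunks, 4 sublists. */
	split(3ULL << MIN_EVICT_PERTASK_SHIFT, 4);
	/* Example 2 from the comment: 9 chunks, 4 sublists. */
	split(9ULL << MIN_EVICT_PERTASK_SHIFT, 4);
	return (0);
}

Compiled with any stock C compiler, this should report three tasks of one 16 MiB chunk each for Example 1, and one three-chunk task plus three two-chunk tasks for Example 2, matching the diagrams in the comment.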