diff options
32 files changed, 341 insertions, 247 deletions
diff --git a/sys/contrib/openzfs/cmd/arc_summary b/sys/contrib/openzfs/cmd/arc_summary index 426e0207052d..9c69ec4f8ccc 100755 --- a/sys/contrib/openzfs/cmd/arc_summary +++ b/sys/contrib/openzfs/cmd/arc_summary @@ -711,7 +711,7 @@ def section_archits(kstats_dict): pd_total = int(arc_stats['prefetch_data_hits']) +\ int(arc_stats['prefetch_data_iohits']) +\ int(arc_stats['prefetch_data_misses']) - prt_2('ARC prefetch metadata accesses:', f_perc(pd_total, all_accesses), + prt_2('ARC prefetch data accesses:', f_perc(pd_total, all_accesses), f_hits(pd_total)) pd_todo = (('Prefetch data hits:', arc_stats['prefetch_data_hits']), ('Prefetch data I/O hits:', arc_stats['prefetch_data_iohits']), diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index b39a0e8825fb..3c282f3fc975 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -8716,8 +8716,6 @@ zdb_read_block(char *thing, spa_t *spa) BP_SET_CHECKSUM(bp, ck); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - czio->io_bp = bp; - if (vd == vd->vdev_top) { zio_nowait(zio_read(czio, spa, bp, pabd, psize, NULL, NULL, @@ -8736,7 +8734,8 @@ zdb_read_block(char *thing, spa_t *spa) } error = zio_wait(czio); if (error == 0 || error == ECKSUM) { - zio_t *ck_zio = zio_root(spa, NULL, NULL, 0); + zio_t *ck_zio = zio_null(NULL, spa, NULL, + NULL, NULL, 0); ck_zio->io_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); ck_zio->io_bp = bp; diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h index e757d12c1502..8cfe56c75309 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h @@ -64,6 +64,7 @@ typedef enum { } while (0) #define mutex_destroy(lock) sx_destroy(lock) #define mutex_enter(lock) sx_xlock(lock) +#define mutex_enter_interruptible(lock) sx_xlock_sig(lock) #define mutex_enter_nested(lock, type) sx_xlock(lock) #define mutex_tryenter(lock) sx_try_xlock(lock) #define mutex_exit(lock) sx_xunlock(lock) diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h b/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h index 6b61c59c48e2..b4eaa0266d20 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h @@ -128,7 +128,6 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ #define NESTED_SINGLE 1 -#ifdef CONFIG_DEBUG_LOCK_ALLOC #define mutex_enter_nested(mp, subclass) \ { \ ASSERT3P(mutex_owner(mp), !=, current); \ @@ -137,16 +136,22 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ spl_mutex_lockdep_on_maybe(mp); \ spl_mutex_set_owner(mp); \ } -#else /* CONFIG_DEBUG_LOCK_ALLOC */ -#define mutex_enter_nested(mp, subclass) \ -{ \ + +#define mutex_enter_interruptible(mp) \ +/* CSTYLED */ \ +({ \ + int _rc_; \ + \ ASSERT3P(mutex_owner(mp), !=, current); \ spl_mutex_lockdep_off_maybe(mp); \ - mutex_lock(MUTEX(mp)); \ + _rc_ = mutex_lock_interruptible(MUTEX(mp)); \ spl_mutex_lockdep_on_maybe(mp); \ - spl_mutex_set_owner(mp); \ -} -#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + if (!_rc_) { \ + spl_mutex_set_owner(mp); \ + } \ + \ + _rc_; \ +}) #define mutex_enter(mp) mutex_enter_nested((mp), 0) diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h index cce097e16fbc..a4b600004c9f 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h @@ -73,13 +73,6 @@ typedef struct zfs_uio { size_t uio_skip; struct request *rq; - - /* - * Used for saving rq_for_each_segment() state between calls - * to zfs_uiomove_bvec_rq(). - */ - struct req_iterator iter; - struct bio_vec bv; } zfs_uio_t; @@ -138,7 +131,6 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) } else { uio->uio_bvec = NULL; uio->uio_iovcnt = 0; - memset(&uio->iter, 0, sizeof (uio->iter)); } uio->uio_loffset = io_offset(bio, rq); diff --git a/sys/contrib/openzfs/include/sys/dmu_objset.h b/sys/contrib/openzfs/include/sys/dmu_objset.h index 9f6e0fdd601b..a9123e862af7 100644 --- a/sys/contrib/openzfs/include/sys/dmu_objset.h +++ b/sys/contrib/openzfs/include/sys/dmu_objset.h @@ -132,6 +132,7 @@ struct objset { zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; + zfs_prefetch_type_t os_prefetch; zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; uint64_t os_recordsize; diff --git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h index bc940e8a7929..9d7eca9784b6 100644 --- a/sys/contrib/openzfs/include/sys/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fs/zfs.h @@ -191,6 +191,7 @@ typedef enum { ZFS_PROP_REDACTED, ZFS_PROP_REDACT_SNAPS, ZFS_PROP_SNAPSHOTS_CHANGED, + ZFS_PROP_PREFETCH, ZFS_NUM_PROPS } zfs_prop_t; @@ -543,6 +544,12 @@ typedef enum zfs_key_location { ZFS_KEYLOCATION_LOCATIONS } zfs_keylocation_t; +typedef enum { + ZFS_PREFETCH_NONE = 0, + ZFS_PREFETCH_METADATA = 1, + ZFS_PREFETCH_ALL = 2 +} zfs_prefetch_type_t; + #define DEFAULT_PBKDF2_ITERATIONS 350000 #define MIN_PBKDF2_ITERATIONS 100000 diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index 18062d3f2a95..88ef510b744b 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -837,7 +837,7 @@ extern kmutex_t spa_namespace_lock; extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); extern void spa_config_load(void); -extern nvlist_t *spa_all_configs(uint64_t *); +extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h index 6a337b49edf3..750ca612b962 100644 --- a/sys/contrib/openzfs/include/sys/zfs_context.h +++ b/sys/contrib/openzfs/include/sys/zfs_context.h @@ -274,11 +274,13 @@ typedef struct kmutex { extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); extern void mutex_destroy(kmutex_t *mp); extern void mutex_enter(kmutex_t *mp); +extern int mutex_enter_check_return(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); #define NESTED_SINGLE 1 #define mutex_enter_nested(mp, class) mutex_enter(mp) +#define mutex_enter_interruptible(mp) mutex_enter_check_return(mp) /* * RW locks */ diff --git a/sys/contrib/openzfs/include/sys/zil_impl.h b/sys/contrib/openzfs/include/sys/zil_impl.h index f780ad3d61bc..c9db6d428ea2 100644 --- a/sys/contrib/openzfs/include/sys/zil_impl.h +++ b/sys/contrib/openzfs/include/sys/zil_impl.h @@ -181,6 +181,7 @@ typedef struct zil_vdev_node { avl_node_t zv_node; /* AVL tree linkage */ } zil_vdev_node_t; +#define ZIL_BURSTS 8 #define ZIL_PREV_BLKS 16 /* @@ -222,8 +223,9 @@ struct zilog { clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ zil_header_t zl_old_header; /* debugging aid */ - uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_parallel; /* workload is multi-threaded */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ + uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ diff --git a/sys/contrib/openzfs/include/sys/zio_impl.h b/sys/contrib/openzfs/include/sys/zio_impl.h index 29a05986cd4f..febe0a87b428 100644 --- a/sys/contrib/openzfs/include/sys/zio_impl.h +++ b/sys/contrib/openzfs/include/sys/zio_impl.h @@ -159,6 +159,9 @@ enum zio_stage { ZIO_STAGE_DONE = 1 << 25 /* RWFCI */ }; +#define ZIO_ROOT_PIPELINE \ + ZIO_STAGE_DONE + #define ZIO_INTERLOCK_STAGES \ (ZIO_STAGE_READY | \ ZIO_STAGE_DONE) diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi index c1ce3d0f67d8..083c27be7d31 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs.abi +++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi @@ -1867,7 +1867,8 @@ <enumerator name='ZFS_PROP_REDACTED' value='93'/> <enumerator name='ZFS_PROP_REDACT_SNAPS' value='94'/> <enumerator name='ZFS_PROP_SNAPSHOTS_CHANGED' value='95'/> - <enumerator name='ZFS_NUM_PROPS' value='96'/> + <enumerator name='ZFS_PROP_PREFETCH' value='96'/> + <enumerator name='ZFS_NUM_PROPS' value='97'/> </enum-decl> <typedef-decl name='zfs_prop_t' type-id='4b000d60' id='58603c44'/> <enum-decl name='zprop_source_t' naming-typedef-id='a2256d42' id='5903f80e'> diff --git a/sys/contrib/openzfs/lib/libzpool/kernel.c b/sys/contrib/openzfs/lib/libzpool/kernel.c index a9b9bf4c2ce5..ffad7fc02bc9 100644 --- a/sys/contrib/openzfs/lib/libzpool/kernel.c +++ b/sys/contrib/openzfs/lib/libzpool/kernel.c @@ -206,6 +206,15 @@ mutex_enter(kmutex_t *mp) } int +mutex_enter_check_return(kmutex_t *mp) +{ + int error = pthread_mutex_lock(&mp->m_lock); + if (error == 0) + mp->m_owner = pthread_self(); + return (error); +} + +int mutex_tryenter(kmutex_t *mp) { int error = pthread_mutex_trylock(&mp->m_lock); diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index 5f89f6adf1e3..ddad00be412d 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -644,10 +644,7 @@ Max size of ARC in bytes. If .Sy 0 , then the max size of ARC is determined by the amount of system memory installed. -Under Linux, half of system memory will be used as the limit. -Under -.Fx , -the larger of +The larger of .Sy all_system_memory No \- Sy 1 GiB and .Sy 5/8 No \(mu Sy all_system_memory @@ -798,7 +795,7 @@ Note that this should not be set below the ZED thresholds (currently 10 checksums over 10 seconds) or else the daemon may not trigger any action. . -.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint +.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint This controls the amount of time that a ZIL block (lwb) will remain "open" when it isn't "full", and it has a thread waiting for it to be committed to stable storage. @@ -2155,13 +2152,6 @@ This sets the maximum number of write bytes logged via WR_COPIED. It tunes a tradeoff between additional memory copy and possibly worse log space efficiency vs additional range lock/unlock. . -.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64 -This sets the minimum delay in nanoseconds ZIL care to delay block commit, -waiting for more records. -If ZIL writes are too fast, kernel may not be able sleep for so short interval, -increasing log latency above allowed by -.Sy zfs_commit_timeout_pct . -. .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable the cache flush commands that are normally sent to disk by the ZIL after an LWB write has completed. @@ -2317,6 +2307,63 @@ If .Sy zvol_threads to the number of CPUs present or 32 (whichever is greater). . +.It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint +The number of threads per zvol to use for queuing IO requests. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read and assigned to a zvol at zvol load time. +If +.Sy 0 +(the default) then internally set +.Sy zvol_blk_mq_threads +to the number of CPUs present. +. +.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint +Set to +.Sy 1 +to use the +.Li blk-mq +API for zvols. +Set to +.Sy 0 +(the default) to use the legacy zvol APIs. +This setting can give better or worse zvol performance depending on +the workload. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read and assigned to a zvol at zvol load time. +. +.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint +If +.Sy zvol_use_blk_mq +is enabled, then process this number of +.Sy volblocksize Ns -sized blocks per zvol thread. +This tunable can be use to favor better performance for zvol reads (lower +values) or writes (higher values). +If set to +.Sy 0 , +then the zvol layer will process the maximum number of blocks +per thread that it can. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only applied at each zvol's load time. +. +.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint +The queue_depth value for the zvol +.Li blk-mq +interface. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only applied at each zvol's load time. +If +.Sy 0 +(the default) then use the kernel's default queue depth. +Values are clamped to the kernel's +.Dv BLKDEV_MIN_RQ +and +.Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ +limits. +. .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint Defines zvol block devices behaviour when .Sy volmode Ns = Ns Sy default : diff --git a/sys/contrib/openzfs/man/man7/zfsprops.7 b/sys/contrib/openzfs/man/man7/zfsprops.7 index e3674b1f8a8d..59f6404379af 100644 --- a/sys/contrib/openzfs/man/man7/zfsprops.7 +++ b/sys/contrib/openzfs/man/man7/zfsprops.7 @@ -1613,6 +1613,23 @@ If this property is set to then only metadata is cached. The default value is .Sy all . +.It Sy prefetch Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata +Controls what speculative prefetch does. +If this property is set to +.Sy all , +then both user data and metadata are prefetched. +If this property is set to +.Sy none , +then neither user data nor metadata are prefetched. +If this property is set to +.Sy metadata , +then only metadata are prefetched. +The default value is +.Sy all . +.Pp +Please note that the module parameter zfs_disable_prefetch=1 can +be used to totally disable speculative prefetch, bypassing anything +this property does. .It Sy setuid Ns = Ns Sy on Ns | Ns Sy off Controls whether the setuid bit is respected for the file system. The default value is diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c index 29a8802b8367..381563476daf 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -80,12 +80,18 @@ static struct notifier_block arc_hotplug_callback_mem_nb; /* * Return a default max arc size based on the amount of physical memory. + * This may be overridden by tuning the zfs_arc_max module parameter. */ uint64_t arc_default_max(uint64_t min, uint64_t allmem) { - /* Default to 1/2 of all memory. */ - return (MAX(allmem / 2, min)); + uint64_t size; + + if (allmem >= 1 << 30) + size = allmem - (1 << 30); + else + size = min; + return (MAX(allmem * 5 / 8, size)); } #ifdef _KERNEL diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c index 3efd4ab159c6..c2ed67c438c6 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c @@ -204,22 +204,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) this_seg_start = orig_loffset; rq_for_each_segment(bv, rq, iter) { - if (uio->iter.bio) { - /* - * If uio->iter.bio is present, then we know we've saved - * uio->iter from a previous call to this function, and - * we can skip ahead in this rq_for_each_segment() loop - * to where we last left off. That way, we don't need - * to iterate over tons of segments we've already - * processed - we can just restore the "saved state". - */ - iter = uio->iter; - bv = uio->bv; - this_seg_start = uio->uio_loffset; - memset(&uio->iter, 0, sizeof (uio->iter)); - continue; - } - /* * Lookup what the logical offset of the last byte of this * segment is. @@ -260,19 +244,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) copied = 1; /* We copied some data */ } - if (n == 0) { - /* - * All done copying. Save our 'iter' value to the uio. - * This allows us to "save our state" and skip ahead in - * the rq_for_each_segment() loop the next time we call - * call zfs_uiomove_bvec_rq() on this uio (which we - * will be doing for any remaining data in the uio). - */ - uio->iter = iter; /* make a copy of the struct data */ - uio->bv = bv; - return (0); - } - this_seg_start = this_seg_end + 1; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index e145a8c876b4..f94ce69fb9e2 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -1626,6 +1626,18 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); +#ifdef HAVE_BLK_MQ +module_param(zvol_blk_mq_queue_depth, uint, 0644); +MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); + +module_param(zvol_use_blk_mq, uint, 0644); +MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); + +module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); +MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, + "Process volblocksize blocks per thread"); +#endif + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS module_param(zvol_open_timeout_ms, uint, 0644); MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index 3db6fd13f4ae..29764674a31b 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -345,6 +345,13 @@ zfs_prop_init(void) { NULL } }; + static const zprop_index_t prefetch_table[] = { + { "none", ZFS_PREFETCH_NONE }, + { "metadata", ZFS_PREFETCH_METADATA }, + { "all", ZFS_PREFETCH_ALL }, + { NULL } + }; + static const zprop_index_t sync_table[] = { { "standard", ZFS_SYNC_STANDARD }, { "always", ZFS_SYNC_ALWAYS }, @@ -453,6 +460,10 @@ zfs_prop_init(void) ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "SECONDARYCACHE", cache_table, sfeatures); + zprop_register_index(ZFS_PROP_PREFETCH, "prefetch", + ZFS_PREFETCH_ALL, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, + "none | metadata | all", "PREFETCH", prefetch_table, sfeatures); zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "latency | throughput", "LOGBIAS", logbias_table, sfeatures); diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index 745ee8f02ed4..d982f201c930 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -802,13 +802,10 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - boolean_t gang = abd_is_gang(abd); abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { - /* If we are at the end of the gang ABD we are done */ - if (gang && !c_abd) - break; + IMPLY(abd_is_gang(abd), c_abd != NULL); abd_iter_map(&aiter); @@ -930,7 +927,6 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, { int ret = 0; struct abd_iter daiter, saiter; - boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; if (size == 0) @@ -942,16 +938,12 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); - dabd_is_gang_abd = abd_is_gang(dabd); - sabd_is_gang_abd = abd_is_gang(sabd); c_dabd = abd_init_abd_iter(dabd, &daiter, doff); c_sabd = abd_init_abd_iter(sabd, &saiter, soff); while (size > 0) { - /* if we are at the end of the gang ABD we are done */ - if ((dabd_is_gang_abd && !c_dabd) || - (sabd_is_gang_abd && !c_sabd)) - break; + IMPLY(abd_is_gang(dabd), c_dabd != NULL); + IMPLY(abd_is_gang(sabd), c_sabd != NULL); abd_iter_map(&daiter); abd_iter_map(&saiter); @@ -1032,66 +1024,40 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, int i; ssize_t len, dlen; struct abd_iter caiters[3]; - struct abd_iter daiter = {0}; + struct abd_iter daiter; void *caddrs[3]; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_dabd = NULL; - boolean_t cabds_is_gang_abd[3]; - boolean_t dabd_is_gang_abd = B_FALSE; ASSERT3U(parity, <=, 3); - for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); + abd_verify(cabds[i]); + ASSERT3U(csize, <=, cabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); } - if (dabd) { - dabd_is_gang_abd = abd_is_gang(dabd); + ASSERT3S(dsize, >=, 0); + if (dsize > 0) { + ASSERT(dabd); + abd_verify(dabd); + ASSERT3U(dsize, <=, dabd->abd_size); c_dabd = abd_init_abd_iter(dabd, &daiter, 0); } - ASSERT3S(dsize, >=, 0); - abd_enter_critical(flags); while (csize > 0) { - /* if we are at the end of the gang ABD we are done */ - if (dabd_is_gang_abd && !c_dabd) - break; - + len = csize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we are - * done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; + len = MIN(caiters[i].iter_mapsize, len); } - len = csize; - - if (dabd && dsize > 0) + if (dsize > 0) { + IMPLY(abd_is_gang(dabd), c_dabd != NULL); abd_iter_map(&daiter); - - switch (parity) { - case 3: - len = MIN(caiters[2].iter_mapsize, len); - zfs_fallthrough; - case 2: - len = MIN(caiters[1].iter_mapsize, len); - zfs_fallthrough; - case 1: - len = MIN(caiters[0].iter_mapsize, len); - } - - /* must be progressive */ - ASSERT3S(len, >, 0); - - if (dabd && dsize > 0) { - /* this needs precise iter.length */ len = MIN(daiter.iter_mapsize, len); dlen = len; } else @@ -1114,7 +1080,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, &caiters[i], len); } - if (dabd && dsize > 0) { + if (dsize > 0) { abd_iter_unmap(&daiter); c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, @@ -1153,16 +1119,16 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; unsigned long flags __maybe_unused = 0; - boolean_t cabds_is_gang_abd[3]; - boolean_t tabds_is_gang_abd[3]; abd_t *c_cabds[3]; abd_t *c_tabds[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); - tabds_is_gang_abd[i] = abd_is_gang(tabds[i]); + abd_verify(cabds[i]); + abd_verify(tabds[i]); + ASSERT3U(tsize, <=, cabds[i]->abd_size); + ASSERT3U(tsize, <=, tabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &citers[i], 0); c_tabds[i] = @@ -1171,36 +1137,18 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, abd_enter_critical(flags); while (tsize > 0) { - + len = tsize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we - * are done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; - if (tabds_is_gang_abd[i] && !c_tabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); + IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL); abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; xaddrs[i] = xiters[i].iter_mapaddr; + len = MIN(citers[i].iter_mapsize, len); + len = MIN(xiters[i].iter_mapsize, len); } - len = tsize; - switch (parity) { - case 3: - len = MIN(xiters[2].iter_mapsize, len); - len = MIN(citers[2].iter_mapsize, len); - zfs_fallthrough; - case 2: - len = MIN(xiters[1].iter_mapsize, len); - len = MIN(citers[1].iter_mapsize, len); - zfs_fallthrough; - case 1: - len = MIN(xiters[0].iter_mapsize, len); - len = MIN(citers[0].iter_mapsize, len); - } /* must be progressive */ ASSERT3S(len, >, 0); /* diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 5d4a52fa0693..06544925b5ca 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -5868,12 +5868,9 @@ top: * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. - * 5. This isn't prefetch or l2arc_noprefetch is 0. */ if (HDR_HAS_L2HDR(hdr) && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && - (*arc_flags & ARC_FLAG_PREFETCH))) { + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index d134d4958f7c..76e65b5506a9 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -264,6 +264,19 @@ secondary_cache_changed_cb(void *arg, uint64_t newval) } static void +prefetch_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval == ZFS_PREFETCH_ALL || newval == ZFS_PREFETCH_NONE || + newval == ZFS_PREFETCH_METADATA); + os->os_prefetch = newval; +} + +static void sync_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; @@ -562,6 +575,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_PREFETCH), + prefetch_changed_cb, os); + } if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, @@ -635,6 +653,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; os->os_dnodesize = DNODE_MIN_SIZE; + os->os_prefetch = ZFS_PREFETCH_ALL; } if (ds == NULL || !ds->ds_is_snapshot) diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index d0acaf502066..2b2d72671001 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -329,9 +329,14 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, { zstream_t *zs; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; + zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch; - if (zfs_prefetch_disable) + if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE) return (NULL); + + if (os_prefetch == ZFS_PREFETCH_METADATA) + fetch_data = B_FALSE; + /* * If we haven't yet loaded the indirect vdevs' mappings, we * can only read from blocks that we carefully ensure are on diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 0b69568e2aad..aa97144f16e4 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -101,6 +101,26 @@ #include "zfs_comutil.h" /* + * spa_thread() existed on Illumos as a parent thread for the various worker + * threads that actually run the pool, as a way to both reference the entire + * pool work as a single object, and to share properties like scheduling + * options. It has not yet been adapted to Linux or FreeBSD. This define is + * used to mark related parts of the code to make things easier for the reader, + * and to compile this code out. It can be removed when someone implements it, + * moves it to some Illumos-specific place, or removes it entirely. + */ +#undef HAVE_SPA_THREAD + +/* + * The "System Duty Cycle" scheduling class is an Illumos feature to help + * prevent CPU-intensive kernel threads from affecting latency on interactive + * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is + * gated behind a define. On Illumos SDC depends on spa_thread(), but + * spa_thread() also has other uses, so this is a separate define. + */ +#undef HAVE_SYSDC + +/* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ @@ -176,10 +196,15 @@ static uint_t metaslab_preload_pct = 50; static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ static uint_t zio_taskq_batch_tpq; /* threads per taskq */ + +#ifdef HAVE_SYSDC static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ +#endif +#ifdef HAVE_SPA_THREAD static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ +#endif /* * Report any spa_load_verify errors found, but do not fail spa_load. @@ -1029,7 +1054,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t cpus, flags = TASKQ_DYNAMIC; +#ifdef HAVE_SYSDC boolean_t batch = B_FALSE; +#endif switch (mode) { case ZTI_MODE_FIXED: @@ -1037,7 +1064,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) break; case ZTI_MODE_BATCH: +#ifdef HAVE_SYSDC batch = B_TRUE; +#endif flags |= TASKQ_THREADS_CPU_PCT; value = MIN(zio_taskq_batch_pct, 100); break; @@ -1106,6 +1135,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); +#ifdef HAVE_SYSDC if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; @@ -1114,6 +1144,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { +#endif pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU @@ -1139,7 +1170,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) } tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); +#ifdef HAVE_SYSDC } +#endif tqs->stqs_taskq[i] = tq; } @@ -1224,11 +1257,6 @@ spa_create_zio_taskqs(spa_t *spa) } } -/* - * Disabled until spa_thread() can be adapted for Linux. - */ -#undef HAVE_SPA_THREAD - #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) static void spa_thread(void *arg) @@ -1269,9 +1297,11 @@ spa_thread(void *arg) pool_unlock(); } +#ifdef HAVE_SYSDC if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } +#endif spa->spa_proc = curproc; spa->spa_did = curthread->t_did; @@ -1327,7 +1357,6 @@ spa_activate(spa_t *spa, spa_mode_t mode) ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; - (void) spa_create_process; #ifdef HAVE_SPA_THREAD /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c index 636c04d9f785..a77874ea0dd3 100644 --- a/sys/contrib/openzfs/module/zfs/spa_config.c +++ b/sys/contrib/openzfs/module/zfs/spa_config.c @@ -367,23 +367,24 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. */ -nvlist_t * -spa_all_configs(uint64_t *generation) +int +spa_all_configs(uint64_t *generation, nvlist_t **pools) { - nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) - return (NULL); + return (SET_ERROR(EEXIST)); - pools = fnvlist_alloc(); + int error = mutex_enter_interruptible(&spa_namespace_lock); + if (error) + return (SET_ERROR(EINTR)); - mutex_enter(&spa_namespace_lock); + *pools = fnvlist_alloc(); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); - fnvlist_add_nvlist(pools, spa_name(spa), + fnvlist_add_nvlist(*pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } @@ -391,7 +392,7 @@ spa_all_configs(uint64_t *generation) *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); - return (pools); + return (0); } void diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index f91a2f3bbca5..2738385e260b 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -1582,8 +1582,9 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) nvlist_t *configs; int error; - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (SET_ERROR(EEXIST)); + error = spa_all_configs(&zc->zc_cookie, &configs); + if (error) + return (error); error = put_nvlist(zc, configs); diff --git a/sys/contrib/openzfs/module/zfs/zfs_quota.c b/sys/contrib/openzfs/module/zfs/zfs_quota.c index 56f9d22ed0e5..9b351eefc04e 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_quota.c +++ b/sys/contrib/openzfs/module/zfs/zfs_quota.c @@ -347,32 +347,18 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, if (*objp == 0) { *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, DMU_OT_NONE, 0, tx); - VERIFY0(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); } + mutex_exit(&zfsvfs->z_lock); if (quota == 0) { err = zap_remove(zfsvfs->z_os, *objp, buf, tx); if (err == ENOENT) err = 0; - /* - * If the quota contains no more entries after the entry - * was removed, destroy the quota zap and remove the - * reference from zfsvfs. This will save us unnecessary - * zap_lookups for the quota during writes. - */ - uint64_t zap_nentries; - VERIFY0(zap_count(zfsvfs->z_os, *objp, &zap_nentries)); - if (zap_nentries == 0) { - VERIFY0(zap_remove(zfsvfs->z_os, MASTER_NODE_OBJ, - zfs_userquota_prop_prefixes[type], tx)); - VERIFY0(zap_destroy(zfsvfs->z_os, *objp, tx)); - *objp = 0; - } } else { err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); } - mutex_exit(&zfsvfs->z_lock); ASSERT(err == 0); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index a11886136994..ce2cb8b1446a 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -91,15 +91,7 @@ * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ -static uint_t zfs_commit_timeout_pct = 5; - -/* - * Minimal time we care to delay commit waiting for more ZIL records. - * At least FreeBSD kernel can't sleep for less than 2us at its best. - * So requests to sleep for less then 5us is a waste of CPU time with - * a risk of significant log latency increase due to oversleep. - */ -static uint64_t zil_min_commit_timeout = 5000; +static uint_t zfs_commit_timeout_pct = 10; /* * See zil.h for more information about these fields. @@ -2163,8 +2155,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); if (lwb->lwb_child_zio == NULL) { - lwb->lwb_child_zio = zio_root( - zilog->zl_spa, NULL, NULL, + lwb->lwb_child_zio = zio_null(NULL, + zilog->zl_spa, NULL, NULL, NULL, ZIO_FLAG_CANFAIL); } } @@ -2696,6 +2688,19 @@ zil_commit_writer_stall(zilog_t *zilog) ASSERT(list_is_empty(&zilog->zl_lwb_list)); } +static void +zil_burst_done(zilog_t *zilog) +{ + if (!list_is_empty(&zilog->zl_itx_commit_list) || + zilog->zl_cur_used == 0) + return; + + if (zilog->zl_parallel) + zilog->zl_parallel--; + + zilog->zl_cur_used = 0; +} + /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly @@ -2710,7 +2715,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; - boolean_t first = B_TRUE; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2736,9 +2740,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); - first = (lwb->lwb_state == LWB_STATE_NEW) && - ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || - plwb->lwb_state == LWB_STATE_FLUSH_DONE); + + /* + * If the lwb is still opened, it means the workload is really + * multi-threaded and we won the chance of write aggregation. + * If it is not opened yet, but previous lwb is still not + * flushed, it still means the workload is multi-threaded, but + * there was too much time between the commits to aggregate, so + * we try aggregation next times, but without too much hopes. + */ + if (lwb->lwb_state == LWB_STATE_OPENED) { + zilog->zl_parallel = ZIL_BURSTS; + } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) + != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) { + zilog->zl_parallel = MAX(zilog->zl_parallel, + ZIL_BURSTS / 2); + } } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { @@ -2813,7 +2830,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * Our lwb is done, leave the rest of * itx list to somebody else who care. */ - first = B_FALSE; + zilog->zl_parallel = ZIL_BURSTS; break; } } else { @@ -2905,28 +2922,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. - * - * If we had no already running or open LWBs, it can be - * the workload is single-threaded. And if the ZIL write - * latency is very small or if the LWB is almost full, it - * may be cheaper to bypass the delay. */ - if (lwb->lwb_state == LWB_STATE_OPENED && first) { - hrtime_t sleep = zilog->zl_last_lwb_latency * - zfs_commit_timeout_pct / 100; - if (sleep < zil_min_commit_timeout || - lwb->lwb_nmax - lwb->lwb_nused < - lwb->lwb_nmax / 8) { - list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, - LWB_STATE_NEW); - zilog->zl_cur_used = 0; - if (lwb == NULL) { - while ((lwb = list_remove_head(ilwbs)) - != NULL) - zil_lwb_write_issue(zilog, lwb); - zil_commit_writer_stall(zilog); - } + if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + zil_burst_done(zilog); + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + zil_commit_writer_stall(zilog); } } } @@ -3084,19 +3088,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - /* - * Since the lwb's zio hadn't been issued by the time this thread - * reached its timeout, we reset the zilog's "zl_cur_used" field - * to influence the zil block size selection algorithm. - * - * By having to issue the lwb's zio here, it means the size of the - * lwb was too large, given the incoming throughput of itxs. By - * setting "zl_cur_used" to zero, we communicate this fact to the - * block size selection algorithm, so it can take this information - * into account, and potentially select a smaller size for the - * next lwb block that is allocated. - */ - zilog->zl_cur_used = 0; + zil_burst_done(zilog); if (nlwb == NULL) { /* @@ -4214,9 +4206,6 @@ EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); -ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW, - "Minimum delay we care for ZIL block commit"); - ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 3b3b40fa73d8..3eb472a9fd2a 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -634,6 +634,11 @@ zio_add_child(zio_t *pio, zio_t *cio) */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + /* Parent should not have READY stage if child doesn't have it. */ + IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && + (cio->io_child_type != ZIO_CHILD_VDEV), + (pio->io_pipeline & ZIO_STAGE_READY) == 0); + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; @@ -665,6 +670,11 @@ zio_add_child_first(zio_t *pio, zio_t *cio) */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + /* Parent should not have READY stage if child doesn't have it. */ + IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && + (cio->io_child_type != ZIO_CHILD_VDEV), + (pio->io_pipeline & ZIO_STAGE_READY) == 0); + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; @@ -901,7 +911,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; - zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); + zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) || + (pipeline & ZIO_STAGE_READY) == 0; zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) @@ -932,6 +943,10 @@ zio_destroy(zio_t *zio) kmem_cache_free(zio_cache, zio); } +/* + * ZIO intended to be between others. Provides synchronization at READY + * and DONE pipeline stages and calls the respective callbacks. + */ zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, zio_flag_t flags) @@ -945,10 +960,22 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, return (zio); } +/* + * ZIO intended to be a root of a tree. Unlike null ZIO does not have a + * READY pipeline stage (is ready on creation), so it should not be used + * as child of any ZIO that may need waiting for grandchildren READY stage + * (any other ZIO type). + */ zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { - return (zio_null(NULL, spa, NULL, done, private, flags)); + zio_t *zio; + + zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE); + + return (zio); } static int @@ -2396,13 +2423,14 @@ static void zio_reexecute(void *arg) { zio_t *pio = arg; - zio_t *cio, *cio_next; + zio_t *cio, *cio_next, *gio; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); + mutex_enter(&pio->io_lock); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; @@ -2410,8 +2438,16 @@ zio_reexecute(void *arg) pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_state[w] = 0; + pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) || + (pio->io_pipeline & ZIO_STAGE_READY) == 0; + pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE); + zio_link_t *zl = NULL; + while ((gio = zio_walk_parents(pio, &zl)) != NULL) { + for (int w = 0; w < ZIO_WAIT_TYPES; w++) { + gio->io_children[pio->io_child_type][w] += + !pio->io_state[w]; + } + } for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; @@ -2425,12 +2461,9 @@ zio_reexecute(void *arg) * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ - zio_link_t *zl = NULL; - mutex_enter(&pio->io_lock); + zl = NULL; for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg index 8010a9451597..80e7bcb3bd09 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg @@ -89,7 +89,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED -VOL_USE_BLK_MQ UNSUPPORTED UNSUPPORTED +VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index e91cf5f22368..39caf1355771 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1110,7 +1110,7 @@ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.2.99-FreeBSD_g797f55ef1" +#define ZFS_META_ALIAS "zfs-2.2.99-FreeBSD_g043c6ee3b" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" @@ -1140,7 +1140,7 @@ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "FreeBSD_g797f55ef1" +#define ZFS_META_RELEASE "FreeBSD_g043c6ee3b" /* Define the project version. */ #define ZFS_META_VERSION "2.2.99" diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index e88a5e8dbb37..b4d500f67a90 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.2.99-162-g797f55ef1" +#define ZFS_META_GITREV "zfs-2.2.99-174-g043c6ee3b" |