aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorMartin Matuska <mm@FreeBSD.org>2023-10-30 20:28:03 +0000
committerMartin Matuska <mm@FreeBSD.org>2023-10-30 20:42:11 +0000
commit6c1e79df8c651b31ca4f656336e6c5ac29a66482 (patch)
tree751fa1ffefeabc6e248d538b8883cff639b40285 /sys
parent87c5032353106764f324e82541662f448e68f38a (diff)
parent043c6ee3b6bfb55f8d36e1f048ff13128c279fb8 (diff)
zfs: merge openzfs/zfs@043c6ee3b
Notable upstream pull request merges: #15360 97a0b5be Add mutex_enter_interruptible() for interruptible sleeping IOCTLs #15381 252f46be ZIL: Detect single-threaded workloads #15398 3afdc97d ZIO: Remove READY pipeline stage from root ZIOs #15428 e007908a ABD: Be more assertive in iterators #15436 07345ac2 Add prefetch property #15438 e9725abd Revert "Do not persist user/group/project quota zap objects when unneeded" #15451 043c6ee3 Read prefetched buffers from L2ARC Obtained from: OpenZFS OpenZFS commit: 043c6ee3b6bfb55f8d36e1f048ff13128c279fb8
Diffstat (limited to 'sys')
-rwxr-xr-xsys/contrib/openzfs/cmd/arc_summary2
-rw-r--r--sys/contrib/openzfs/cmd/zdb/zdb.c5
-rw-r--r--sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h1
-rw-r--r--sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h21
-rw-r--r--sys/contrib/openzfs/include/os/linux/spl/sys/uio.h8
-rw-r--r--sys/contrib/openzfs/include/sys/dmu_objset.h1
-rw-r--r--sys/contrib/openzfs/include/sys/fs/zfs.h7
-rw-r--r--sys/contrib/openzfs/include/sys/spa.h2
-rw-r--r--sys/contrib/openzfs/include/sys/zfs_context.h2
-rw-r--r--sys/contrib/openzfs/include/sys/zil_impl.h4
-rw-r--r--sys/contrib/openzfs/include/sys/zio_impl.h3
-rw-r--r--sys/contrib/openzfs/lib/libzfs/libzfs.abi3
-rw-r--r--sys/contrib/openzfs/lib/libzpool/kernel.c9
-rw-r--r--sys/contrib/openzfs/man/man4/zfs.471
-rw-r--r--sys/contrib/openzfs/man/man7/zfsprops.717
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/arc_os.c10
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c29
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c12
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_prop.c11
-rw-r--r--sys/contrib/openzfs/module/zfs/abd.c104
-rw-r--r--sys/contrib/openzfs/module/zfs/arc.c5
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_objset.c19
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_zfetch.c7
-rw-r--r--sys/contrib/openzfs/module/zfs/spa.c41
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_config.c17
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_ioctl.c5
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_quota.c18
-rw-r--r--sys/contrib/openzfs/module/zfs/zil.c95
-rw-r--r--sys/contrib/openzfs/module/zfs/zio.c51
-rw-r--r--sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg2
-rw-r--r--sys/modules/zfs/zfs_config.h4
-rw-r--r--sys/modules/zfs/zfs_gitrev.h2
32 files changed, 341 insertions, 247 deletions
diff --git a/sys/contrib/openzfs/cmd/arc_summary b/sys/contrib/openzfs/cmd/arc_summary
index 426e0207052d..9c69ec4f8ccc 100755
--- a/sys/contrib/openzfs/cmd/arc_summary
+++ b/sys/contrib/openzfs/cmd/arc_summary
@@ -711,7 +711,7 @@ def section_archits(kstats_dict):
pd_total = int(arc_stats['prefetch_data_hits']) +\
int(arc_stats['prefetch_data_iohits']) +\
int(arc_stats['prefetch_data_misses'])
- prt_2('ARC prefetch metadata accesses:', f_perc(pd_total, all_accesses),
+ prt_2('ARC prefetch data accesses:', f_perc(pd_total, all_accesses),
f_hits(pd_total))
pd_todo = (('Prefetch data hits:', arc_stats['prefetch_data_hits']),
('Prefetch data I/O hits:', arc_stats['prefetch_data_iohits']),
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c
index b39a0e8825fb..3c282f3fc975 100644
--- a/sys/contrib/openzfs/cmd/zdb/zdb.c
+++ b/sys/contrib/openzfs/cmd/zdb/zdb.c
@@ -8716,8 +8716,6 @@ zdb_read_block(char *thing, spa_t *spa)
BP_SET_CHECKSUM(bp, ck);
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- czio->io_bp = bp;
-
if (vd == vd->vdev_top) {
zio_nowait(zio_read(czio, spa, bp, pabd, psize,
NULL, NULL,
@@ -8736,7 +8734,8 @@ zdb_read_block(char *thing, spa_t *spa)
}
error = zio_wait(czio);
if (error == 0 || error == ECKSUM) {
- zio_t *ck_zio = zio_root(spa, NULL, NULL, 0);
+ zio_t *ck_zio = zio_null(NULL, spa, NULL,
+ NULL, NULL, 0);
ck_zio->io_offset =
DVA_GET_OFFSET(&bp->blk_dva[0]);
ck_zio->io_bp = bp;
diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h
index e757d12c1502..8cfe56c75309 100644
--- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h
+++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h
@@ -64,6 +64,7 @@ typedef enum {
} while (0)
#define mutex_destroy(lock) sx_destroy(lock)
#define mutex_enter(lock) sx_xlock(lock)
+#define mutex_enter_interruptible(lock) sx_xlock_sig(lock)
#define mutex_enter_nested(lock, type) sx_xlock(lock)
#define mutex_tryenter(lock) sx_try_xlock(lock)
#define mutex_exit(lock) sx_xunlock(lock)
diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h b/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h
index 6b61c59c48e2..b4eaa0266d20 100644
--- a/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h
+++ b/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h
@@ -128,7 +128,6 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \
#define NESTED_SINGLE 1
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define mutex_enter_nested(mp, subclass) \
{ \
ASSERT3P(mutex_owner(mp), !=, current); \
@@ -137,16 +136,22 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \
spl_mutex_lockdep_on_maybe(mp); \
spl_mutex_set_owner(mp); \
}
-#else /* CONFIG_DEBUG_LOCK_ALLOC */
-#define mutex_enter_nested(mp, subclass) \
-{ \
+
+#define mutex_enter_interruptible(mp) \
+/* CSTYLED */ \
+({ \
+ int _rc_; \
+ \
ASSERT3P(mutex_owner(mp), !=, current); \
spl_mutex_lockdep_off_maybe(mp); \
- mutex_lock(MUTEX(mp)); \
+ _rc_ = mutex_lock_interruptible(MUTEX(mp)); \
spl_mutex_lockdep_on_maybe(mp); \
- spl_mutex_set_owner(mp); \
-}
-#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+ if (!_rc_) { \
+ spl_mutex_set_owner(mp); \
+ } \
+ \
+ _rc_; \
+})
#define mutex_enter(mp) mutex_enter_nested((mp), 0)
diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h
index cce097e16fbc..a4b600004c9f 100644
--- a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h
+++ b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h
@@ -73,13 +73,6 @@ typedef struct zfs_uio {
size_t uio_skip;
struct request *rq;
-
- /*
- * Used for saving rq_for_each_segment() state between calls
- * to zfs_uiomove_bvec_rq().
- */
- struct req_iterator iter;
- struct bio_vec bv;
} zfs_uio_t;
@@ -138,7 +131,6 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq)
} else {
uio->uio_bvec = NULL;
uio->uio_iovcnt = 0;
- memset(&uio->iter, 0, sizeof (uio->iter));
}
uio->uio_loffset = io_offset(bio, rq);
diff --git a/sys/contrib/openzfs/include/sys/dmu_objset.h b/sys/contrib/openzfs/include/sys/dmu_objset.h
index 9f6e0fdd601b..a9123e862af7 100644
--- a/sys/contrib/openzfs/include/sys/dmu_objset.h
+++ b/sys/contrib/openzfs/include/sys/dmu_objset.h
@@ -132,6 +132,7 @@ struct objset {
zfs_logbias_op_t os_logbias;
zfs_cache_type_t os_primary_cache;
zfs_cache_type_t os_secondary_cache;
+ zfs_prefetch_type_t os_prefetch;
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
uint64_t os_recordsize;
diff --git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h
index bc940e8a7929..9d7eca9784b6 100644
--- a/sys/contrib/openzfs/include/sys/fs/zfs.h
+++ b/sys/contrib/openzfs/include/sys/fs/zfs.h
@@ -191,6 +191,7 @@ typedef enum {
ZFS_PROP_REDACTED,
ZFS_PROP_REDACT_SNAPS,
ZFS_PROP_SNAPSHOTS_CHANGED,
+ ZFS_PROP_PREFETCH,
ZFS_NUM_PROPS
} zfs_prop_t;
@@ -543,6 +544,12 @@ typedef enum zfs_key_location {
ZFS_KEYLOCATION_LOCATIONS
} zfs_keylocation_t;
+typedef enum {
+ ZFS_PREFETCH_NONE = 0,
+ ZFS_PREFETCH_METADATA = 1,
+ ZFS_PREFETCH_ALL = 2
+} zfs_prefetch_type_t;
+
#define DEFAULT_PBKDF2_ITERATIONS 350000
#define MIN_PBKDF2_ITERATIONS 100000
diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h
index 18062d3f2a95..88ef510b744b 100644
--- a/sys/contrib/openzfs/include/sys/spa.h
+++ b/sys/contrib/openzfs/include/sys/spa.h
@@ -837,7 +837,7 @@ extern kmutex_t spa_namespace_lock;
extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t);
extern void spa_config_load(void);
-extern nvlist_t *spa_all_configs(uint64_t *);
+extern int spa_all_configs(uint64_t *generation, nvlist_t **pools);
extern void spa_config_set(spa_t *spa, nvlist_t *config);
extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
int getstats);
diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h
index 6a337b49edf3..750ca612b962 100644
--- a/sys/contrib/openzfs/include/sys/zfs_context.h
+++ b/sys/contrib/openzfs/include/sys/zfs_context.h
@@ -274,11 +274,13 @@ typedef struct kmutex {
extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie);
extern void mutex_destroy(kmutex_t *mp);
extern void mutex_enter(kmutex_t *mp);
+extern int mutex_enter_check_return(kmutex_t *mp);
extern void mutex_exit(kmutex_t *mp);
extern int mutex_tryenter(kmutex_t *mp);
#define NESTED_SINGLE 1
#define mutex_enter_nested(mp, class) mutex_enter(mp)
+#define mutex_enter_interruptible(mp) mutex_enter_check_return(mp)
/*
* RW locks
*/
diff --git a/sys/contrib/openzfs/include/sys/zil_impl.h b/sys/contrib/openzfs/include/sys/zil_impl.h
index f780ad3d61bc..c9db6d428ea2 100644
--- a/sys/contrib/openzfs/include/sys/zil_impl.h
+++ b/sys/contrib/openzfs/include/sys/zil_impl.h
@@ -181,6 +181,7 @@ typedef struct zil_vdev_node {
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;
+#define ZIL_BURSTS 8
#define ZIL_PREV_BLKS 16
/*
@@ -222,8 +223,9 @@ struct zilog {
clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */
zil_header_t zl_old_header; /* debugging aid */
- uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+ uint_t zl_parallel; /* workload is multi-threaded */
uint_t zl_prev_rotor; /* rotor for zl_prev[] */
+ uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */
uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */
diff --git a/sys/contrib/openzfs/include/sys/zio_impl.h b/sys/contrib/openzfs/include/sys/zio_impl.h
index 29a05986cd4f..febe0a87b428 100644
--- a/sys/contrib/openzfs/include/sys/zio_impl.h
+++ b/sys/contrib/openzfs/include/sys/zio_impl.h
@@ -159,6 +159,9 @@ enum zio_stage {
ZIO_STAGE_DONE = 1 << 25 /* RWFCI */
};
+#define ZIO_ROOT_PIPELINE \
+ ZIO_STAGE_DONE
+
#define ZIO_INTERLOCK_STAGES \
(ZIO_STAGE_READY | \
ZIO_STAGE_DONE)
diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi
index c1ce3d0f67d8..083c27be7d31 100644
--- a/sys/contrib/openzfs/lib/libzfs/libzfs.abi
+++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi
@@ -1867,7 +1867,8 @@
<enumerator name='ZFS_PROP_REDACTED' value='93'/>
<enumerator name='ZFS_PROP_REDACT_SNAPS' value='94'/>
<enumerator name='ZFS_PROP_SNAPSHOTS_CHANGED' value='95'/>
- <enumerator name='ZFS_NUM_PROPS' value='96'/>
+ <enumerator name='ZFS_PROP_PREFETCH' value='96'/>
+ <enumerator name='ZFS_NUM_PROPS' value='97'/>
</enum-decl>
<typedef-decl name='zfs_prop_t' type-id='4b000d60' id='58603c44'/>
<enum-decl name='zprop_source_t' naming-typedef-id='a2256d42' id='5903f80e'>
diff --git a/sys/contrib/openzfs/lib/libzpool/kernel.c b/sys/contrib/openzfs/lib/libzpool/kernel.c
index a9b9bf4c2ce5..ffad7fc02bc9 100644
--- a/sys/contrib/openzfs/lib/libzpool/kernel.c
+++ b/sys/contrib/openzfs/lib/libzpool/kernel.c
@@ -206,6 +206,15 @@ mutex_enter(kmutex_t *mp)
}
int
+mutex_enter_check_return(kmutex_t *mp)
+{
+ int error = pthread_mutex_lock(&mp->m_lock);
+ if (error == 0)
+ mp->m_owner = pthread_self();
+ return (error);
+}
+
+int
mutex_tryenter(kmutex_t *mp)
{
int error = pthread_mutex_trylock(&mp->m_lock);
diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4
index 5f89f6adf1e3..ddad00be412d 100644
--- a/sys/contrib/openzfs/man/man4/zfs.4
+++ b/sys/contrib/openzfs/man/man4/zfs.4
@@ -644,10 +644,7 @@ Max size of ARC in bytes.
If
.Sy 0 ,
then the max size of ARC is determined by the amount of system memory installed.
-Under Linux, half of system memory will be used as the limit.
-Under
-.Fx ,
-the larger of
+The larger of
.Sy all_system_memory No \- Sy 1 GiB
and
.Sy 5/8 No \(mu Sy all_system_memory
@@ -798,7 +795,7 @@ Note that this should not be set below the ZED thresholds
(currently 10 checksums over 10 seconds)
or else the daemon may not trigger any action.
.
-.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint
+.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
This controls the amount of time that a ZIL block (lwb) will remain "open"
when it isn't "full", and it has a thread waiting for it to be committed to
stable storage.
@@ -2155,13 +2152,6 @@ This sets the maximum number of write bytes logged via WR_COPIED.
It tunes a tradeoff between additional memory copy and possibly worse log
space efficiency vs additional range lock/unlock.
.
-.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
-This sets the minimum delay in nanoseconds ZIL care to delay block commit,
-waiting for more records.
-If ZIL writes are too fast, kernel may not be able sleep for so short interval,
-increasing log latency above allowed by
-.Sy zfs_commit_timeout_pct .
-.
.It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
Disable the cache flush commands that are normally sent to disk by
the ZIL after an LWB write has completed.
@@ -2317,6 +2307,63 @@ If
.Sy zvol_threads
to the number of CPUs present or 32 (whichever is greater).
.
+.It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint
+The number of threads per zvol to use for queuing IO requests.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only read and assigned to a zvol at zvol load time.
+If
+.Sy 0
+(the default) then internally set
+.Sy zvol_blk_mq_threads
+to the number of CPUs present.
+.
+.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint
+Set to
+.Sy 1
+to use the
+.Li blk-mq
+API for zvols.
+Set to
+.Sy 0
+(the default) to use the legacy zvol APIs.
+This setting can give better or worse zvol performance depending on
+the workload.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only read and assigned to a zvol at zvol load time.
+.
+.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint
+If
+.Sy zvol_use_blk_mq
+is enabled, then process this number of
+.Sy volblocksize Ns -sized blocks per zvol thread.
+This tunable can be use to favor better performance for zvol reads (lower
+values) or writes (higher values).
+If set to
+.Sy 0 ,
+then the zvol layer will process the maximum number of blocks
+per thread that it can.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only applied at each zvol's load time.
+.
+.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint
+The queue_depth value for the zvol
+.Li blk-mq
+interface.
+This parameter will only appear if your kernel supports
+.Li blk-mq
+and is only applied at each zvol's load time.
+If
+.Sy 0
+(the default) then use the kernel's default queue depth.
+Values are clamped to the kernel's
+.Dv BLKDEV_MIN_RQ
+and
+.Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ
+limits.
+.
.It Sy zvol_volmode Ns = Ns Sy 1 Pq uint
Defines zvol block devices behaviour when
.Sy volmode Ns = Ns Sy default :
diff --git a/sys/contrib/openzfs/man/man7/zfsprops.7 b/sys/contrib/openzfs/man/man7/zfsprops.7
index e3674b1f8a8d..59f6404379af 100644
--- a/sys/contrib/openzfs/man/man7/zfsprops.7
+++ b/sys/contrib/openzfs/man/man7/zfsprops.7
@@ -1613,6 +1613,23 @@ If this property is set to
then only metadata is cached.
The default value is
.Sy all .
+.It Sy prefetch Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata
+Controls what speculative prefetch does.
+If this property is set to
+.Sy all ,
+then both user data and metadata are prefetched.
+If this property is set to
+.Sy none ,
+then neither user data nor metadata are prefetched.
+If this property is set to
+.Sy metadata ,
+then only metadata are prefetched.
+The default value is
+.Sy all .
+.Pp
+Please note that the module parameter zfs_disable_prefetch=1 can
+be used to totally disable speculative prefetch, bypassing anything
+this property does.
.It Sy setuid Ns = Ns Sy on Ns | Ns Sy off
Controls whether the setuid bit is respected for the file system.
The default value is
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
index 29a8802b8367..381563476daf 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
@@ -80,12 +80,18 @@ static struct notifier_block arc_hotplug_callback_mem_nb;
/*
* Return a default max arc size based on the amount of physical memory.
+ * This may be overridden by tuning the zfs_arc_max module parameter.
*/
uint64_t
arc_default_max(uint64_t min, uint64_t allmem)
{
- /* Default to 1/2 of all memory. */
- return (MAX(allmem / 2, min));
+ uint64_t size;
+
+ if (allmem >= 1 << 30)
+ size = allmem - (1 << 30);
+ else
+ size = min;
+ return (MAX(allmem * 5 / 8, size));
}
#ifdef _KERNEL
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
index 3efd4ab159c6..c2ed67c438c6 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
@@ -204,22 +204,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
this_seg_start = orig_loffset;
rq_for_each_segment(bv, rq, iter) {
- if (uio->iter.bio) {
- /*
- * If uio->iter.bio is present, then we know we've saved
- * uio->iter from a previous call to this function, and
- * we can skip ahead in this rq_for_each_segment() loop
- * to where we last left off. That way, we don't need
- * to iterate over tons of segments we've already
- * processed - we can just restore the "saved state".
- */
- iter = uio->iter;
- bv = uio->bv;
- this_seg_start = uio->uio_loffset;
- memset(&uio->iter, 0, sizeof (uio->iter));
- continue;
- }
-
/*
* Lookup what the logical offset of the last byte of this
* segment is.
@@ -260,19 +244,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
copied = 1; /* We copied some data */
}
- if (n == 0) {
- /*
- * All done copying. Save our 'iter' value to the uio.
- * This allows us to "save our state" and skip ahead in
- * the rq_for_each_segment() loop the next time we call
- * call zfs_uiomove_bvec_rq() on this uio (which we
- * will be doing for any remaining data in the uio).
- */
- uio->iter = iter; /* make a copy of the struct data */
- uio->bv = bv;
- return (0);
- }
-
this_seg_start = this_seg_end + 1;
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index e145a8c876b4..f94ce69fb9e2 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -1626,6 +1626,18 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
+#ifdef HAVE_BLK_MQ
+module_param(zvol_blk_mq_queue_depth, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
+
+module_param(zvol_use_blk_mq, uint, 0644);
+MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
+
+module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
+ "Process volblocksize blocks per thread");
+#endif
+
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
module_param(zvol_open_timeout_ms, uint, 0644);
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
index 3db6fd13f4ae..29764674a31b 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
@@ -345,6 +345,13 @@ zfs_prop_init(void)
{ NULL }
};
+ static const zprop_index_t prefetch_table[] = {
+ { "none", ZFS_PREFETCH_NONE },
+ { "metadata", ZFS_PREFETCH_METADATA },
+ { "all", ZFS_PREFETCH_ALL },
+ { NULL }
+ };
+
static const zprop_index_t sync_table[] = {
{ "standard", ZFS_SYNC_STANDARD },
{ "always", ZFS_SYNC_ALWAYS },
@@ -453,6 +460,10 @@ zfs_prop_init(void)
ZFS_CACHE_ALL, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
"all | none | metadata", "SECONDARYCACHE", cache_table, sfeatures);
+ zprop_register_index(ZFS_PROP_PREFETCH, "prefetch",
+ ZFS_PREFETCH_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "none | metadata | all", "PREFETCH", prefetch_table, sfeatures);
zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"latency | throughput", "LOGBIAS", logbias_table, sfeatures);
diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c
index 745ee8f02ed4..d982f201c930 100644
--- a/sys/contrib/openzfs/module/zfs/abd.c
+++ b/sys/contrib/openzfs/module/zfs/abd.c
@@ -802,13 +802,10 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
- boolean_t gang = abd_is_gang(abd);
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
while (size > 0) {
- /* If we are at the end of the gang ABD we are done */
- if (gang && !c_abd)
- break;
+ IMPLY(abd_is_gang(abd), c_abd != NULL);
abd_iter_map(&aiter);
@@ -930,7 +927,6 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
{
int ret = 0;
struct abd_iter daiter, saiter;
- boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
abd_t *c_dabd, *c_sabd;
if (size == 0)
@@ -942,16 +938,12 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
ASSERT3U(doff + size, <=, dabd->abd_size);
ASSERT3U(soff + size, <=, sabd->abd_size);
- dabd_is_gang_abd = abd_is_gang(dabd);
- sabd_is_gang_abd = abd_is_gang(sabd);
c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
c_sabd = abd_init_abd_iter(sabd, &saiter, soff);
while (size > 0) {
- /* if we are at the end of the gang ABD we are done */
- if ((dabd_is_gang_abd && !c_dabd) ||
- (sabd_is_gang_abd && !c_sabd))
- break;
+ IMPLY(abd_is_gang(dabd), c_dabd != NULL);
+ IMPLY(abd_is_gang(sabd), c_sabd != NULL);
abd_iter_map(&daiter);
abd_iter_map(&saiter);
@@ -1032,66 +1024,40 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
int i;
ssize_t len, dlen;
struct abd_iter caiters[3];
- struct abd_iter daiter = {0};
+ struct abd_iter daiter;
void *caddrs[3];
unsigned long flags __maybe_unused = 0;
abd_t *c_cabds[3];
abd_t *c_dabd = NULL;
- boolean_t cabds_is_gang_abd[3];
- boolean_t dabd_is_gang_abd = B_FALSE;
ASSERT3U(parity, <=, 3);
-
for (i = 0; i < parity; i++) {
- cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
+ abd_verify(cabds[i]);
+ ASSERT3U(csize, <=, cabds[i]->abd_size);
c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0);
}
- if (dabd) {
- dabd_is_gang_abd = abd_is_gang(dabd);
+ ASSERT3S(dsize, >=, 0);
+ if (dsize > 0) {
+ ASSERT(dabd);
+ abd_verify(dabd);
+ ASSERT3U(dsize, <=, dabd->abd_size);
c_dabd = abd_init_abd_iter(dabd, &daiter, 0);
}
- ASSERT3S(dsize, >=, 0);
-
abd_enter_critical(flags);
while (csize > 0) {
- /* if we are at the end of the gang ABD we are done */
- if (dabd_is_gang_abd && !c_dabd)
- break;
-
+ len = csize;
for (i = 0; i < parity; i++) {
- /*
- * If we are at the end of the gang ABD we are
- * done.
- */
- if (cabds_is_gang_abd[i] && !c_cabds[i])
- break;
+ IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
abd_iter_map(&caiters[i]);
caddrs[i] = caiters[i].iter_mapaddr;
+ len = MIN(caiters[i].iter_mapsize, len);
}
- len = csize;
-
- if (dabd && dsize > 0)
+ if (dsize > 0) {
+ IMPLY(abd_is_gang(dabd), c_dabd != NULL);
abd_iter_map(&daiter);
-
- switch (parity) {
- case 3:
- len = MIN(caiters[2].iter_mapsize, len);
- zfs_fallthrough;
- case 2:
- len = MIN(caiters[1].iter_mapsize, len);
- zfs_fallthrough;
- case 1:
- len = MIN(caiters[0].iter_mapsize, len);
- }
-
- /* must be progressive */
- ASSERT3S(len, >, 0);
-
- if (dabd && dsize > 0) {
- /* this needs precise iter.length */
len = MIN(daiter.iter_mapsize, len);
dlen = len;
} else
@@ -1114,7 +1080,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
&caiters[i], len);
}
- if (dabd && dsize > 0) {
+ if (dsize > 0) {
abd_iter_unmap(&daiter);
c_dabd =
abd_advance_abd_iter(dabd, c_dabd, &daiter,
@@ -1153,16 +1119,16 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
struct abd_iter xiters[3];
void *caddrs[3], *xaddrs[3];
unsigned long flags __maybe_unused = 0;
- boolean_t cabds_is_gang_abd[3];
- boolean_t tabds_is_gang_abd[3];
abd_t *c_cabds[3];
abd_t *c_tabds[3];
ASSERT3U(parity, <=, 3);
for (i = 0; i < parity; i++) {
- cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
- tabds_is_gang_abd[i] = abd_is_gang(tabds[i]);
+ abd_verify(cabds[i]);
+ abd_verify(tabds[i]);
+ ASSERT3U(tsize, <=, cabds[i]->abd_size);
+ ASSERT3U(tsize, <=, tabds[i]->abd_size);
c_cabds[i] =
abd_init_abd_iter(cabds[i], &citers[i], 0);
c_tabds[i] =
@@ -1171,36 +1137,18 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
abd_enter_critical(flags);
while (tsize > 0) {
-
+ len = tsize;
for (i = 0; i < parity; i++) {
- /*
- * If we are at the end of the gang ABD we
- * are done.
- */
- if (cabds_is_gang_abd[i] && !c_cabds[i])
- break;
- if (tabds_is_gang_abd[i] && !c_tabds[i])
- break;
+ IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
+ IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL);
abd_iter_map(&citers[i]);
abd_iter_map(&xiters[i]);
caddrs[i] = citers[i].iter_mapaddr;
xaddrs[i] = xiters[i].iter_mapaddr;
+ len = MIN(citers[i].iter_mapsize, len);
+ len = MIN(xiters[i].iter_mapsize, len);
}
- len = tsize;
- switch (parity) {
- case 3:
- len = MIN(xiters[2].iter_mapsize, len);
- len = MIN(citers[2].iter_mapsize, len);
- zfs_fallthrough;
- case 2:
- len = MIN(xiters[1].iter_mapsize, len);
- len = MIN(citers[1].iter_mapsize, len);
- zfs_fallthrough;
- case 1:
- len = MIN(xiters[0].iter_mapsize, len);
- len = MIN(citers[0].iter_mapsize, len);
- }
/* must be progressive */
ASSERT3S(len, >, 0);
/*
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index 5d4a52fa0693..06544925b5ca 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -5868,12 +5868,9 @@ top:
* 3. This buffer isn't currently writing to the L2ARC.
* 4. The L2ARC entry wasn't evicted, which may
* also have invalidated the vdev.
- * 5. This isn't prefetch or l2arc_noprefetch is 0.
*/
if (HDR_HAS_L2HDR(hdr) &&
- !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
- !(l2arc_noprefetch &&
- (*arc_flags & ARC_FLAG_PREFETCH))) {
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
l2arc_read_callback_t *cb;
abd_t *abd;
uint64_t asize;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c
index d134d4958f7c..76e65b5506a9 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_objset.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@@ -264,6 +264,19 @@ secondary_cache_changed_cb(void *arg, uint64_t newval)
}
static void
+prefetch_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval == ZFS_PREFETCH_ALL || newval == ZFS_PREFETCH_NONE ||
+ newval == ZFS_PREFETCH_METADATA);
+ os->os_prefetch = newval;
+}
+
+static void
sync_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -562,6 +575,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
secondary_cache_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_PREFETCH),
+ prefetch_changed_cb, os);
+ }
if (!ds->ds_is_snapshot) {
if (err == 0) {
err = dsl_prop_register(ds,
@@ -635,6 +653,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
os->os_dnodesize = DNODE_MIN_SIZE;
+ os->os_prefetch = ZFS_PREFETCH_ALL;
}
if (ds == NULL || !ds->ds_is_snapshot)
diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
index d0acaf502066..2b2d72671001 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
@@ -329,9 +329,14 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
{
zstream_t *zs;
spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
+ zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch;
- if (zfs_prefetch_disable)
+ if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE)
return (NULL);
+
+ if (os_prefetch == ZFS_PREFETCH_METADATA)
+ fetch_data = B_FALSE;
+
/*
* If we haven't yet loaded the indirect vdevs' mappings, we
* can only read from blocks that we carefully ensure are on
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index 0b69568e2aad..aa97144f16e4 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -101,6 +101,26 @@
#include "zfs_comutil.h"
/*
+ * spa_thread() existed on Illumos as a parent thread for the various worker
+ * threads that actually run the pool, as a way to both reference the entire
+ * pool work as a single object, and to share properties like scheduling
+ * options. It has not yet been adapted to Linux or FreeBSD. This define is
+ * used to mark related parts of the code to make things easier for the reader,
+ * and to compile this code out. It can be removed when someone implements it,
+ * moves it to some Illumos-specific place, or removes it entirely.
+ */
+#undef HAVE_SPA_THREAD
+
+/*
+ * The "System Duty Cycle" scheduling class is an Illumos feature to help
+ * prevent CPU-intensive kernel threads from affecting latency on interactive
+ * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
+ * gated behind a define. On Illumos SDC depends on spa_thread(), but
+ * spa_thread() also has other uses, so this is a separate define.
+ */
+#undef HAVE_SYSDC
+
+/*
* The interval, in seconds, at which failed configuration cache file writes
* should be retried.
*/
@@ -176,10 +196,15 @@ static uint_t metaslab_preload_pct = 50;
static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
static uint_t zio_taskq_batch_tpq; /* threads per taskq */
+
+#ifdef HAVE_SYSDC
static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
static const uint_t zio_taskq_basedc = 80; /* base duty cycle */
+#endif
+#ifdef HAVE_SPA_THREAD
static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
+#endif
/*
* Report any spa_load_verify errors found, but do not fail spa_load.
@@ -1029,7 +1054,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
uint_t count = ztip->zti_count;
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
uint_t cpus, flags = TASKQ_DYNAMIC;
+#ifdef HAVE_SYSDC
boolean_t batch = B_FALSE;
+#endif
switch (mode) {
case ZTI_MODE_FIXED:
@@ -1037,7 +1064,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
break;
case ZTI_MODE_BATCH:
+#ifdef HAVE_SYSDC
batch = B_TRUE;
+#endif
flags |= TASKQ_THREADS_CPU_PCT;
value = MIN(zio_taskq_batch_pct, 100);
break;
@@ -1106,6 +1135,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
(void) snprintf(name, sizeof (name), "%s_%s",
zio_type_name[t], zio_taskq_types[q]);
+#ifdef HAVE_SYSDC
if (zio_taskq_sysdc && spa->spa_proc != &p0) {
if (batch)
flags |= TASKQ_DC_BATCH;
@@ -1114,6 +1144,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
tq = taskq_create_sysdc(name, value, 50, INT_MAX,
spa->spa_proc, zio_taskq_basedc, flags);
} else {
+#endif
pri_t pri = maxclsyspri;
/*
* The write issue taskq can be extremely CPU
@@ -1139,7 +1170,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
}
tq = taskq_create_proc(name, value, pri, 50,
INT_MAX, spa->spa_proc, flags);
+#ifdef HAVE_SYSDC
}
+#endif
tqs->stqs_taskq[i] = tq;
}
@@ -1224,11 +1257,6 @@ spa_create_zio_taskqs(spa_t *spa)
}
}
-/*
- * Disabled until spa_thread() can be adapted for Linux.
- */
-#undef HAVE_SPA_THREAD
-
#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
static void
spa_thread(void *arg)
@@ -1269,9 +1297,11 @@ spa_thread(void *arg)
pool_unlock();
}
+#ifdef HAVE_SYSDC
if (zio_taskq_sysdc) {
sysdc_thread_enter(curthread, 100, 0);
}
+#endif
spa->spa_proc = curproc;
spa->spa_did = curthread->t_did;
@@ -1327,7 +1357,6 @@ spa_activate(spa_t *spa, spa_mode_t mode)
ASSERT(spa->spa_proc == &p0);
spa->spa_did = 0;
- (void) spa_create_process;
#ifdef HAVE_SPA_THREAD
/* Only create a process if we're going to be around a while. */
if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c
index 636c04d9f785..a77874ea0dd3 100644
--- a/sys/contrib/openzfs/module/zfs/spa_config.c
+++ b/sys/contrib/openzfs/module/zfs/spa_config.c
@@ -367,23 +367,24 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent,
* So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
* information for all pool visible within the zone.
*/
-nvlist_t *
-spa_all_configs(uint64_t *generation)
+int
+spa_all_configs(uint64_t *generation, nvlist_t **pools)
{
- nvlist_t *pools;
spa_t *spa = NULL;
if (*generation == spa_config_generation)
- return (NULL);
+ return (SET_ERROR(EEXIST));
- pools = fnvlist_alloc();
+ int error = mutex_enter_interruptible(&spa_namespace_lock);
+ if (error)
+ return (SET_ERROR(EINTR));
- mutex_enter(&spa_namespace_lock);
+ *pools = fnvlist_alloc();
while ((spa = spa_next(spa)) != NULL) {
if (INGLOBALZONE(curproc) ||
zone_dataset_visible(spa_name(spa), NULL)) {
mutex_enter(&spa->spa_props_lock);
- fnvlist_add_nvlist(pools, spa_name(spa),
+ fnvlist_add_nvlist(*pools, spa_name(spa),
spa->spa_config);
mutex_exit(&spa->spa_props_lock);
}
@@ -391,7 +392,7 @@ spa_all_configs(uint64_t *generation)
*generation = spa_config_generation;
mutex_exit(&spa_namespace_lock);
- return (pools);
+ return (0);
}
void
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
index f91a2f3bbca5..2738385e260b 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -1582,8 +1582,9 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc)
nvlist_t *configs;
int error;
- if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
- return (SET_ERROR(EEXIST));
+ error = spa_all_configs(&zc->zc_cookie, &configs);
+ if (error)
+ return (error);
error = put_nvlist(zc, configs);
diff --git a/sys/contrib/openzfs/module/zfs/zfs_quota.c b/sys/contrib/openzfs/module/zfs/zfs_quota.c
index 56f9d22ed0e5..9b351eefc04e 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_quota.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_quota.c
@@ -347,32 +347,18 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
if (*objp == 0) {
*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
DMU_OT_NONE, 0, tx);
- VERIFY0(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
}
+ mutex_exit(&zfsvfs->z_lock);
if (quota == 0) {
err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
if (err == ENOENT)
err = 0;
- /*
- * If the quota contains no more entries after the entry
- * was removed, destroy the quota zap and remove the
- * reference from zfsvfs. This will save us unnecessary
- * zap_lookups for the quota during writes.
- */
- uint64_t zap_nentries;
- VERIFY0(zap_count(zfsvfs->z_os, *objp, &zap_nentries));
- if (zap_nentries == 0) {
- VERIFY0(zap_remove(zfsvfs->z_os, MASTER_NODE_OBJ,
- zfs_userquota_prop_prefixes[type], tx));
- VERIFY0(zap_destroy(zfsvfs->z_os, *objp, tx));
- *objp = 0;
- }
} else {
err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
}
- mutex_exit(&zfsvfs->z_lock);
ASSERT(err == 0);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
index a11886136994..ce2cb8b1446a 100644
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -91,15 +91,7 @@
* committed to stable storage. Please refer to the zil_commit_waiter()
* function (and the comments within it) for more details.
*/
-static uint_t zfs_commit_timeout_pct = 5;
-
-/*
- * Minimal time we care to delay commit waiting for more ZIL records.
- * At least FreeBSD kernel can't sleep for less than 2us at its best.
- * So requests to sleep for less then 5us is a waste of CPU time with
- * a risk of significant log latency increase due to oversleep.
- */
-static uint64_t zil_min_commit_timeout = 5000;
+static uint_t zfs_commit_timeout_pct = 10;
/*
* See zil.h for more information about these fields.
@@ -2163,8 +2155,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes,
lrw->lr_length);
if (lwb->lwb_child_zio == NULL) {
- lwb->lwb_child_zio = zio_root(
- zilog->zl_spa, NULL, NULL,
+ lwb->lwb_child_zio = zio_null(NULL,
+ zilog->zl_spa, NULL, NULL, NULL,
ZIO_FLAG_CANFAIL);
}
}
@@ -2696,6 +2688,19 @@ zil_commit_writer_stall(zilog_t *zilog)
ASSERT(list_is_empty(&zilog->zl_lwb_list));
}
+static void
+zil_burst_done(zilog_t *zilog)
+{
+ if (!list_is_empty(&zilog->zl_itx_commit_list) ||
+ zilog->zl_cur_used == 0)
+ return;
+
+ if (zilog->zl_parallel)
+ zilog->zl_parallel--;
+
+ zilog->zl_cur_used = 0;
+}
+
/*
* This function will traverse the commit list, creating new lwbs as
* needed, and committing the itxs from the commit list to these newly
@@ -2710,7 +2715,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
list_t nolwb_waiters;
lwb_t *lwb, *plwb;
itx_t *itx;
- boolean_t first = B_TRUE;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
@@ -2736,9 +2740,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
zil_commit_activate_saxattr_feature(zilog);
ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
lwb->lwb_state == LWB_STATE_OPENED);
- first = (lwb->lwb_state == LWB_STATE_NEW) &&
- ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL ||
- plwb->lwb_state == LWB_STATE_FLUSH_DONE);
+
+ /*
+ * If the lwb is still opened, it means the workload is really
+ * multi-threaded and we won the chance of write aggregation.
+ * If it is not opened yet, but previous lwb is still not
+ * flushed, it still means the workload is multi-threaded, but
+ * there was too much time between the commits to aggregate, so
+ * we try aggregation next times, but without too much hopes.
+ */
+ if (lwb->lwb_state == LWB_STATE_OPENED) {
+ zilog->zl_parallel = ZIL_BURSTS;
+ } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
+ != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
+ zilog->zl_parallel = MAX(zilog->zl_parallel,
+ ZIL_BURSTS / 2);
+ }
}
while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
@@ -2813,7 +2830,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* Our lwb is done, leave the rest of
* itx list to somebody else who care.
*/
- first = B_FALSE;
+ zilog->zl_parallel = ZIL_BURSTS;
break;
}
} else {
@@ -2905,28 +2922,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* try and pack as many itxs into as few lwbs as
* possible, without significantly impacting the latency
* of each individual itx.
- *
- * If we had no already running or open LWBs, it can be
- * the workload is single-threaded. And if the ZIL write
- * latency is very small or if the LWB is almost full, it
- * may be cheaper to bypass the delay.
*/
- if (lwb->lwb_state == LWB_STATE_OPENED && first) {
- hrtime_t sleep = zilog->zl_last_lwb_latency *
- zfs_commit_timeout_pct / 100;
- if (sleep < zil_min_commit_timeout ||
- lwb->lwb_nmax - lwb->lwb_nused <
- lwb->lwb_nmax / 8) {
- list_insert_tail(ilwbs, lwb);
- lwb = zil_lwb_write_close(zilog, lwb,
- LWB_STATE_NEW);
- zilog->zl_cur_used = 0;
- if (lwb == NULL) {
- while ((lwb = list_remove_head(ilwbs))
- != NULL)
- zil_lwb_write_issue(zilog, lwb);
- zil_commit_writer_stall(zilog);
- }
+ if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
+ list_insert_tail(ilwbs, lwb);
+ lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
+ zil_burst_done(zilog);
+ if (lwb == NULL) {
+ while ((lwb = list_remove_head(ilwbs)) != NULL)
+ zil_lwb_write_issue(zilog, lwb);
+ zil_commit_writer_stall(zilog);
}
}
}
@@ -3084,19 +3088,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
- /*
- * Since the lwb's zio hadn't been issued by the time this thread
- * reached its timeout, we reset the zilog's "zl_cur_used" field
- * to influence the zil block size selection algorithm.
- *
- * By having to issue the lwb's zio here, it means the size of the
- * lwb was too large, given the incoming throughput of itxs. By
- * setting "zl_cur_used" to zero, we communicate this fact to the
- * block size selection algorithm, so it can take this information
- * into account, and potentially select a smaller size for the
- * next lwb block that is allocated.
- */
- zilog->zl_cur_used = 0;
+ zil_burst_done(zilog);
if (nlwb == NULL) {
/*
@@ -4214,9 +4206,6 @@ EXPORT_SYMBOL(zil_kstat_values_update);
ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
"ZIL block open timeout percentage");
-ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW,
- "Minimum delay we care for ZIL block commit");
-
ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
"Disable intent logging replay");
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 3b3b40fa73d8..3eb472a9fd2a 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -634,6 +634,11 @@ zio_add_child(zio_t *pio, zio_t *cio)
*/
ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
+ /* Parent should not have READY stage if child doesn't have it. */
+ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
+ (cio->io_child_type != ZIO_CHILD_VDEV),
+ (pio->io_pipeline & ZIO_STAGE_READY) == 0);
+
zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
zl->zl_parent = pio;
zl->zl_child = cio;
@@ -665,6 +670,11 @@ zio_add_child_first(zio_t *pio, zio_t *cio)
*/
ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
+ /* Parent should not have READY stage if child doesn't have it. */
+ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
+ (cio->io_child_type != ZIO_CHILD_VDEV),
+ (pio->io_pipeline & ZIO_STAGE_READY) == 0);
+
zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
zl->zl_parent = pio;
zl->zl_child = cio;
@@ -901,7 +911,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
zio->io_pipeline_trace = ZIO_STAGE_OPEN;
- zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
+ zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) ||
+ (pipeline & ZIO_STAGE_READY) == 0;
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
if (zb != NULL)
@@ -932,6 +943,10 @@ zio_destroy(zio_t *zio)
kmem_cache_free(zio_cache, zio);
}
+/*
+ * ZIO intended to be between others. Provides synchronization at READY
+ * and DONE pipeline stages and calls the respective callbacks.
+ */
zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
void *private, zio_flag_t flags)
@@ -945,10 +960,22 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
return (zio);
}
+/*
+ * ZIO intended to be a root of a tree. Unlike null ZIO does not have a
+ * READY pipeline stage (is ready on creation), so it should not be used
+ * as child of any ZIO that may need waiting for grandchildren READY stage
+ * (any other ZIO type).
+ */
zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags)
{
- return (zio_null(NULL, spa, NULL, done, private, flags));
+ zio_t *zio;
+
+ zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private,
+ ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
+ ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE);
+
+ return (zio);
}
static int
@@ -2396,13 +2423,14 @@ static void
zio_reexecute(void *arg)
{
zio_t *pio = arg;
- zio_t *cio, *cio_next;
+ zio_t *cio, *cio_next, *gio;
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
ASSERT(pio->io_gang_leader == NULL);
ASSERT(pio->io_gang_tree == NULL);
+ mutex_enter(&pio->io_lock);
pio->io_flags = pio->io_orig_flags;
pio->io_stage = pio->io_orig_stage;
pio->io_pipeline = pio->io_orig_pipeline;
@@ -2410,8 +2438,16 @@ zio_reexecute(void *arg)
pio->io_flags |= ZIO_FLAG_REEXECUTED;
pio->io_pipeline_trace = 0;
pio->io_error = 0;
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- pio->io_state[w] = 0;
+ pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) ||
+ (pio->io_pipeline & ZIO_STAGE_READY) == 0;
+ pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE);
+ zio_link_t *zl = NULL;
+ while ((gio = zio_walk_parents(pio, &zl)) != NULL) {
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++) {
+ gio->io_children[pio->io_child_type][w] +=
+ !pio->io_state[w];
+ }
+ }
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0;
@@ -2425,12 +2461,9 @@ zio_reexecute(void *arg)
* the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'cio'.
*/
- zio_link_t *zl = NULL;
- mutex_enter(&pio->io_lock);
+ zl = NULL;
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- pio->io_children[cio->io_child_type][w]++;
mutex_exit(&pio->io_lock);
zio_reexecute(cio);
mutex_enter(&pio->io_lock);
diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg
index 8010a9451597..80e7bcb3bd09 100644
--- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg
+++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg
@@ -89,7 +89,7 @@ VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip
VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev
VOL_MODE vol.mode zvol_volmode
VOL_RECURSIVE vol.recursive UNSUPPORTED
-VOL_USE_BLK_MQ UNSUPPORTED UNSUPPORTED
+VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
XATTR_COMPAT xattr_compat zfs_xattr_compat
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h
index e91cf5f22368..39caf1355771 100644
--- a/sys/modules/zfs/zfs_config.h
+++ b/sys/modules/zfs/zfs_config.h
@@ -1110,7 +1110,7 @@
/* #undef ZFS_IS_GPL_COMPATIBLE */
/* Define the project alias string. */
-#define ZFS_META_ALIAS "zfs-2.2.99-FreeBSD_g797f55ef1"
+#define ZFS_META_ALIAS "zfs-2.2.99-FreeBSD_g043c6ee3b"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@@ -1140,7 +1140,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
-#define ZFS_META_RELEASE "FreeBSD_g797f55ef1"
+#define ZFS_META_RELEASE "FreeBSD_g043c6ee3b"
/* Define the project version. */
#define ZFS_META_VERSION "2.2.99"
diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h
index e88a5e8dbb37..b4d500f67a90 100644
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@@ -1 +1 @@
-#define ZFS_META_GITREV "zfs-2.2.99-162-g797f55ef1"
+#define ZFS_META_GITREV "zfs-2.2.99-174-g043c6ee3b"