42 files changed, 755 insertions, 177 deletions
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c index 9636c99fc85f..69163b80bd5a 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c @@ -233,8 +233,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) } (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); + + update_vdev_config_dev_sysfs_path(vdev, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &enc_sysfs_path); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted); diff --git a/sys/contrib/openzfs/cmd/zed/zed_event.c b/sys/contrib/openzfs/cmd/zed/zed_event.c index c60d5a4bc22e..7e5867692234 100644 --- a/sys/contrib/openzfs/cmd/zed/zed_event.c +++ b/sys/contrib/openzfs/cmd/zed/zed_event.c @@ -35,6 +35,7 @@ #include "zed_strings.h" #include "agents/zfs_agents.h" +#include <libzutil.h> #define MAXBUF 4096 @@ -922,6 +923,25 @@ _zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) } } + +static void +_zed_event_update_enc_sysfs_path(nvlist_t *nvl) +{ + const char *vdev_path; + + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, + &vdev_path) != 0) { + return; /* some other kind of event, ignore it */ + } + + if (vdev_path == NULL) { + return; + } + + update_vdev_config_dev_sysfs_path(nvl, vdev_path, + FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH); +} + /* * Service the next zevent, blocking until one is available. */ @@ -969,6 +989,17 @@ zed_event_service(struct zed_conf *zcp) zed_log_msg(LOG_WARNING, "Failed to lookup zevent class (eid=%llu)", eid); } else { + /* + * Special case: If we can dynamically detect an enclosure sysfs + * path, then use that value rather than the one stored in the + * vd->vdev_enc_sysfs_path. There have been rare cases where + * vd->vdev_enc_sysfs_path becomes outdated. However, there + * will be other times when we can not dynamically detect the + * sysfs path (like if a disk disappears) and have to rely on + * the old value for things like turning on the fault LED. 
+ */ + _zed_event_update_enc_sysfs_path(nvl); + /* let internal modules see this event first */ zfs_agent_post_event(class, NULL, nvl); diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 index 8c3a3ce11d94..6d60e643593b 100644 --- a/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 +++ b/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 @@ -14,7 +14,6 @@ large_blocks livelist log_spacemap lz4_compress -obsolete_counts project_quota resilver_defer spacemap_histogram diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/ses b/sys/contrib/openzfs/cmd/zpool/zpool.d/ses index 638145c95d47..19ef92ad67b2 100755 --- a/sys/contrib/openzfs/cmd/zpool/zpool.d/ses +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/ses @@ -33,10 +33,18 @@ for i in $scripts ; do val="" case $i in enc) - val=$(ls "$VDEV_ENC_SYSFS_PATH/../../" 2>/dev/null) + if echo "$VDEV_ENC_SYSFS_PATH" | grep -q '/sys/bus/pci/slots' ; then + val="$VDEV_ENC_SYSFS_PATH" + else + val="$(ls """$VDEV_ENC_SYSFS_PATH/../../""" 2>/dev/null)" + fi ;; slot) - val=$(cat "$VDEV_ENC_SYSFS_PATH/slot" 2>/dev/null) + if echo "$VDEV_ENC_SYSFS_PATH" | grep -q '/sys/bus/pci/slots' ; then + val="$(basename """$VDEV_ENC_SYSFS_PATH""")" + else + val="$(cat """$VDEV_ENC_SYSFS_PATH/slot""" 2>/dev/null)" + fi ;; encdev) val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null) diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c index 3d0fc089c32f..fbd4b81dfacc 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c @@ -372,6 +372,10 @@ make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); + /* Lookup and add the enclosure sysfs path (if exists) */ + update_vdev_config_dev_sysfs_path(vdev, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); diff --git a/sys/contrib/openzfs/configure.ac b/sys/contrib/openzfs/configure.ac index 4c75616e4299..99db7a1e35f6 100644 --- a/sys/contrib/openzfs/configure.ac +++ b/sys/contrib/openzfs/configure.ac @@ -44,7 +44,7 @@ AM_INIT_AUTOMAKE([subdir-objects foreign]) # Remove default macros from config.h: # PACKAGE, PACKAGE_{BUGREPORT,NAME,STRING,TARNAME,VERSION}, STDC_HEADERS, VERSION AC_CONFIG_HEADERS([zfs_config.h], [ - sed -nri~ -e '/^$/be' -e 'N;N;/#define (PACKAGE|VERSION|STDC_HEADERS)/d' -e ':e' -e 'p' zfs_config.h && rm zfs_config.h~ || exit]) + $SED -nri~ -e '/^$/be' -e 'N;N;/#define (PACKAGE|VERSION|STDC_HEADERS)/d' -e ':e' -e 'p' zfs_config.h && rm zfs_config.h~ || exit]) LT_INIT AC_PROG_INSTALL diff --git a/sys/contrib/openzfs/include/libzutil.h b/sys/contrib/openzfs/include/libzutil.h index 053b1ed4b52a..9842c225b6f0 100644 --- a/sys/contrib/openzfs/include/libzutil.h +++ b/sys/contrib/openzfs/include/libzutil.h @@ -208,6 +208,8 @@ int for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, int for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data); void update_vdevs_config_dev_sysfs_path(nvlist_t *config); +_LIBZUTIL_H void update_vdev_config_dev_sysfs_path(nvlist_t *nv, + const char *path, const char *key); #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/taskq.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/taskq.h index 
b23a939b3aa7..fa6ff84a0134 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/taskq.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/taskq.h @@ -42,6 +42,7 @@ extern "C" { typedef struct taskq { struct taskqueue *tq_queue; + int tq_nthreads; } taskq_t; typedef uintptr_t taskqid_t; @@ -81,7 +82,6 @@ typedef struct taskq_ent { #define TASKQID_INVALID ((taskqid_t)0) -#define taskq_init_ent(x) extern taskq_t *system_taskq; /* Global dynamic task queue for long delay */ extern taskq_t *system_delay_taskq; @@ -92,7 +92,10 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); +extern void taskq_init_ent(taskq_ent_t *); taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, struct proc *, uint_t); @@ -117,6 +120,7 @@ void taskq_resume(taskq_t *); #endif /* _KERNEL */ #ifdef _STANDALONE +typedef void taskq_t; typedef int taskq_ent_t; #define taskq_init_ent(x) #endif /* _STANDALONE */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h b/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h index 6c1b4377a98a..aa5860c56e83 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h @@ -150,6 +150,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, extern int taskq_empty_ent(taskq_ent_t *); extern void taskq_init_ent(taskq_ent_t *); extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); extern void taskq_destroy(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); diff --git a/sys/contrib/openzfs/include/sys/dataset_kstats.h b/sys/contrib/openzfs/include/sys/dataset_kstats.h index 40cf5258a2e7..c81a07f0c116 100644 --- a/sys/contrib/openzfs/include/sys/dataset_kstats.h +++ b/sys/contrib/openzfs/include/sys/dataset_kstats.h @@ -71,6 +71,7 @@ typedef struct dataset_kstats { int dataset_kstats_create(dataset_kstats_t *, objset_t *); void dataset_kstats_destroy(dataset_kstats_t *); +void dataset_kstats_rename(dataset_kstats_t *dk, const char *); void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t); void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t); diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index 88ef510b744b..cef7933df441 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -825,6 +825,11 @@ extern void spa_sync_allpools(void); extern uint_t zfs_sync_pass_deferred_free; +/* spa sync taskqueues */ +taskq_t *spa_sync_tq_create(spa_t *spa, const char *name); +void spa_sync_tq_destroy(spa_t *spa); +void spa_select_allocator(zio_t *zio); + /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h index 094258d47a48..b1eb06f94fcc 100644 --- a/sys/contrib/openzfs/include/sys/spa_impl.h +++ b/sys/contrib/openzfs/include/sys/spa_impl.h @@ -188,6 +188,12 @@ typedef struct spa_taskqs { taskq_t 
**stqs_taskq; } spa_taskqs_t; +/* one for each thread in the spa sync taskq */ +typedef struct spa_syncthread_info { + kthread_t *sti_thread; + taskq_t *sti_wr_iss_tq; /* assigned wr_iss taskq */ +} spa_syncthread_info_t; + typedef enum spa_all_vdev_zap_action { AVZ_ACTION_NONE = 0, AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */ @@ -265,6 +271,10 @@ struct spa { int spa_alloc_count; int spa_active_allocator; /* selectable allocator */ + /* per-allocator sync thread taskqs */ + taskq_t *spa_sync_tq; + spa_syncthread_info_t *spa_syncthreads; + spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ @@ -456,7 +466,7 @@ extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio); extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags); extern void spa_load_spares(spa_t *spa); diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h index ad9dc3aefd8e..3f2312c23438 100644 --- a/sys/contrib/openzfs/include/sys/vdev_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_impl.h @@ -131,7 +131,10 @@ typedef const struct vdev_ops { * Virtual device properties */ typedef union vdev_queue_class { - list_t vqc_list; + struct { + ulong_t vqc_list_numnodes; + list_t vqc_list; + }; avl_tree_t vqc_tree; } vdev_queue_class_t; diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h index 750ca612b962..9ec2f73b366c 100644 --- a/sys/contrib/openzfs/include/sys/zfs_context.h +++ b/sys/contrib/openzfs/include/sys/zfs_context.h @@ -496,6 +496,8 @@ extern taskq_t *system_taskq; extern taskq_t *system_delay_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); #define taskq_create_proc(a, b, c, d, e, p, f) \ (taskq_create(a, b, c, d, e, f)) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index e1f4d5c04499..25a4b221f05e 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -223,6 +223,9 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) +#define ZIO_ALLOCATOR_NONE (-1) +#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) + #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) @@ -526,6 +529,9 @@ struct zio { /* Taskq dispatching state */ taskq_ent_t io_tqent; + + /* write issue taskq selection, based upon sync thread */ + taskq_t *io_wr_iss_tq; }; enum blk_verify_flag { diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi index f8b0d395161b..bcdcff97666e 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs.abi +++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi @@ -260,6 +260,7 @@ <elf-symbol name='tpool_suspended' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='tpool_wait' type='func-type' 
binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='update_vdev_config_dev_strs' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='update_vdev_config_dev_sysfs_path' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='use_color' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='vdev_expand_proplist' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='vdev_name_to_prop' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -8332,6 +8333,12 @@ <parameter type-id='b59d7dce' name='buflen'/> <return type-id='95e97e5e'/> </function-decl> + <function-decl name='update_vdev_config_dev_sysfs_path' mangled-name='update_vdev_config_dev_sysfs_path' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='update_vdev_config_dev_sysfs_path'> + <parameter type-id='5ce45b60' name='nv'/> + <parameter type-id='80f4b756' name='path'/> + <parameter type-id='80f4b756' name='key'/> + <return type-id='48b5725f'/> + </function-decl> <function-type size-in-bits='64' id='2ec2411e'> <parameter type-id='eaa32e2f'/> <parameter type-id='5ce45b60'/> diff --git a/sys/contrib/openzfs/lib/libzpool/taskq.c b/sys/contrib/openzfs/lib/libzpool/taskq.c index a2e457ef9e60..99a181ec3c93 100644 --- a/sys/contrib/openzfs/lib/libzpool/taskq.c +++ b/sys/contrib/openzfs/lib/libzpool/taskq.c @@ -337,6 +337,36 @@ taskq_destroy(taskq_t *tq) kmem_free(tq, sizeof (taskq_t)); } +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. 
+ */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + (void) pri; (void) minalloc; (void) maxalloc; + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + for (int i = 0; i < nthreads; i++) { + kthreads[i] = tq->tq_threadlist[i]; + } + *ktpp = kthreads; + return (tq); +} + int taskq_member(taskq_t *tq, kthread_t *t) {
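A minimal sketch of how a caller consumes this new interface (not part of the commit; the example_* names are hypothetical), modeled on the spa_sync_tq_create() hunk later in this diff:

static taskq_t *
example_create_synced_tq(int nthreads)
{
	kthread_t **kthreads;

	/* Every worker thread exists before taskq_create_synced() returns. */
	taskq_t *tq = taskq_create_synced("example_tq", nthreads,
	    minclsyspri, nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);

	/*
	 * kthreads[0..nthreads-1] identify the pool threads, e.g. so a
	 * worker can later be recognized by comparing against curthread.
	 */

	/* The caller owns the array and must free it. */
	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
	return (tq);
}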
diff --git a/sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_import_os.c b/sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_import_os.c index 19ba58e79a03..a134c173bc89 100644 --- a/sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_import_os.c +++ b/sys/contrib/openzfs/lib/libzutil/os/freebsd/zutil_import_os.c @@ -250,6 +250,15 @@ zfs_dev_flush(int fd) } void +update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, + const char *key) +{ + (void) nv; + (void) path; + (void) key; +} + +void update_vdevs_config_dev_sysfs_path(nvlist_t *config) { (void) config;
diff --git a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c b/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c index 44ed697dd490..fbfae4f7e685 100644 --- a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c +++ b/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c @@ -766,9 +766,12 @@ no_dev: * Rescan the enclosure sysfs path for turning on enclosure LEDs and store it * in the nvlist * (if applicable). Like: * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + * + * key: The nvlist_t name (like ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH) */ -static void -update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path) +void +update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, + const char *key) { char *upath, *spath; @@ -777,9 +780,9 @@ update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path) spath = zfs_get_enclosure_sysfs_path(upath); if (spath) { - nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, spath); + (void) nvlist_add_string(nv, key, spath); } else { - nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + (void) nvlist_remove_all(nv, key); } free(upath); @@ -799,7 +802,8 @@ sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data) return (1); /* Rescan our enclosure sysfs path for this vdev */ - update_vdev_config_dev_sysfs_path(nv, path); + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); return (0); } @@ -888,7 +892,8 @@ update_vdev_config_dev_strs(nvlist_t *nv) (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vds.vds_devphys); } - update_vdev_config_dev_sysfs_path(nv, path); + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); } else { /* Clear out any stale entries. */ (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index ddad00be412d..f9824ac170ea 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -496,6 +496,13 @@ If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return .Sy ENOSPC . . +.It Sy spa_num_allocators Ns = Ns Sy 4 Pq int +Determines the number of block allocators to use per spa instance. +Capped by the number of actual CPUs in the system. +.Pp +Note that setting this value too high could result in performance +degradation and/or excess fragmentation. +. .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint Limits the number of on-disk error log entries that will be converted to the new format when enabling the @@ -1974,13 +1981,6 @@ and may need to load new metaslabs to satisfy these allocations. .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint Rewrite new block pointers starting in this pass. . -.It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int -This controls the number of threads used by -.Sy dp_sync_taskq . -The default value of -.Sy 75% -will create a maximum of one thread per CPU. -. .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Maximum size of TRIM command. Larger ranges will be split into chunks no larger than this value before @@ -2265,6 +2265,14 @@ If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. . +.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint +Determines the number of CPUs to run write issue taskqs. +.Pp +When 0 (the default), the value to use is computed internally +as the number of actual CPUs in the system divided by the +.Sy spa_num_allocators +value. +. .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c index 9f5f92e194ec..43cd4da02e30 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c @@ -187,19 +187,18 @@ kstat_sysctl_dataset_string(SYSCTL_HANDLER_ARGS) static int kstat_sysctl_io(SYSCTL_HANDLER_ARGS) { - struct sbuf *sb; + struct sbuf sb; kstat_t *ksp = arg1; kstat_io_t *kip = ksp->ks_data; int rc; - sb = sbuf_new_auto(); - if (sb == NULL) - return (ENOMEM); + sbuf_new_for_sysctl(&sb, NULL, 0, req); + /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); /* though wlentime & friends are signed, they will never be negative */ - sbuf_printf(sb, + sbuf_printf(&sb, "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", kip->nread, kip->nwritten, @@ -207,25 +206,21 @@ kstat_sysctl_io(SYSCTL_HANDLER_ARGS) kip->wtime, kip->wlentime, kip->wlastupdate, kip->rtime, kip->rlentime, kip->rlastupdate, kip->wcnt, kip->rcnt); - rc = sbuf_finish(sb); - if (rc == 0) - rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); - sbuf_delete(sb); + rc = sbuf_finish(&sb); + sbuf_delete(&sb); return (rc); } static int kstat_sysctl_raw(SYSCTL_HANDLER_ARGS) { - struct sbuf *sb; + struct sbuf sb; void *data; kstat_t *ksp = arg1; void *(*addr_op)(kstat_t *ksp, loff_t index); int n, has_header, rc = 0; - sb = sbuf_new_auto(); - if (sb == NULL) - return (ENOMEM); + sbuf_new_for_sysctl(&sb, NULL, PAGE_SIZE, req); if (ksp->ks_raw_ops.addr) addr_op = ksp->ks_raw_ops.addr; @@ -258,8 +253,10 @@ restart_headers: if (has_header) { if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart_headers; - if (rc == 0) - sbuf_printf(sb, "\n%s", ksp->ks_raw_buf); + if (rc == 0) { + sbuf_cat(&sb, "\n"); + sbuf_cat(&sb, ksp->ks_raw_buf); + } } while ((data = addr_op(ksp, n)) != NULL) { @@ -270,22 +267,19 @@ restart: if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (rc == 0) - sbuf_printf(sb, "%s", ksp->ks_raw_buf); + sbuf_cat(&sb, ksp->ks_raw_buf); } else { ASSERT3U(ksp->ks_ndata, ==, 1); -
sbuf_hexdump(sb, ksp->ks_data, + sbuf_hexdump(&sb, ksp->ks_data, ksp->ks_data_size, NULL, 0); } n++; } free(ksp->ks_raw_buf, M_TEMP); mutex_exit(ksp->ks_lock); - sbuf_trim(sb); - rc = sbuf_finish(sb); - if (rc == 0) - rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); - sbuf_delete(sb); + rc = sbuf_finish(&sb); + sbuf_delete(&sb); return (rc); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c index 3fba5ed3c228..a005debaf6e3 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c @@ -220,6 +220,7 @@ taskq_create_impl(const char *name, int nthreads, pri_t pri, nthreads = MAX((mp_ncpus * nthreads) / 100, 1); tq = kmem_alloc(sizeof (*tq), KM_SLEEP); + tq->tq_nthreads = nthreads; tq->tq_queue = taskqueue_create(name, M_WAITOK, taskqueue_thread_enqueue, &tq->tq_queue); taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT, @@ -254,6 +255,87 @@ taskq_destroy(taskq_t *tq) kmem_free(tq, sizeof (*tq)); } +static void taskq_sync_assign(void *arg); + +typedef struct taskq_sync_arg { + kthread_t *tqa_thread; + kcondvar_t tqa_cv; + kmutex_t tqa_lock; + int tqa_ready; +} taskq_sync_arg_t; + +static void +taskq_sync_assign(void *arg) +{ + taskq_sync_arg_t *tqa = arg; + + mutex_enter(&tqa->tqa_lock); + tqa->tqa_thread = curthread; + tqa->tqa_ready = 1; + cv_signal(&tqa->tqa_cv); + while (tqa->tqa_ready == 1) + cv_wait(&tqa->tqa_cv, &tqa->tqa_lock); + mutex_exit(&tqa->tqa_lock); +} + +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. + */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + taskq_sync_arg_t *tqs = kmem_zalloc(sizeof (*tqs) * nthreads, KM_SLEEP); + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + /* spawn all syncthreads */ + for (int i = 0; i < nthreads; i++) { + cv_init(&tqs[i].tqa_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&tqs[i].tqa_lock, NULL, MUTEX_DEFAULT, NULL); + (void) taskq_dispatch(tq, taskq_sync_assign, + &tqs[i], TQ_FRONT); + } + + /* wait on all syncthreads to start */ + for (int i = 0; i < nthreads; i++) { + mutex_enter(&tqs[i].tqa_lock); + while (tqs[i].tqa_ready == 0) + cv_wait(&tqs[i].tqa_cv, &tqs[i].tqa_lock); + mutex_exit(&tqs[i].tqa_lock); + } + + /* let all syncthreads resume, finish */ + for (int i = 0; i < nthreads; i++) { + mutex_enter(&tqs[i].tqa_lock); + tqs[i].tqa_ready = 2; + cv_broadcast(&tqs[i].tqa_cv); + mutex_exit(&tqs[i].tqa_lock); + } + taskq_wait(tq); + + for (int i = 0; i < nthreads; i++) { + kthreads[i] = tqs[i].tqa_thread; + mutex_destroy(&tqs[i].tqa_lock); + cv_destroy(&tqs[i].tqa_cv); + } + kmem_free(tqs, sizeof (*tqs) * nthreads); + + *ktpp = kthreads; + return (tq); +} + int taskq_member(taskq_t *tq, kthread_t *thread) { @@ -398,22 +480,34 @@ void taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, taskq_ent_t *task) { - int prio; - /* * If TQ_FRONT is given, we want higher priority for this task, so it * can go at the front of 
the queue. */ - prio = !!(flags & TQ_FRONT); - task->tqent_id = 0; + task->tqent_task.ta_priority = !!(flags & TQ_FRONT); task->tqent_func = func; task->tqent_arg = arg; - - TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task); taskqueue_enqueue(tq->tq_queue, &task->tqent_task); } void +taskq_init_ent(taskq_ent_t *task) +{ + TASK_INIT(&task->tqent_task, 0, taskq_run_ent, task); + task->tqent_func = NULL; + task->tqent_arg = NULL; + task->tqent_id = 0; + task->tqent_type = NORMAL_TASK; + task->tqent_rc = 0; +} + +int +taskq_empty_ent(taskq_ent_t *task) +{ + return (task->tqent_task.ta_pending == 0); +} + +void taskq_wait(taskq_t *tq) { taskqueue_quiesce(tq->tq_queue); @@ -439,9 +533,3 @@ taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused) { taskqueue_drain_all(tq->tq_queue); } - -int -taskq_empty_ent(taskq_ent_t *t) -{ - return (t->tqent_task.ta_pending == 0); -} diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c index 9a268573528c..00c1acf57710 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c @@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$"); #include <sys/spa_impl.h> #include <sys/stat.h> #include <sys/sunddi.h> +#include <sys/sysctl.h> #include <sys/systm.h> #include <sys/taskqueue.h> #include <sys/uio.h> @@ -336,6 +337,8 @@ static moduledata_t zfs_mod = { EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); #endif +FEATURE(zfs, "OpenZFS support"); + DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY); MODULE_VERSION(zfsctrl, 1); #if __FreeBSD_version > 1300092 diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 0830e1c26546..6a7c2d2811b1 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -1319,6 +1319,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) } } strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + dataset_kstats_rename(&zv->zv_kstat, newname); } /* diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c index d18f935b167c..79a1a8e5a5aa 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -1262,6 +1262,42 @@ taskq_destroy(taskq_t *tq) } EXPORT_SYMBOL(taskq_destroy); +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. 
+ */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + taskq_thread_t *tqt; + int i = 0; + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + /* taskq_create spawns all the threads before returning */ + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) { + kthreads[i] = tqt->tqt_thread; + i++; + } + + ASSERT3S(i, ==, nthreads); + *ktpp = kthreads; + + return (tq); +} +EXPORT_SYMBOL(taskq_create_synced); + static unsigned int spl_taskq_kick = 0; /* diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index a1db5c57c18b..2792bc027213 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1488,7 +1488,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) * read-only flag, pretend it was set, as done for snapshots. */ if (!canwrite) - vfs->vfs_readonly = true; + vfs->vfs_readonly = B_TRUE; error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); if (error) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index fd0fd2c36dcd..384f5785a498 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -1528,6 +1528,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); + + dataset_kstats_rename(&zv->zv_kstat, newname); } void diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index bcc6ddd5e81b..0a2411a2d572 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -1025,7 +1025,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, size_t len, dlen; struct abd_iter caiters[3]; struct abd_iter daiter; - void *caddrs[3]; + void *caddrs[3], *daddr; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_dabd = NULL; @@ -1057,10 +1057,13 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, if (dsize > 0) { IMPLY(abd_is_gang(dabd), c_dabd != NULL); abd_iter_map(&daiter); + daddr = daiter.iter_mapaddr; len = MIN(daiter.iter_mapsize, len); dlen = len; - } else + } else { + daddr = NULL; dlen = 0; + } /* must be progressive */ ASSERT3U(len, >, 0); @@ -1070,7 +1073,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, */ ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + func_raidz_gen(caddrs, daddr, len, dlen); for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&caiters[i]); diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c index 767a461e0026..2ac058fd2c93 100644 --- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c +++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c @@ -199,6 +199,18 @@ dataset_kstats_destroy(dataset_kstats_t *dk) } void +dataset_kstats_rename(dataset_kstats_t *dk, const char *name) +{ + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + char *ds_name; + + ds_name = 
KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name); + ASSERT3S(ds_name, !=, NULL); + (void) strlcpy(ds_name, name, + KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); +} + +void dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten) { diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index c0c2692c113a..0a179fffb16a 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -4587,6 +4587,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } } +/* + * Syncs out a range of dirty records for indirect or leaf dbufs. May be + * called recursively from dbuf_sync_indirect(). + */ void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { @@ -5005,7 +5009,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) } -/* Issue I/O to commit a dirty buffer to disk. */ +/* + * Populate dr->dr_zio with a zio to commit a dirty buffer to disk. + * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio). + */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index 76e65b5506a9..f098e1daa44b 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -1639,28 +1639,90 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) kmem_free(bp, sizeof (*bp)); } +typedef struct sync_objset_arg { + zio_t *soa_zio; + objset_t *soa_os; + dmu_tx_t *soa_tx; + kmutex_t soa_mutex; + int soa_count; + taskq_ent_t soa_tq_ent; +} sync_objset_arg_t; + typedef struct sync_dnodes_arg { - multilist_t *sda_list; - int sda_sublist_idx; - multilist_t *sda_newlist; - dmu_tx_t *sda_tx; + multilist_t *sda_list; + int sda_sublist_idx; + multilist_t *sda_newlist; + sync_objset_arg_t *sda_soa; } sync_dnodes_arg_t; +static void sync_meta_dnode_task(void *arg); + static void sync_dnodes_task(void *arg) { sync_dnodes_arg_t *sda = arg; + sync_objset_arg_t *soa = sda->sda_soa; + objset_t *os = soa->soa_os; multilist_sublist_t *ms = multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); - dmu_objset_sync_dnodes(ms, sda->sda_tx); + dmu_objset_sync_dnodes(ms, soa->soa_tx); multilist_sublist_unlock(ms); kmem_free(sda, sizeof (*sda)); + + mutex_enter(&soa->soa_mutex); + ASSERT(soa->soa_count != 0); + if (--soa->soa_count != 0) { + mutex_exit(&soa->soa_mutex); + return; + } + mutex_exit(&soa->soa_mutex); + + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent); } +/* + * Issue the zio_nowait() for all dirty record zios on the meta dnode, + * then trigger the callback for the zil_sync. This runs once for each + * objset, only after any/all sublists in the objset have been synced. + */ +static void +sync_meta_dnode_task(void *arg) +{ + sync_objset_arg_t *soa = arg; + objset_t *os = soa->soa_os; + dmu_tx_t *tx = soa->soa_tx; + int txgoff = tx->tx_txg & TXG_MASK; + dbuf_dirty_record_t *dr; + + ASSERT0(soa->soa_count); + + list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; + while ((dr = list_remove_head(list)) != NULL) { + ASSERT0(dr->dr_dbuf->db_level); + zio_nowait(dr->dr_zio); + } + + /* Enable dnode backfill if enough objects have been freed. */ + if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { + os->os_rescan_dnodes = B_TRUE; + os->os_freed_dnodes = 0; + } + + /* + * Free intent log blocks up to this tx. 
+ */ + zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; + zio_nowait(soa->soa_zio); + + mutex_destroy(&soa->soa_mutex); + kmem_free(soa, sizeof (*soa)); +} /* called from dsl */ void @@ -1670,8 +1732,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; - list_t *list; - dbuf_dirty_record_t *dr; int num_sublists; multilist_t *ml; blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); @@ -1758,39 +1818,49 @@ offsetof(dnode_t, dn_dirty_link[txgoff])); } + /* + * zio_nowait(zio) is done after any/all sublist and meta dnode + * zios have been nowaited, and the zil_sync() has been performed. + * The soa is freed at the end of sync_meta_dnode_task. + */ + sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP); + soa->soa_zio = zio; + soa->soa_os = os; + soa->soa_tx = tx; + taskq_init_ent(&soa->soa_tq_ent); + mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL); + ml = &os->os_dirty_dnodes[txgoff]; - num_sublists = multilist_get_num_sublists(ml); + soa->soa_count = num_sublists = multilist_get_num_sublists(ml); + for (int i = 0; i < num_sublists; i++) { if (multilist_sublist_is_empty_idx(ml, i)) - continue; - sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); - sda->sda_list = ml; - sda->sda_sublist_idx = i; - sda->sda_tx = tx; - (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - sync_dnodes_task, sda, 0); - /* callback frees sda */ - } - taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); - - list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_remove_head(list)) != NULL) { - ASSERT0(dr->dr_dbuf->db_level); - zio_nowait(dr->dr_zio); + soa->soa_count--; } - /* Enable dnode backfill if enough objects have been freed. */ - if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { - os->os_rescan_dnodes = B_TRUE; - os->os_freed_dnodes = 0; + if (soa->soa_count == 0) { + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent); + } else { + /* + * Sync sublists in parallel. The last to finish + * (i.e., when soa->soa_count reaches zero) must + * dispatch sync_meta_dnode_task. + */ + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(ml, i)) + continue; + sync_dnodes_arg_t *sda = + kmem_alloc(sizeof (*sda), KM_SLEEP); + sda->sda_list = ml; + sda->sda_sublist_idx = i; + sda->sda_soa = soa; + (void) taskq_dispatch( + dmu_objset_pool(os)->dp_sync_taskq, + sync_dnodes_task, sda, 0); + /* sync_dnodes_task frees sda */ + } } - - /* - * Free intent log blocks up to this tx. - */ - zil_sync(os->os_zil, tx); - os->os_phys->os_zil_header = os->os_zil_header; - zio_nowait(zio); } boolean_t
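The restructured dmu_objset_sync() above removes the old blocking taskq_wait() on dp_sync_taskq: each sync_dnodes_task() decrements soa_count under soa_mutex, and only the task that drops it to zero dispatches sync_meta_dnode_task(). A distilled sketch of that last-finisher pattern (not from the commit; example_* names are hypothetical, and ea_tq_ent is assumed to have been set up with taskq_init_ent()):

typedef struct example_arg {
	kmutex_t ea_mutex;
	int ea_count;		/* subtasks still outstanding */
	taskq_ent_t ea_tq_ent;	/* initialized via taskq_init_ent() */
} example_arg_t;

static void example_final_task(void *arg);

static void
example_sub_task(void *arg)
{
	example_arg_t *ea = arg;

	/* ... per-sublist work happens here ... */

	mutex_enter(&ea->ea_mutex);
	ASSERT(ea->ea_count != 0);
	if (--ea->ea_count != 0) {
		mutex_exit(&ea->ea_mutex);
		return;
	}
	mutex_exit(&ea->ea_mutex);

	/* Last finisher: kick off the final phase; nobody blocks waiting. */
	taskq_dispatch_ent(system_taskq, example_final_task, ea,
	    TQ_FRONT, &ea->ea_tq_ent);
}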
diff --git a/sys/contrib/openzfs/module/zfs/dnode_sync.c b/sys/contrib/openzfs/module/zfs/dnode_sync.c index 8e39af83bb0a..8cffbdb9d20b 100644 --- a/sys/contrib/openzfs/module/zfs/dnode_sync.c +++ b/sys/contrib/openzfs/module/zfs/dnode_sync.c @@ -627,6 +627,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. + * Does not wait for zio completions. */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx)
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index d6db61729223..62a1649d3786 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -2069,8 +2069,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, return (error); } +/* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */ void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); @@ -2098,7 +2099,7 @@ ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; } - dmu_objset_sync(ds->ds_objset, zio, tx); + dmu_objset_sync(ds->ds_objset, rio, tx); } /*
diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c index 17b971248283..370c6a010dca 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_pool.c +++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c @@ -141,11 +141,6 @@ uint_t zfs_delay_min_dirty_percent = 60; uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* - * This determines the number of threads used by the dp_sync_taskq. - */ -static int zfs_sync_taskq_batch_pct = 75; - -/* * These tunables determine the behavior of how zil_itxg_clean() is * called via zil_clean() in the context of spa_sync(). When an itxg * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching. @@ -214,9 +209,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) txg_list_create(&dp->dp_early_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); - dp->dp_sync_taskq = taskq_create("dp_sync_taskq", - zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, - TASKQ_THREADS_CPU_PCT); + dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq"); dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", zfs_zil_clean_taskq_nthr_pct, minclsyspri, @@ -409,7 +402,7 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); - taskq_destroy(dp->dp_sync_taskq); + spa_sync_tq_destroy(dp->dp_spa); /* * We can't set retry to TRUE since we're explicitly specifying @@ -674,7 +667,7 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { - zio_t *zio; + zio_t *rio; /* root zio for all dirty dataset syncs */ dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; @@ -704,9 +697,10 @@ } /* - * Write out all dirty blocks of dirty datasets. + * Write out all dirty blocks of dirty datasets. Note, this could + * create a very large (+10k) zio tree. */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because @@ -715,9 +709,9 @@ */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Update the long range free counter after @@ -748,13 +742,13 @@ * user accounting information (and we won't get confused * about which blocks are part of the snapshot). 
*/ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { objset_t *os = ds->ds_objset; ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); /* * Release any key mappings created by calls to @@ -767,7 +761,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Now that the datasets have been completely synced, we can @@ -1481,9 +1475,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW, "How quickly delay approaches infinity"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, - "Max percent of CPUs that are used to sync dirty data"); - ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, "Max percent of CPUs that are used per dp_sync_taskq"); diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c index f9daaabbed3e..0ae4c331dd36 100644 --- a/sys/contrib/openzfs/module/zfs/sa.c +++ b/sys/contrib/openzfs/module/zfs/sa.c @@ -23,6 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2023 RackTop Systems, Inc. */ #include <sys/zfs_context.h> @@ -369,7 +370,7 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, if (bulk[i].sa_data) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_addr, bulk[i].sa_data, - bulk[i].sa_size); + MIN(bulk[i].sa_size, bulk[i].sa_length)); } continue; diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index aa97144f16e4..68f367c1c744 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -99,6 +99,7 @@ #include "zfs_prop.h" #include "zfs_comutil.h" +#include <cityhash.h> /* * spa_thread() existed on Illumos as a parent thread for the various worker @@ -128,16 +129,16 @@ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ - ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ + ZTI_MODE_SYNC, /* sync thread assigned */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } -#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } +#define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) @@ -158,14 +159,14 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE - * macros. Other operations process a large amount of data; the ZTI_BATCH + * macros. Other operations process a large amount of data; the ZTI_SCALE * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. 
The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the - * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, - * but with number of taskqs also scaling with number of CPUs. + * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs + * that scales with the number of CPUs. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that @@ -175,7 +176,7 @@ static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ - { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ + { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ @@ -206,6 +207,8 @@ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ #endif +static uint_t zio_taskq_wr_iss_ncpus = 0; + /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. @@ -1054,21 +1057,34 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t cpus, flags = TASKQ_DYNAMIC; -#ifdef HAVE_SYSDC - boolean_t batch = B_FALSE; -#endif switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); break; - case ZTI_MODE_BATCH: -#ifdef HAVE_SYSDC - batch = B_TRUE; -#endif + case ZTI_MODE_SYNC: + + /* + * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus', + * not to exceed the number of spa allocators. + */ + if (zio_taskq_wr_iss_ncpus == 0) { + count = MAX(boot_ncpus / spa->spa_alloc_count, 1); + } else { + count = MAX(1, + boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus)); + } + count = MAX(count, (zio_taskq_batch_pct + 99) / 100); + count = MIN(count, spa->spa_alloc_count); + + /* + * zio_taskq_batch_pct is unbounded and may exceed 100%, but no + * single taskq may have more threads than 100% of online cpus. + */ + value = (zio_taskq_batch_pct + count / 2) / count; + value = MIN(value, 100); flags |= TASKQ_THREADS_CPU_PCT; - value = MIN(zio_taskq_batch_pct, 100); break;
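A worked example of the sizing above (not from the commit), assuming a hypothetical 32-CPU machine, spa_alloc_count = 4, zio_taskq_wr_iss_ncpus = 0, and the default zio_taskq_batch_pct of 80:

/*
 *	count = MAX(32 / 4, 1)			= 8
 *	count = MAX(8, (80 + 99) / 100)		= 8
 *	count = MIN(8, 4)			= 4	wr_iss taskqs
 *	value = (80 + 4 / 2) / 4		= 20
 *	value = MIN(20, 100)			= 20
 *
 * Result: four write-issue taskqs, each sized via TASKQ_THREADS_CPU_PCT
 * at 20% of the CPUs (6 threads apiece), ~80% of CPUs in total.
 */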
 case ZTI_MODE_SCALE: @@ -1115,7 +1131,7 @@ default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " - "spa_activate()", + "spa_taskqs_init()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } @@ -1137,9 +1153,6 @@ #ifdef HAVE_SYSDC if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; - (void) zio_taskq_basedc; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); @@ -1200,12 +1213,11 @@ /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention - * on the taskq itself. In that case we choose which taskq at random by using - * the low bits of gethrtime(). + * on the taskq itself. */ -void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +static taskq_t * +spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + zio_t *zio) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; @@ -1213,12 +1225,27 @@ ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); + if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && + (zio != NULL) && (zio->io_wr_iss_tq != NULL)) { + /* dispatch to assigned write issue taskq */ + tq = zio->io_wr_iss_tq; + return (tq); + } + if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } + return (tq); +} +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, + zio_t *zio) +{ + taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio); taskq_dispatch_ent(tq, func, arg, flags, ent); } @@ -1229,20 +1256,8 @@ void spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags) { - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - taskq_t *tq; - taskqid_t id; - - ASSERT3P(tqs->stqs_taskq, !=, NULL); - ASSERT3U(tqs->stqs_count, !=, 0); - - if (tqs->stqs_count == 1) { - tq = tqs->stqs_taskq[0]; - } else { - tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; - } - - id = taskq_dispatch(tq, func, arg, flags); + taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL); + taskqid_t id = taskq_dispatch(tq, func, arg, flags); if (id) taskq_wait_id(tq, id); } @@ -9649,6 +9664,104 @@ spa_sync_allpools(void) mutex_exit(&spa_namespace_lock); } +taskq_t * +spa_sync_tq_create(spa_t *spa, const char *name) +{ + kthread_t **kthreads; + + ASSERT(spa->spa_sync_tq == NULL); + ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); + + /* + * - do not allow more allocators than cpus. + * - there may be more cpus than allocators. + * - do not allow more sync taskq threads than allocators or cpus. + */ + int nthreads = spa->spa_alloc_count; + spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * + nthreads, KM_SLEEP); + + spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, + nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); + VERIFY(spa->spa_sync_tq != NULL); + VERIFY(kthreads != NULL); + + spa_taskqs_t *tqs = + &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE]; + + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) { + ti->sti_thread = kthreads[i]; + if (w == tqs->stqs_count) { + w = 0; + } + ti->sti_wr_iss_tq = tqs->stqs_taskq[w]; + } + + kmem_free(kthreads, sizeof (*kthreads) * nthreads); + return (spa->spa_sync_tq); +} + +void +spa_sync_tq_destroy(spa_t *spa) +{ + ASSERT(spa->spa_sync_tq != NULL); + + taskq_wait(spa->spa_sync_tq); + taskq_destroy(spa->spa_sync_tq); + kmem_free(spa->spa_syncthreads, + sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); + spa->spa_sync_tq = NULL; +} + +void +spa_select_allocator(zio_t *zio) +{ + zbookmark_phys_t *bm = &zio->io_bookmark; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + /* + * A gang block (for example) may have inherited its parent's + * allocator, in which case there is nothing further to do here. 
+ */ + if (ZIO_HAS_ALLOCATOR(zio)) + return; + + ASSERT(spa != NULL); + ASSERT(bm != NULL); + + /* + * First try to use an allocator assigned to the syncthread, and set + * the corresponding write issue taskq for the allocator. + * Note, we must have an open pool to do this. + */ + if (spa->spa_sync_tq != NULL) { + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { + if (ti->sti_thread == curthread) { + zio->io_allocator = i; + zio->io_wr_iss_tq = ti->sti_wr_iss_tq; + return; + } + } + } + + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. + */ + uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, + bm->zb_blkid >> 20); + + zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; + zio->io_wr_iss_tq = NULL; +}
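Distilling the fallback path above (a hypothetical standalone version, not from the commit): the bookmark is hashed with the region (zb_blkid >> 20) rather than the raw block id, so logically adjacent blocks map to the same allocator. With 128 KiB records, one 2^20-block region spans 2^20 * 128 KiB = 128 GiB of object offset.

static uint_t
example_pick_allocator(const zbookmark_phys_t *bm, uint_t alloc_count)
{
	/* Blocks within the same 2^20-block region share an allocator. */
	uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object,
	    bm->zb_level, bm->zb_blkid >> 20);

	return ((uint_t)hv % alloc_count);
}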
+ /* * ========================================================================== * Miscellaneous routines * ========================================================================== @@ -10242,3 +10355,6 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); /* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW, + "Number of CPUs to run write issue taskqs");
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index c7472f972cc2..3990af98c732 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -388,7 +388,11 @@ uint_t spa_asize_inflation = 24; uint_t spa_slop_shift = 5; static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; -static const int spa_allocators = 4; + +/* + * Number of allocators to use, per spa instance + */ +static int spa_num_allocators = 4; /* * Spa active allocator. @@ -730,7 +734,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) if (altroot) spa->spa_root = spa_strdup(altroot); - spa->spa_alloc_count = spa_allocators; + /* Do not allow more allocators than CPUs. */ + spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus); + spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { @@ -739,6 +745,7 @@ avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } + avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, @@ -3009,3 +3016,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_uint, ZMOD_RW, "Reserved free space in pool"); + +ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, + "Number of allocators per spa, capped by ncpus");
diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index 08d918467d03..092b3f375be0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -273,8 +273,10 @@ vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) { zio_priority_t p = zio->io_priority; vq->vq_cqueued |= 1U << p; - if (vdev_queue_class_fifo(p)) + if (vdev_queue_class_fifo(p)) { list_insert_tail(&vq->vq_class[p].vqc_list, zio); + vq->vq_class[p].vqc_list_numnodes++; + } else avl_add(&vq->vq_class[p].vqc_tree, zio); } @@ -288,6 +290,7 @@ vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) { list_t *list = &vq->vq_class[p].vqc_list; list_remove(list, zio); empty = list_is_empty(list); + vq->vq_class[p].vqc_list_numnodes--; } else { avl_tree_t *tree = &vq->vq_class[p].vqc_tree; avl_remove(tree, zio); @@ -1069,7 +1072,7 @@ vdev_queue_class_length(vdev_t *vd, zio_priority_t p) { vdev_queue_t *vq = &vd->vdev_queue; if (vdev_queue_class_fifo(p)) - return (list_is_empty(&vq->vq_class[p].vqc_list) == 0); + return (vq->vq_class[p].vqc_list_numnodes); else return (avl_numnodes(&vq->vq_class[p].vqc_tree)); }
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 4eb276352a23..2f5b423ee72e 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -899,6 +899,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; + zio->io_allocator = ZIO_ALLOCATOR_NONE; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) || (pipeline & ZIO_STAGE_READY) == 0; @@ -2007,7 +2008,7 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, - &zio->io_tqent); + &zio->io_tqent, zio); } static boolean_t @@ -2032,8 +2033,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q) static zio_t * zio_issue_async(zio_t *zio) { + ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio)); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (NULL); } @@ -2347,6 +2348,9 @@ zio_wait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } __zio_execute(zio); mutex_enter(&zio->io_lock); @@ -2399,6 +2403,9 @@ zio_nowait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); 
diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c
index 08d918467d03..092b3f375be0 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_queue.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c
@@ -273,8 +273,10 @@ vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio)
 {
 	zio_priority_t p = zio->io_priority;
 	vq->vq_cqueued |= 1U << p;
-	if (vdev_queue_class_fifo(p))
+	if (vdev_queue_class_fifo(p)) {
 		list_insert_tail(&vq->vq_class[p].vqc_list, zio);
+		vq->vq_class[p].vqc_list_numnodes++;
+	}
 	else
 		avl_add(&vq->vq_class[p].vqc_tree, zio);
 }
@@ -288,6 +290,7 @@ vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio)
 		list_t *list = &vq->vq_class[p].vqc_list;
 		list_remove(list, zio);
 		empty = list_is_empty(list);
+		vq->vq_class[p].vqc_list_numnodes--;
 	} else {
 		avl_tree_t *tree = &vq->vq_class[p].vqc_tree;
 		avl_remove(tree, zio);
@@ -1069,7 +1072,7 @@ vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 	if (vdev_queue_class_fifo(p))
-		return (list_is_empty(&vq->vq_class[p].vqc_list) == 0);
+		return (vq->vq_class[p].vqc_list_numnodes);
 	else
 		return (avl_numnodes(&vq->vq_class[p].vqc_tree));
 }
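The vdev_queue.c hunk above replaces a mere empty/non-empty test (the old code returned 0 or 1, never a real depth) with an explicit vqc_list_numnodes counter, so vdev_queue_class_length() reports the true FIFO length in O(1). A self-contained sketch of the same pattern; the struct and function names are illustrative stand-ins, not the kernel's:

    #include <stddef.h>
    #include <assert.h>

    struct node { struct node *next; };

    struct fifo {
        struct node *head, *tail;
        size_t numnodes;        /* analogous to vqc_list_numnodes */
    };

    static void
    fifo_insert_tail(struct fifo *f, struct node *n)
    {
        n->next = NULL;
        if (f->tail != NULL)
            f->tail->next = n;
        else
            f->head = n;
        f->tail = n;
        f->numnodes++;          /* bump on insert, as the patch does */
    }

    static size_t
    fifo_length(const struct fifo *f)
    {
        return (f->numnodes);   /* O(1): no list traversal */
    }

    int
    main(void)
    {
        struct fifo f = { NULL, NULL, 0 };
        struct node a, b;

        fifo_insert_tail(&f, &a);
        fifo_insert_tail(&f, &b);
        assert(fifo_length(&f) == 2);
        return (0);
    }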
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 4eb276352a23..2f5b423ee72e 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -899,6 +899,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 	zio->io_pipeline_trace = ZIO_STAGE_OPEN;
+	zio->io_allocator = ZIO_ALLOCATOR_NONE;
 
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) ||
 	    (pipeline & ZIO_STAGE_READY) == 0;
@@ -2007,7 +2008,7 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 	 */
 	ASSERT(taskq_empty_ent(&zio->io_tqent));
 	spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags,
-	    &zio->io_tqent);
+	    &zio->io_tqent, zio);
 }
 
 static boolean_t
@@ -2032,8 +2033,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
 static zio_t *
 zio_issue_async(zio_t *zio)
 {
+	ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio));
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
-
 	return (NULL);
 }
@@ -2347,6 +2348,9 @@ zio_wait(zio_t *zio)
 
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		spa_select_allocator(zio);
+	}
 	__zio_execute(zio);
 
 	mutex_enter(&zio->io_lock);
@@ -2399,6 +2403,9 @@ zio_nowait(zio_t *zio)
 
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		spa_select_allocator(zio);
+	}
 	__zio_execute(zio);
 }
@@ -2864,6 +2871,13 @@ zio_gang_issue(zio_t *zio)
 }
 
 static void
+zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
+{
+	cio->io_allocator = pio->io_allocator;
+	cio->io_wr_iss_tq = pio->io_wr_iss_tq;
+}
+
+static void
 zio_write_gang_member_ready(zio_t *zio)
 {
 	zio_t *pio = zio_unique_parent(zio);
@@ -2934,6 +2948,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		gbh_copies = MIN(2, spa_max_replication(spa));
 	}
 
+	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
 	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@@ -2997,6 +3012,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 	    zio_write_gang_done, NULL, pio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
+	zio_gang_inherit_allocator(pio, zio);
+
 	/*
 	 * Create and nowait the gang children.
 	 */
@@ -3027,6 +3044,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
+		zio_gang_inherit_allocator(zio, cio);
+
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(has_data);
@@ -3539,6 +3558,7 @@ zio_io_to_allocate(spa_t *spa, int allocator)
 		return (NULL);
 
 	ASSERT(IO_IS_ALLOCATING(zio));
+	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 	/*
 	 * Try to place a reservation for this zio. If we're unable to
@@ -3575,21 +3595,12 @@ zio_dva_throttle(zio_t *zio)
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 
-	zbookmark_phys_t *bm = &zio->io_bookmark;
-	/*
-	 * We want to try to use as many allocators as possible to help improve
-	 * performance, but we also want logically adjacent IOs to be physically
-	 * adjacent to improve sequential read performance. We chunk each object
-	 * into 2^20 block regions, and then hash based on the objset, object,
-	 * level, and region to accomplish both of these goals.
-	 */
-	int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
-	    bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
-	zio->io_allocator = allocator;
+	int allocator = zio->io_allocator;
 	zio->io_metaslab_class = mc;
 	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
 	avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
@@ -3663,6 +3674,7 @@ zio_dva_allocate(zio_t *zio)
 	 * sync write performance. If a log allocation fails, we will fall
 	 * back to spa_sync() which is abysmal for performance.
 	 */
+	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 	    &zio->io_alloc_list, zio, zio->io_allocator);
@@ -4515,6 +4527,7 @@ zio_ready(zio_t *zio)
 			ASSERT(IO_IS_ALLOCATING(zio));
 			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(zio->io_metaslab_class != NULL);
+			ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 			/*
 			 * We were unable to allocate anything, unreserve and
@@ -4601,6 +4614,7 @@ zio_dva_throttle_done(zio_t *zio)
 	}
 
 	ASSERT(IO_IS_ALLOCATING(pio));
+	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	ASSERT3P(zio, !=, zio->io_logical);
 	ASSERT(zio->io_logical != NULL);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
@@ -4663,6 +4677,7 @@ zio_done(zio_t *zio)
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(zio->io_bp != NULL);
+		ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 		metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
 		    zio->io_allocator);
@@ -4928,7 +4943,7 @@ zio_done(zio_t *zio)
 			ASSERT(taskq_empty_ent(&zio->io_tqent));
 			spa_taskq_dispatch_ent(zio->io_spa,
 			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
-			    zio_reexecute, zio, 0, &zio->io_tqent);
+			    zio_reexecute, zio, 0, &zio->io_tqent, NULL);
 		}
 		return (NULL);
 	}
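Taken together, the zio.c changes above give every zio a "no allocator yet" sentinel at creation, select an allocator exactly once per write in zio_wait()/zio_nowait(), and make gang children inherit the parent's choice so related I/Os stay on one allocator. A simplified stand-alone sketch of that lifecycle; the sentinel value, field names, and helpers below are illustrative stand-ins, not the kernel's definitions:

    #include <assert.h>
    #include <limits.h>
    #include <stdint.h>

    #define ZIO_ALLOCATOR_NONE      UINT_MAX    /* stand-in sentinel value */
    #define ZIO_HAS_ALLOCATOR(z)    ((z)->io_allocator != ZIO_ALLOCATOR_NONE)

    struct zio_sketch {
        unsigned io_allocator;
    };

    /* Pick an allocator at most once, as spa_select_allocator() does. */
    static void
    select_allocator(struct zio_sketch *zio, uint64_t hv, unsigned alloc_count)
    {
        if (ZIO_HAS_ALLOCATOR(zio))
            return;
        zio->io_allocator = (unsigned)(hv % alloc_count);
    }

    /* Children reuse the parent's choice, like zio_gang_inherit_allocator(). */
    static void
    gang_inherit_allocator(const struct zio_sketch *pio, struct zio_sketch *cio)
    {
        cio->io_allocator = pio->io_allocator;
    }

    int
    main(void)
    {
        struct zio_sketch parent = { ZIO_ALLOCATOR_NONE };
        struct zio_sketch child = { ZIO_ALLOCATOR_NONE };

        select_allocator(&parent, 0xdeadbeefULL, 4);    /* as zio_wait()/zio_nowait() do for writes */
        gang_inherit_allocator(&parent, &child);

        assert(ZIO_HAS_ALLOCATOR(&parent));
        assert(child.io_allocator == parent.io_allocator);
        return (0);
    }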
diff --git a/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in b/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in
index 23c3ed6ff408..d56967d7a8b1 100644
--- a/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in
+++ b/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in
@@ -24,6 +24,7 @@ BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
 BuildArch: noarch
 
 Requires: dkms >= 2.2.0.3
+Requires(pre): dkms >= 2.2.0.3
 Requires(post): dkms >= 2.2.0.3
 Requires(preun): dkms >= 2.2.0.3
 Requires: gcc, make, perl, diffutils
@@ -68,9 +69,92 @@ fi
 %defattr(-,root,root)
 /usr/src/%{module}-%{version}
 
+%pre
+echo "Running pre installation script: $0. Parameters: $*"
+# We don't want any other versions lingering around in dkms.
+# Tests with 'dnf' showed that in case of reinstall or upgrade,
+# the preun scriptlet removed the version we are trying to install.
+# Because of this, find all zfs dkms sources in /var/lib/dkms and
+# remove them, if we find a matching version in dkms.
+
+dkms_root=/var/lib/dkms
+if [ -d ${dkms_root}/%{module} ]; then
+    cd ${dkms_root}/%{module}
+    for x in [[:digit:]]*; do
+        [ -d "$x" ] || continue
+        otherver="$x"
+        opath="${dkms_root}/%{module}/${otherver}"
+        if [ "$otherver" != %{version} ]; then
+            # This is a workaround for a broken 'dkms status' that we caused
+            # in a previous version. One day it might no longer be needed,
+            # but it does not hurt to keep it.
+            if dkms status -m %{module} -v "$otherver" 2>&1 | grep "${opath}/source/dkms.conf does not exist"
+            then
+                echo "ERROR: dkms status is broken!" >&2
+                if [ -L "${opath}/source" -a ! -d "${opath}/source" ]
+                then
+                    echo "Trying to fix it by removing the symlink: ${opath}/source" >&2
+                    echo "You should manually remove ${opath}" >&2
+                    rm -f "${opath}/source" || echo "Removal failed!" >&2
+                fi
+            fi
+            if [ `dkms status -m %{module} -v "$otherver" | grep -c %{module}` -gt 0 ]; then
+                echo "Removing old %{module} dkms modules version $otherver from all kernels."
+                dkms remove -m %{module} -v "$otherver" --all ||:
+            fi
+        fi
+    done
+fi
+
+# Uninstall this version of the zfs dkms modules before installing the package.
+if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then
+    echo "Removing %{module} dkms modules version %{version} from all kernels."
+    dkms remove -m %{module} -v %{version} --all ||:
+fi
+
+%post
+echo "Running post installation script: $0. Parameters: $*"
+# Add the module to dkms, as recommended in the dkms man page.
+# This is generally rpm specific.
+# But it may also help if we have a broken 'dkms status':
+# if the sources are available and only the symlink pointing
+# to them is missing, this will resolve the situation.
+echo "Adding %{module} dkms modules version %{version} to dkms."
+dkms add -m %{module} -v %{version} %{!?not_rpm:--rpm_safe_upgrade} ||:
+
+# After installing the package, have dkms install this zfs version for the
+# current kernel. Force the overwriting of old modules to avoid diff warnings
+# in dkms status, to overwrite newer versions in case of a downgrade, and to
+# replace any other backed-up versions that were restored earlier.
+echo "Installing %{module} dkms modules version %{version} for the current kernel."
+dkms install --force -m %{module} -v %{version} ||:
+
 %preun
-dkms remove -m %{module} -v %{version} --all
+dkms_root="/var/lib/dkms/%{module}/%{version}"
+echo "Running pre uninstall script: $0. Parameters: $*"
+# In case of upgrade we do nothing. See the comment in the %pre hook above.
+if [ "$1" = "1" -o "$1" = "upgrade" ] ; then
+    echo "This is an upgrade. Skipping pre uninstall action."
+    exit 0
+fi
+
+# Check if we are uninstalling the package. In that case remove the dkms modules.
+# '0' is the value of the first parameter for rpm packages.
+# 'remove' or 'purge' are the possible names for deb packages.
+if [ "$1" = "0" -o "$1" = "remove" -o "$1" = "purge" ] ; then
+    if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then
+        echo "Removing %{module} dkms modules version %{version} from all kernels."
+        dkms remove -m %{module} -v %{version} --all %{!?not_rpm:--rpm_safe_upgrade} && exit 0
+    fi
+    # If removing the modules failed, it might be because of the broken 'dkms status'.
+    if dkms status -m %{module} -v %{version} 2>&1 | grep "${dkms_root}/source/dkms.conf does not exist"
+    then
+        echo "ERROR: dkms status is broken!" >&2
+        echo "You should manually remove ${dkms_root}" >&2
+        echo "WARNING: installed modules in /lib/modules/`uname -r`/extra could not be removed automatically!" >&2
+    fi
+else
+    echo "Script parameter $1 did not match any removal condition."
+fi
 
-%posttrans
-/usr/lib/dkms/common.postinst %{module} %{version}
+exit 0

diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h
index 5344d981882b..dd1a926d286c 100644
--- a/sys/modules/zfs/zfs_config.h
+++ b/sys/modules/zfs/zfs_config.h
@@ -1110,7 +1110,7 @@
 /* #undef ZFS_IS_GPL_COMPATIBLE */
 
 /* Define the project alias string. */
-#define ZFS_META_ALIAS "zfs-2.2.99-184-FreeBSD_g41e55b476"
+#define ZFS_META_ALIAS "zfs-2.2.99-197-FreeBSD_g9198de8f1"
 
 /* Define the project author. */
 #define ZFS_META_AUTHOR "OpenZFS"
@@ -1140,7 +1140,7 @@
 #define ZFS_META_NAME "zfs"
 
 /* Define the project release. */
-#define ZFS_META_RELEASE "184-FreeBSD_g41e55b476"
+#define ZFS_META_RELEASE "197-FreeBSD_g9198de8f1"
 
 /* Define the project version. */
 #define ZFS_META_VERSION "2.2.99"

diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h
index 9501947797c0..86525c162b79 100644
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@@ -1 +1 @@
-#define ZFS_META_GITREV "zfs-2.2.99-184-g41e55b476"
+#define ZFS_META_GITREV "zfs-2.2.99-197-g9198de8f1"