aboutsummaryrefslogtreecommitdiff
path: root/cddl
diff options
context:
space:
mode:
author Andriy Gapon <avg@FreeBSD.org> 2017-06-20 17:39:24 +0000
committer Andriy Gapon <avg@FreeBSD.org> 2017-06-20 17:39:24 +0000
commit f9cdbaba8db0ba21e5281a8dacc28421911a89e2 (patch)
tree ad0c633ad582bd93e00339eca100e735cb18302d /cddl
parent 42ce346fcc4fc0dd4857b53ec81396dcb1786b24 (diff)
parent d7f3871103b500aceeef0253f00dec34b582393d (diff)
download src-f9cdbaba8db0ba21e5281a8dacc28421911a89e2.tar.gz
src-f9cdbaba8db0ba21e5281a8dacc28421911a89e2.zip
MFV r318946: 8021 ARC buf data scatter-ization
illumos/illumos-gate@770499e185d15678ccb0be57ebc626ad18d93383 https://github.com/illumos/illumos-gate/commit/770499e185d15678ccb0be57ebc626ad18d93383 https://www.illumos.org/issues/8021 The ARC buf data project (known simply as "ABD" since its genesis in the ZoL community) changes the way the ARC allocates `b_pdata` memory from using linear `void *` buffers to using scatter/gather lists of fixed-size 1KB chunks. This improves ZFS's performance by helping to defragment the address space occupied by the ARC, in particular for cases where compressed ARC is enabled. It could also ease future work to allocate pages directly from `segkpm` for minimal-overhead memory allocations, bypassing the `kmem` subsystem. This is essentially the same change as the one which recently landed in ZFS on Linux, although they made some platform-specific changes while adapting this work to their codebase: 1. Implemented the equivalent of the `segkpm` suggestion for future work mentioned above to bypass issues that they've had with the Linux kernel memory allocator. 2. Changed the internal representation of the ABD's scatter/gather list so it could be used to pass I/O directly into Linux block device drivers. (This feature is not available in the illumos block device interface yet.) 
FreeBSD notes: - the actual (default) chunk size is 4KB (despite the text above saying 1KB) - we can try to reimplement ABDs, so that they are not permanently mapped into the KVA unless explicitly requested, especially on platforms with scarce KVA - we can try to use unmapped I/O and avoid intermediate allocation of a linear, virtual memory mapped buffer - we can try to avoid extra data copying by referring to chunks / pages in the original ABD Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Paul Dagnelie <pcd@delphix.com> Reviewed by: John Kennedy <john.kennedy@delphix.com> Reviewed by: Prakash Surya <prakash.surya@delphix.com> Reviewed by: Prashanth Sreenivasa <pks@delphix.com> Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com> Reviewed by: Chris Williamson <chris.williamson@delphix.com> Approved by: Richard Lowe <richlowe@richlowe.net> Author: Dan Kimmel <dan.kimmel@delphix.com> MFC after: 3 weeks
Notes
Notes: svn path=/head/; revision=320156
Diffstat (limited to 'cddl')
-rw-r--r-- cddl/contrib/opensolaris/cmd/zdb/zdb.c 47
-rw-r--r-- cddl/contrib/opensolaris/cmd/zdb/zdb_il.c 48
-rw-r--r-- cddl/contrib/opensolaris/cmd/ztest/ztest.c 18
-rw-r--r-- cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c 15
4 files changed, 79 insertions, 49 deletions
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
index ac9b06debf76..711377a9fa25 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
@@ -59,6 +59,7 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
+#include <sys/abd.h>
#include <zfs_comutil.h>
#undef verify
#include <libzfs.h>
@@ -2410,7 +2411,7 @@ zdb_blkptr_done(zio_t *zio)
zdb_cb_t *zcb = zio->io_private;
zbookmark_phys_t *zb = &zio->io_bookmark;
- zio_data_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@@ -2477,7 +2478,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (!BP_IS_EMBEDDED(bp) &&
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp);
- void *data = zio_data_buf_alloc(size);
+ abd_t *abd = abd_alloc(size, B_FALSE);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
/* If it's an intent log block, failure is expected. */
@@ -2490,7 +2491,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
- zio_nowait(zio_read(NULL, spa, bp, data, size,
+ zio_nowait(zio_read(NULL, spa, bp, abd, size,
zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
}
@@ -3270,6 +3271,13 @@ name:
return (NULL);
}
+/*
+ * Callback adapter with the (buf, len, private) shape that
+ * abd_iterate_func() expects: forwards the chunk to
+ * random_get_pseudo_bytes() and hands its status back unchanged.
+ * The private argument is ignored.
+ */
+/* ARGSUSED */
+static int
+random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
+{
+	int status = random_get_pseudo_bytes(buf, len);
+
+	return (status);
+}
+
/*
* Read a block from a pool and print it out. The syntax of the
* block descriptor is:
@@ -3301,7 +3309,8 @@ zdb_read_block(char *thing, spa_t *spa)
uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
zio_t *zio;
vdev_t *vd;
- void *pbuf, *lbuf, *buf;
+ abd_t *pabd;
+ void *lbuf, *buf;
char *s, *p, *dup, *vdev, *flagstr;
int i, error;
@@ -3373,7 +3382,7 @@ zdb_read_block(char *thing, spa_t *spa)
psize = size;
lsize = size;
- pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+ pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
BP_ZERO(bp);
@@ -3401,15 +3410,15 @@ zdb_read_block(char *thing, spa_t *spa)
/*
* Treat this as a normal block read.
*/
- zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
} else {
/*
* Treat this as a vdev child I/O.
*/
- zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
- ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
+ psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
@@ -3432,21 +3441,21 @@ zdb_read_block(char *thing, spa_t *spa)
void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
- bcopy(pbuf, pbuf2, psize);
+ abd_copy_to_buf(pbuf2, pabd, psize);
- VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
- SPA_MAXBLOCKSIZE - psize) == 0);
+ VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
+ random_get_pseudo_bytes_cb, NULL));
- VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
- SPA_MAXBLOCKSIZE - psize) == 0);
+ VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+ SPA_MAXBLOCKSIZE - psize));
for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
lsize -= SPA_MINBLOCKSIZE) {
for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
- if (zio_decompress_data(c, pbuf, lbuf,
- psize, lsize) == 0 &&
- zio_decompress_data(c, pbuf2, lbuf2,
- psize, lsize) == 0 &&
+ if (zio_decompress_data(c, pabd,
+ lbuf, psize, lsize) == 0 &&
+ zio_decompress_data_buf(c, pbuf2,
+ lbuf2, psize, lsize) == 0 &&
bcmp(lbuf, lbuf2, lsize) == 0)
break;
}
@@ -3465,7 +3474,7 @@ zdb_read_block(char *thing, spa_t *spa)
buf = lbuf;
size = lsize;
} else {
- buf = pbuf;
+ buf = abd_to_buf(pabd);
size = psize;
}
@@ -3483,7 +3492,7 @@ zdb_read_block(char *thing, spa_t *spa)
zdb_dump_block(thing, buf, size, flags);
out:
- umem_free(pbuf, SPA_MAXBLOCKSIZE);
+ abd_free(pabd);
umem_free(lbuf, SPA_MAXBLOCKSIZE);
free(dup);
}
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
index 583e42228660..bc02b1b6709f 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
/*
@@ -41,6 +41,7 @@
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
+#include <sys/abd.h>
extern uint8_t dump_opt[256];
@@ -117,13 +118,27 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
}
/* ARGSUSED */
+static int
+zil_prt_rec_write_cb(void *data, size_t len, void *unused)
+{
+	/*
+	 * Callback handed to abd_iterate_func(): dumps the len bytes at
+	 * data to stdout.  A printable byte is written as the character
+	 * followed by a space, any other byte as hex.  Always returns 0.
+	 *
+	 * The bytes are read as unsigned char on purpose: passing a
+	 * negative plain char to isprint() is undefined behavior, and a
+	 * negative char promoted for %X prints sign-extended (FFFFFF80
+	 * instead of 80).
+	 */
+	const unsigned char *cdata = data;
+
+	for (size_t i = 0; i < len; i++) {
+		if (isprint(cdata[i]))
+			(void) printf("%c ", cdata[i]);
+		else
+			(void) printf("%2X", (unsigned int)cdata[i]);
+	}
+	return (0);
+}
+
+/* ARGSUSED */
static void
zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
{
- char *data, *dlimit;
+ abd_t *data;
blkptr_t *bp = &lr->lr_blkptr;
zbookmark_phys_t zb;
- char buf[SPA_MAXBLOCKSIZE];
int verbose = MAX(dump_opt['d'], dump_opt['i']);
int error;
@@ -144,7 +159,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp));
- bzero(buf, sizeof (buf));
(void) printf("%s<hole>\n", prefix);
return;
}
@@ -157,28 +171,26 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
lr->lr_foid, ZB_ZIL_LEVEL,
lr->lr_offset / BP_GET_LSIZE(bp));
+ data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
error = zio_wait(zio_read(NULL, zilog->zl_spa,
- bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+ bp, data, BP_GET_LSIZE(bp), NULL, NULL,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
if (error)
- return;
- data = buf;
+ goto out;
} else {
- data = (char *)(lr + 1);
+ /* data is stored after the end of the lr_write record */
+ data = abd_alloc(lr->lr_length, B_FALSE);
+ abd_copy_from_buf(data, lr + 1, lr->lr_length);
}
- dlimit = data + MIN(lr->lr_length,
- (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
-
(void) printf("%s", prefix);
- while (data < dlimit) {
- if (isprint(*data))
- (void) printf("%c ", *data);
- else
- (void) printf("%2X", *data);
- data++;
- }
+ (void) abd_iterate_func(data,
+ 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
+ zil_prt_rec_write_cb, NULL);
(void) printf("\n");
+
+out:
+ abd_free(data);
}
/* ARGSUSED */
diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
index 491bd6b21441..a0b6858f9e91 100644
--- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c
+++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
@@ -112,6 +112,7 @@
#include <sys/refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
+#include <sys/abd.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
@@ -190,6 +191,7 @@ extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
+extern boolean_t zfs_abd_scatter_enabled;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
@@ -5042,7 +5044,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
enum zio_checksum checksum = spa_dedup_checksum(spa);
dmu_buf_t *db;
dmu_tx_t *tx;
- void *buf;
+ abd_t *abd;
blkptr_t blk;
int copies = 2 * ZIO_DEDUPDITTO_MIN;
@@ -5122,14 +5124,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
* Damage the block. Dedup-ditto will save us when we read it later.
*/
psize = BP_GET_PSIZE(&blk);
- buf = zio_buf_alloc(psize);
- ztest_pattern_set(buf, psize, ~pattern);
+ abd = abd_alloc_linear(psize, B_TRUE);
+ ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
- buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+ abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
- zio_buf_free(buf, psize);
+ abd_free(abd);
(void) rw_unlock(&ztest_name_lock);
}
@@ -5413,6 +5415,12 @@ ztest_resume_thread(void *arg)
*/
if (ztest_random(10) == 0)
zfs_compressed_arc_enabled = ztest_random(2);
+
+ /*
+ * Periodically change the zfs_abd_scatter_enabled setting.
+ */
+ if (ztest_random(10) == 0)
+ zfs_abd_scatter_enabled = ztest_random(2);
}
return (NULL);
}
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
index 5cf090263758..3e24741b9832 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
@@ -199,19 +199,19 @@ dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
{
ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
- fletcher_4_incremental_native(drr,
+ (void) fletcher_4_incremental_native(drr,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
if (drr->drr_type != DRR_BEGIN) {
ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
drr_checksum.drr_checksum));
drr->drr_u.drr_checksum.drr_checksum = *zc;
}
- fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
- sizeof (zio_cksum_t), zc);
+ (void) fletcher_4_incremental_native(
+ &drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc);
if (write(outfd, drr, sizeof (*drr)) == -1)
return (errno);
if (payload_len != 0) {
- fletcher_4_incremental_native(payload, payload_len, zc);
+ (void) fletcher_4_incremental_native(payload, payload_len, zc);
if (write(outfd, payload, payload_len) == -1)
return (errno);
}
@@ -2096,9 +2096,9 @@ recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen,
if (zc) {
if (byteswap)
- fletcher_4_incremental_byteswap(buf, ilen, zc);
+ (void) fletcher_4_incremental_byteswap(buf, ilen, zc);
else
- fletcher_4_incremental_native(buf, ilen, zc);
+ (void) fletcher_4_incremental_native(buf, ilen, zc);
}
return (0);
}
@@ -3688,7 +3688,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
* recv_read() above; do it again correctly.
*/
bzero(&zcksum, sizeof (zio_cksum_t));
- fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum);
+ (void) fletcher_4_incremental_byteswap(&drr,
+ sizeof (drr), &zcksum);
flags->byteswap = B_TRUE;
drr.drr_type = BSWAP_32(drr.drr_type);