aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/openzfs/module
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/openzfs/module')
-rw-r--r--sys/contrib/openzfs/module/Kbuild.in38
-rw-r--r--sys/contrib/openzfs/module/Makefile.bsd34
-rw-r--r--sys/contrib/openzfs/module/avl/avl.c24
-rw-r--r--sys/contrib/openzfs/module/icp/algs/blake3/blake3.c732
-rw-r--r--sys/contrib/openzfs/module/icp/algs/blake3/blake3_generic.c202
-rw-r--r--sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c284
-rw-r--r--sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.h213
-rw-r--r--sys/contrib/openzfs/module/icp/algs/blake3/blake3_x86-64.c248
-rw-r--r--sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S2450
-rw-r--r--sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S2463
-rw-r--r--sys/contrib/openzfs/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S2823
-rw-r--r--sys/contrib/openzfs/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S3064
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx2.S1845
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx512.S2618
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S2323
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S2058
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c41
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-generic.c6
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-zone.c424
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/policy.c2
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c47
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c154
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c20
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c17
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c1
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c5
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c632
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfeature_common.c31
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_prop.c8
-rw-r--r--sys/contrib/openzfs/module/zfs/blake3_zfs.c117
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_prop.c10
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_scan.c26
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_misc.c3
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev.c6
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_synctask.c2
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_chksum.c323
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_ioctl.c14
-rw-r--r--sys/contrib/openzfs/module/zfs/zio_checksum.c6
38 files changed, 23113 insertions, 201 deletions
diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in
index 11099999fb87..4803952cbfed 100644
--- a/sys/contrib/openzfs/module/Kbuild.in
+++ b/sys/contrib/openzfs/module/Kbuild.in
@@ -65,7 +65,8 @@ SPL_OBJS := \
spl-tsd.o \
spl-vmem.o \
spl-xdr.o \
- spl-zlib.o
+ spl-zlib.o \
+ spl-zone.o
spl-objs += $(addprefix os/linux/spl/,$(SPL_OBJS))
@@ -75,6 +76,10 @@ ICP_OBJS := \
algs/aes/aes_impl.o \
algs/aes/aes_impl_generic.o \
algs/aes/aes_modes.o \
+ algs/blake3/blake3.o \
+ algs/blake3/blake3_generic.o \
+ algs/blake3/blake3_impl.o \
+ algs/blake3/blake3_x86-64.o \
algs/edonr/edonr.o \
algs/modes/cbc.o \
algs/modes/ccm.o \
@@ -105,23 +110,45 @@ ICP_OBJS_X86_64 := \
asm-x86_64/aes/aes_aesni.o \
asm-x86_64/aes/aes_amd64.o \
asm-x86_64/aes/aeskey.o \
+ asm-x86_64/blake3/blake3_avx2.o \
+ asm-x86_64/blake3/blake3_avx512.o \
+ asm-x86_64/blake3/blake3_sse2.o \
+ asm-x86_64/blake3/blake3_sse41.o \
asm-x86_64/modes/aesni-gcm-x86_64.o \
asm-x86_64/modes/gcm_pclmulqdq.o \
asm-x86_64/modes/ghash-x86_64.o \
asm-x86_64/sha2/sha256_impl.o \
asm-x86_64/sha2/sha512_impl.o
+
ICP_OBJS_X86 := \
algs/aes/aes_impl_aesni.o \
algs/aes/aes_impl_x86-64.o \
algs/modes/gcm_pclmulqdq.o
+
+ICP_OBJS_ARM64 := \
+ asm-aarch64/blake3/b3_aarch64_sse2.o \
+ asm-aarch64/blake3/b3_aarch64_sse41.o
+
+
+ICP_OBJS_PPC_PPC64 := \
+ asm-ppc64/blake3/b3_ppc64le_sse2.o \
+ asm-ppc64/blake3/b3_ppc64le_sse41.o
+
zfs-objs += $(addprefix icp/,$(ICP_OBJS))
zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86))
+zfs-$(CONFIG_UML_X86)+= $(addprefix icp/,$(ICP_OBJS_X86))
zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64))
+zfs-$(CONFIG_ARM64) += $(addprefix icp/,$(ICP_OBJS_ARM64))
+zfs-$(CONFIG_PPC) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64))
+zfs-$(CONFIG_PPC64) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64))
+
+$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \
+ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : asflags-y += -I$(icp_include)
-$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : asflags-y += -I$(icp_include)
-$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : ccflags-y += -I$(icp_include)
+$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \
+ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include)
# Suppress objtool "can't find jump dest instruction at" warnings. They
# are caused by the constants which are defined in the text section of the
@@ -129,6 +156,7 @@ $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : ccflag
# utility tries to interpret them as opcodes and obviously fails doing so.
OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y
OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y
+
# Suppress objtool "unsupported stack pointer realignment" warnings. We are
# not using a DRAP register while aligning the stack to a 64 byte boundary.
# See #6950 for the reasoning.
@@ -205,6 +233,7 @@ ZCOMMON_OBJS_ARM64 := \
zfs-objs += $(addprefix zcommon/,$(ZCOMMON_OBJS))
zfs-$(CONFIG_X86) += $(addprefix zcommon/,$(ZCOMMON_OBJS_X86))
+zfs-$(CONFIG_UML_X86)+= $(addprefix zcommon/,$(ZCOMMON_OBJS_X86))
zfs-$(CONFIG_ARM64) += $(addprefix zcommon/,$(ZCOMMON_OBJS_ARM64))
@@ -261,6 +290,7 @@ ZFS_OBJS := \
abd.o \
aggsum.o \
arc.o \
+ blake3_zfs.o \
blkptr.o \
bplist.o \
bpobj.o \
@@ -358,6 +388,7 @@ ZFS_OBJS := \
zcp_synctask.o \
zfeature.o \
zfs_byteswap.o \
+ zfs_chksum.o \
zfs_fm.o \
zfs_fuid.o \
zfs_ioctl.o \
@@ -428,6 +459,7 @@ ZFS_OBJS_PPC_PPC64 := \
zfs-objs += $(addprefix zfs/,$(ZFS_OBJS)) $(addprefix os/linux/zfs/,$(ZFS_OBJS_OS))
zfs-$(CONFIG_X86) += $(addprefix zfs/,$(ZFS_OBJS_X86))
+zfs-$(CONFIG_UML_X86)+= $(addprefix zfs/,$(ZFS_OBJS_X86))
zfs-$(CONFIG_ARM64) += $(addprefix zfs/,$(ZFS_OBJS_ARM64))
zfs-$(CONFIG_PPC) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64))
zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64))
diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd
index 61f02152d334..589ca60b29be 100644
--- a/sys/contrib/openzfs/module/Makefile.bsd
+++ b/sys/contrib/openzfs/module/Makefile.bsd
@@ -10,6 +10,10 @@ INCDIR=${.CURDIR:H}/include
KMOD= openzfs
.PATH: ${SRCDIR}/avl \
+ ${SRCDIR}/icp/algs/blake3 \
+ ${SRCDIR}/icp/asm-aarch64/blake3 \
+ ${SRCDIR}/icp/asm-ppc64/blake3 \
+ ${SRCDIR}/icp/asm-x86_64/blake3 \
${SRCDIR}/lua \
${SRCDIR}/nvpair \
${SRCDIR}/icp/algs/edonr \
@@ -31,6 +35,7 @@ CFLAGS+= -I${INCDIR}/os/freebsd
CFLAGS+= -I${INCDIR}/os/freebsd/spl
CFLAGS+= -I${INCDIR}/os/freebsd/zfs
CFLAGS+= -I${SRCDIR}/zstd/include
+CFLAGS+= -I${SRCDIR}/icp/include
CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h
CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \
@@ -38,7 +43,8 @@ CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \
-D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DCOMPAT_FREEBSD11
.if ${MACHINE_ARCH} == "amd64"
-CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSSE3
+CFLAGS+= -D__x86_64 -DHAVE_SSE2 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 \
+ -DHAVE_AVX -DHAVE_AVX2 -DHAVE_AVX512F -DHAVE_AVX512VL
.endif
.if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true"
@@ -73,12 +79,32 @@ CFLAGS+= -DBITS_PER_LONG=64
SRCS= vnode_if.h device_if.h bus_if.h
-# avl
+#avl
SRCS+= avl.c
# icp
SRCS+= edonr.c
+#icp/algs/blake3
+SRCS+= blake3.c \
+ blake3_generic.c \
+ blake3_impl.c \
+ blake3_x86-64.c
+
+#icp/asm-aarch64/blake3
+SRCS+= b3_aarch64_sse2.S \
+ b3_aarch64_sse41.S
+
+#icp/asm-ppc64/blake3
+SRCS+= b3_ppc64le_sse2.S \
+ b3_ppc64le_sse41.S
+
+#icp/asm-x86_64/blake3
+SRCS+= blake3_avx2.S \
+ blake3_avx512.S \
+ blake3_sse2.S \
+ blake3_sse41.S
+
#lua
SRCS+= lapi.c \
lauxlib.c \
@@ -189,6 +215,7 @@ SRCS+= zfeature_common.c \
SRCS+= abd.c \
aggsum.c \
arc.c \
+ blake3_zfs.c \
blkptr.c \
bplist.c \
bpobj.c \
@@ -291,6 +318,7 @@ SRCS+= abd.c \
zcp_synctask.c \
zfeature.c \
zfs_byteswap.c \
+ zfs_chksum.c \
zfs_file_os.c \
zfs_fm.c \
zfs_fuid.c \
@@ -337,8 +365,6 @@ SRCS+= zfs_zstd.c \
zstd_decompress.c \
zstd_decompress_block.c
-
-
beforeinstall:
.if ${MK_DEBUG_FILES} != "no"
mtree -eu \
diff --git a/sys/contrib/openzfs/module/avl/avl.c b/sys/contrib/openzfs/module/avl/avl.c
index 69cb8bf6815b..c45be4793fa8 100644
--- a/sys/contrib/openzfs/module/avl/avl.c
+++ b/sys/contrib/openzfs/module/avl/avl.c
@@ -109,21 +109,6 @@
#include <sys/mod.h>
/*
- * Small arrays to translate between balance (or diff) values and child indices.
- *
- * Code that deals with binary tree data structures will randomly use
- * left and right children when examining a tree. C "if()" statements
- * which evaluate randomly suffer from very poor hardware branch prediction.
- * In this code we avoid some of the branch mispredictions by using the
- * following translation arrays. They replace random branches with an
- * additional memory reference. Since the translation arrays are both very
- * small the data should remain efficiently in cache.
- */
-static const int avl_child2balance[] = {-1, 1};
-static const int avl_balance2child[] = {0, 0, 1};
-
-
-/*
* Walk from one node to the previous valued node (ie. an infix walk
* towards the left). At any given node we do one of 2 things:
*
@@ -278,8 +263,7 @@ avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
#endif
return (AVL_NODE2DATA(node, off));
}
- child = avl_balance2child[1 + diff];
-
+ child = (diff > 0);
}
if (where != NULL)
@@ -527,7 +511,7 @@ avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where)
* Compute the new balance
*/
old_balance = AVL_XBALANCE(node);
- new_balance = old_balance + avl_child2balance[which_child];
+ new_balance = old_balance + (which_child ? 1 : -1);
/*
* If we introduced equal balance, then we are done immediately
@@ -693,7 +677,7 @@ avl_remove(avl_tree_t *tree, void *data)
* choose node to swap from whichever side is taller
*/
old_balance = AVL_XBALANCE(delete);
- left = avl_balance2child[old_balance + 1];
+ left = (old_balance > 0);
right = 1 - left;
/*
@@ -777,7 +761,7 @@ avl_remove(avl_tree_t *tree, void *data)
*/
node = parent;
old_balance = AVL_XBALANCE(node);
- new_balance = old_balance - avl_child2balance[which_child];
+ new_balance = old_balance - (which_child ? 1 : -1);
parent = AVL_XPARENT(node);
which_child = AVL_XCHILD(node);
diff --git a/sys/contrib/openzfs/module/icp/algs/blake3/blake3.c b/sys/contrib/openzfs/module/icp/algs/blake3/blake3.c
new file mode 100644
index 000000000000..8c9c06eb9d9f
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/blake3/blake3.c
@@ -0,0 +1,732 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
+ * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/blake3.h>
+
+#include "blake3_impl.h"
+
+/*
+ * We need 1056 byte stack for blake3_compress_subtree_wide()
+ * - we define this pragma to make gcc happy
+ */
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
+
+/* internal used */
+typedef struct {
+ uint32_t input_cv[8];
+ uint64_t counter;
+ uint8_t block[BLAKE3_BLOCK_LEN];
+ uint8_t block_len;
+ uint8_t flags;
+} output_t;
+
+/* internal flags */
+enum blake3_flags {
+ CHUNK_START = 1 << 0,
+ CHUNK_END = 1 << 1,
+ PARENT = 1 << 2,
+ ROOT = 1 << 3,
+ KEYED_HASH = 1 << 4,
+ DERIVE_KEY_CONTEXT = 1 << 5,
+ DERIVE_KEY_MATERIAL = 1 << 6,
+};
+
+/* internal start */
+static void chunk_state_init(blake3_chunk_state_t *ctx,
+ const uint32_t key[8], uint8_t flags)
+{
+ memcpy(ctx->cv, key, BLAKE3_KEY_LEN);
+ ctx->chunk_counter = 0;
+ memset(ctx->buf, 0, BLAKE3_BLOCK_LEN);
+ ctx->buf_len = 0;
+ ctx->blocks_compressed = 0;
+ ctx->flags = flags;
+}
+
+static void chunk_state_reset(blake3_chunk_state_t *ctx,
+ const uint32_t key[8], uint64_t chunk_counter)
+{
+ memcpy(ctx->cv, key, BLAKE3_KEY_LEN);
+ ctx->chunk_counter = chunk_counter;
+ ctx->blocks_compressed = 0;
+ memset(ctx->buf, 0, BLAKE3_BLOCK_LEN);
+ ctx->buf_len = 0;
+}
+
+static size_t chunk_state_len(const blake3_chunk_state_t *ctx)
+{
+ return (BLAKE3_BLOCK_LEN * (size_t)ctx->blocks_compressed) +
+ ((size_t)ctx->buf_len);
+}
+
+static size_t chunk_state_fill_buf(blake3_chunk_state_t *ctx,
+ const uint8_t *input, size_t input_len)
+{
+ size_t take = BLAKE3_BLOCK_LEN - ((size_t)ctx->buf_len);
+ if (take > input_len) {
+ take = input_len;
+ }
+ uint8_t *dest = ctx->buf + ((size_t)ctx->buf_len);
+ memcpy(dest, input, take);
+ ctx->buf_len += (uint8_t)take;
+ return (take);
+}
+
+static uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state_t *ctx)
+{
+ if (ctx->blocks_compressed == 0) {
+ return (CHUNK_START);
+ } else {
+ return (0);
+ }
+}
+
+static output_t make_output(const uint32_t input_cv[8],
+ const uint8_t *block, uint8_t block_len,
+ uint64_t counter, uint8_t flags)
+{
+ output_t ret;
+ memcpy(ret.input_cv, input_cv, 32);
+ memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
+ ret.block_len = block_len;
+ ret.counter = counter;
+ ret.flags = flags;
+ return (ret);
+}
+
+/*
+ * Chaining values within a given chunk (specifically the compress_in_place
+ * interface) are represented as words. This avoids unnecessary bytes<->words
+ * conversion overhead in the portable implementation. However, the hash_many
+ * interface handles both user input and parent node blocks, so it accepts
+ * bytes. For that reason, chaining values in the CV stack are represented as
+ * bytes.
+ */
+static void output_chaining_value(const blake3_impl_ops_t *ops,
+ const output_t *ctx, uint8_t cv[32])
+{
+ uint32_t cv_words[8];
+ memcpy(cv_words, ctx->input_cv, 32);
+ ops->compress_in_place(cv_words, ctx->block, ctx->block_len,
+ ctx->counter, ctx->flags);
+ store_cv_words(cv, cv_words);
+}
+
+static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx,
+ uint64_t seek, uint8_t *out, size_t out_len)
+{
+ uint64_t output_block_counter = seek / 64;
+ size_t offset_within_block = seek % 64;
+ uint8_t wide_buf[64];
+ while (out_len > 0) {
+ ops->compress_xof(ctx->input_cv, ctx->block, ctx->block_len,
+ output_block_counter, ctx->flags | ROOT, wide_buf);
+ size_t available_bytes = 64 - offset_within_block;
+ size_t memcpy_len;
+ if (out_len > available_bytes) {
+ memcpy_len = available_bytes;
+ } else {
+ memcpy_len = out_len;
+ }
+ memcpy(out, wide_buf + offset_within_block, memcpy_len);
+ out += memcpy_len;
+ out_len -= memcpy_len;
+ output_block_counter += 1;
+ offset_within_block = 0;
+ }
+}
+
+static void chunk_state_update(const blake3_impl_ops_t *ops,
+ blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len)
+{
+ if (ctx->buf_len > 0) {
+ size_t take = chunk_state_fill_buf(ctx, input, input_len);
+ input += take;
+ input_len -= take;
+ if (input_len > 0) {
+ ops->compress_in_place(ctx->cv, ctx->buf,
+ BLAKE3_BLOCK_LEN, ctx->chunk_counter,
+ ctx->flags|chunk_state_maybe_start_flag(ctx));
+ ctx->blocks_compressed += 1;
+ ctx->buf_len = 0;
+ memset(ctx->buf, 0, BLAKE3_BLOCK_LEN);
+ }
+ }
+
+ while (input_len > BLAKE3_BLOCK_LEN) {
+ ops->compress_in_place(ctx->cv, input, BLAKE3_BLOCK_LEN,
+ ctx->chunk_counter,
+ ctx->flags|chunk_state_maybe_start_flag(ctx));
+ ctx->blocks_compressed += 1;
+ input += BLAKE3_BLOCK_LEN;
+ input_len -= BLAKE3_BLOCK_LEN;
+ }
+
+ size_t take = chunk_state_fill_buf(ctx, input, input_len);
+ input += take;
+ input_len -= take;
+}
+
+static output_t chunk_state_output(const blake3_chunk_state_t *ctx)
+{
+ uint8_t block_flags =
+ ctx->flags | chunk_state_maybe_start_flag(ctx) | CHUNK_END;
+ return (make_output(ctx->cv, ctx->buf, ctx->buf_len, ctx->chunk_counter,
+ block_flags));
+}
+
+static output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
+ const uint32_t key[8], uint8_t flags)
+{
+ return (make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT));
+}
+
+/*
+ * Given some input larger than one chunk, return the number of bytes that
+ * should go in the left subtree. This is the largest power-of-2 number of
+ * chunks that leaves at least 1 byte for the right subtree.
+ */
+static size_t left_len(size_t content_len)
+{
+ /*
+ * Subtract 1 to reserve at least one byte for the right side.
+ * content_len
+ * should always be greater than BLAKE3_CHUNK_LEN.
+ */
+ size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
+ return (round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN);
+}
+
+/*
+ * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
+ * on a single thread. Write out the chunk chaining values and return the
+ * number of chunks hashed. These chunks are never the root and never empty;
+ * those cases use a different codepath.
+ */
+static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops,
+ const uint8_t *input, size_t input_len, const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+ const uint8_t *chunks_array[MAX_SIMD_DEGREE];
+ size_t input_position = 0;
+ size_t chunks_array_len = 0;
+ while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
+ chunks_array[chunks_array_len] = &input[input_position];
+ input_position += BLAKE3_CHUNK_LEN;
+ chunks_array_len += 1;
+ }
+
+ ops->hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN /
+ BLAKE3_BLOCK_LEN, key, chunk_counter, B_TRUE, flags, CHUNK_START,
+ CHUNK_END, out);
+
+ /*
+ * Hash the remaining partial chunk, if there is one. Note that the
+ * empty chunk (meaning the empty message) is a different codepath.
+ */
+ if (input_len > input_position) {
+ uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
+ blake3_chunk_state_t chunk_state;
+ chunk_state_init(&chunk_state, key, flags);
+ chunk_state.chunk_counter = counter;
+ chunk_state_update(ops, &chunk_state, &input[input_position],
+ input_len - input_position);
+ output_t output = chunk_state_output(&chunk_state);
+ output_chaining_value(ops, &output, &out[chunks_array_len *
+ BLAKE3_OUT_LEN]);
+ return (chunks_array_len + 1);
+ } else {
+ return (chunks_array_len);
+ }
+}
+
+/*
+ * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
+ * on a single thread. Write out the parent chaining values and return the
+ * number of parents hashed. (If there's an odd input chaining value left over,
+ * return it as an additional output.) These parents are never the root and
+ * never empty; those cases use a different codepath.
+ */
+static size_t compress_parents_parallel(const blake3_impl_ops_t *ops,
+ const uint8_t *child_chaining_values, size_t num_chaining_values,
+ const uint32_t key[8], uint8_t flags, uint8_t *out)
+{
+ const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
+ size_t parents_array_len = 0;
+
+ while (num_chaining_values - (2 * parents_array_len) >= 2) {
+ parents_array[parents_array_len] = &child_chaining_values[2 *
+ parents_array_len * BLAKE3_OUT_LEN];
+ parents_array_len += 1;
+ }
+
+ ops->hash_many(parents_array, parents_array_len, 1, key, 0, B_FALSE,
+ flags | PARENT, 0, 0, out);
+
+ /* If there's an odd child left over, it becomes an output. */
+ if (num_chaining_values > 2 * parents_array_len) {
+ memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
+ &child_chaining_values[2 * parents_array_len *
+ BLAKE3_OUT_LEN], BLAKE3_OUT_LEN);
+ return (parents_array_len + 1);
+ } else {
+ return (parents_array_len);
+ }
+}
+
+/*
+ * The wide helper function returns (writes out) an array of chaining values
+ * and returns the length of that array. The number of chaining values returned
+ * is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+ * if the input is shorter than that many chunks. The reason for maintaining a
+ * wide array of chaining values going back up the tree, is to allow the
+ * implementation to hash as many parents in parallel as possible.
+ *
+ * As a special case when the SIMD degree is 1, this function will still return
+ * at least 2 outputs. This guarantees that this function doesn't perform the
+ * root compression. (If it did, it would use the wrong flags, and also we
+ * wouldn't be able to implement extendable output.) Note that this function is
+ * not used when the whole input is only 1 chunk long; that's a different
+ * codepath.
+ *
+ * Why not just have the caller split the input on the first update(), instead
+ * of implementing this special rule? Because we don't want to limit SIMD or
+ * multi-threading parallelism for that update().
+ */
+static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops,
+ const uint8_t *input, size_t input_len, const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+ /*
+ * Note that the single chunk case does *not* bump the SIMD degree up
+ * to 2 when it is 1. If this implementation adds multi-threading in
+ * the future, this gives us the option of multi-threading even the
+ * 2-chunk case, which can help performance on smaller platforms.
+ */
+ if (input_len <= (size_t)(ops->degree * BLAKE3_CHUNK_LEN)) {
+ return (compress_chunks_parallel(ops, input, input_len, key,
+ chunk_counter, flags, out));
+ }
+
+
+ /*
+ * With more than simd_degree chunks, we need to recurse. Start by
+ * dividing the input into left and right subtrees. (Note that this is
+ * only optimal as long as the SIMD degree is a power of 2. If we ever
+ * get a SIMD degree of 3 or something, we'll need a more complicated
+ * strategy.)
+ */
+ size_t left_input_len = left_len(input_len);
+ size_t right_input_len = input_len - left_input_len;
+ const uint8_t *right_input = &input[left_input_len];
+ uint64_t right_chunk_counter = chunk_counter +
+ (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
+
+ /*
+ * Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2
+ * to account for the special case of returning 2 outputs when the
+ * SIMD degree is 1.
+ */
+ uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+ size_t degree = ops->degree;
+ if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
+
+ /*
+ * The special case: We always use a degree of at least two,
+ * to make sure there are two outputs. Except, as noted above,
+ * at the chunk level, where we allow degree=1. (Note that the
+ * 1-chunk-input case is a different codepath.)
+ */
+ degree = 2;
+ }
+ uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
+
+ /*
+ * Recurse! If this implementation adds multi-threading support in the
+ * future, this is where it will go.
+ */
+ size_t left_n = blake3_compress_subtree_wide(ops, input, left_input_len,
+ key, chunk_counter, flags, cv_array);
+ size_t right_n = blake3_compress_subtree_wide(ops, right_input,
+ right_input_len, key, right_chunk_counter, flags, right_cvs);
+
+ /*
+ * The special case again. If simd_degree=1, then we'll have left_n=1
+ * and right_n=1. Rather than compressing them into a single output,
+ * return them directly, to make sure we always have at least two
+ * outputs.
+ */
+ if (left_n == 1) {
+ memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+ return (2);
+ }
+
+ /* Otherwise, do one layer of parent node compression. */
+ size_t num_chaining_values = left_n + right_n;
+ return compress_parents_parallel(ops, cv_array,
+ num_chaining_values, key, flags, out);
+}
+
+/*
+ * Hash a subtree with compress_subtree_wide(), and then condense the resulting
+ * list of chaining values down to a single parent node. Don't compress that
+ * last parent node, however. Instead, return its message bytes (the
+ * concatenated chaining values of its children). This is necessary when the
+ * first call to update() supplies a complete subtree, because the topmost
+ * parent node of that subtree could end up being the root. It's also necessary
+ * for extended output in the general case.
+ *
+ * As with compress_subtree_wide(), this function is not used on inputs of 1
+ * chunk or less. That's a different codepath.
+ */
+static void compress_subtree_to_parent_node(const blake3_impl_ops_t *ops,
+ const uint8_t *input, size_t input_len, const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN])
+{
+ uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+ size_t num_cvs = blake3_compress_subtree_wide(ops, input, input_len,
+ key, chunk_counter, flags, cv_array);
+
+ /*
+ * If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
+ * compress_subtree_wide() returns more than 2 chaining values. Condense
+ * them into 2 by forming parent nodes repeatedly.
+ */
+ uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
+ while (num_cvs > 2) {
+ num_cvs = compress_parents_parallel(ops, cv_array, num_cvs, key,
+ flags, out_array);
+ memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
+ }
+ memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+}
+
+static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8],
+ uint8_t flags)
+{
+ memcpy(ctx->key, key, BLAKE3_KEY_LEN);
+ chunk_state_init(&ctx->chunk, key, flags);
+ ctx->cv_stack_len = 0;
+ ctx->ops = blake3_impl_get_ops();
+}
+
+/*
+ * As described in hasher_push_cv() below, we do "lazy merging", delaying
+ * merges until right before the next CV is about to be added. This is
+ * different from the reference implementation. Another difference is that we
+ * aren't always merging 1 chunk at a time. Instead, each CV might represent
+ * any power-of-two number of chunks, as long as the smaller-above-larger
+ * stack order is maintained. Instead of the "count the trailing 0-bits"
+ * algorithm described in the spec, we use a "count the total number of
+ * 1-bits" variant that doesn't require us to retain the subtree size of the
+ * CV on top of the stack. The principle is the same: each CV that should
+ * remain in the stack is represented by a 1-bit in the total number of chunks
+ * (or bytes) so far.
+ */
+static void hasher_merge_cv_stack(BLAKE3_CTX *ctx, uint64_t total_len)
+{
+ size_t post_merge_stack_len = (size_t)popcnt(total_len);
+ while (ctx->cv_stack_len > post_merge_stack_len) {
+ uint8_t *parent_node =
+ &ctx->cv_stack[(ctx->cv_stack_len - 2) * BLAKE3_OUT_LEN];
+ output_t output =
+ parent_output(parent_node, ctx->key, ctx->chunk.flags);
+ output_chaining_value(ctx->ops, &output, parent_node);
+ ctx->cv_stack_len -= 1;
+ }
+}
+
+/*
+ * In reference_impl.rs, we merge the new CV with existing CVs from the stack
+ * before pushing it. We can do that because we know more input is coming, so
+ * we know none of the merges are root.
+ *
+ * This setting is different. We want to feed as much input as possible to
+ * compress_subtree_wide(), without setting aside anything for the chunk_state.
+ * If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
+ * as a single subtree, if at all possible.
+ *
+ * This leads to two problems:
+ * 1) This 64 KiB input might be the only call that ever gets made to update.
+ * In this case, the root node of the 64 KiB subtree would be the root node
+ * of the whole tree, and it would need to be ROOT finalized. We can't
+ * compress it until we know.
+ * 2) This 64 KiB input might complete a larger tree, whose root node is
+ *    similarly going to be the root of the whole tree. For example, maybe
+ * we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the
+ * node at the root of the 256 KiB subtree until we know how to finalize it.
+ *
+ * The second problem is solved with "lazy merging". That is, when we're about
+ * to add a CV to the stack, we don't merge it with anything first, as the
+ * reference impl does. Instead we do merges using the *previous* CV that was
+ * added, which is sitting on top of the stack, and we put the new CV
+ * (unmerged) on top of the stack afterwards. This guarantees that we never
+ * merge the root node until finalize().
+ *
+ * Solving the first problem requires an additional tool,
+ * compress_subtree_to_parent_node(). That function always returns the top
+ * *two* chaining values of the subtree it's compressing. We then do lazy
+ * merging with each of them separately, so that the second CV will always
+ * remain unmerged. (That also helps us support extendable output when we're
+ * hashing an input all-at-once.)
+ */
+static void hasher_push_cv(BLAKE3_CTX *ctx, uint8_t new_cv[BLAKE3_OUT_LEN],
+ uint64_t chunk_counter)
+{
+ hasher_merge_cv_stack(ctx, chunk_counter);
+ memcpy(&ctx->cv_stack[ctx->cv_stack_len * BLAKE3_OUT_LEN], new_cv,
+ BLAKE3_OUT_LEN);
+ ctx->cv_stack_len += 1;
+}
+
+void
+Blake3_Init(BLAKE3_CTX *ctx)
+{
+ hasher_init_base(ctx, BLAKE3_IV, 0);
+}
+
+void
+Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN])
+{
+ uint32_t key_words[8];
+ load_key_words(key, key_words);
+ hasher_init_base(ctx, key_words, KEYED_HASH);
+}
+
/*
 * Absorb input_len bytes into the hash tree.
 *
 * Input is consumed in 1 KiB (BLAKE3_CHUNK_LEN) chunks.  Completed chunk
 * chaining values (CVs) are pushed onto ctx->cv_stack via hasher_push_cv(),
 * which lazily merges completed subtrees; whole power-of-2-sized subtrees
 * are compressed through compress_subtree_to_parent_node() to exploit SIMD
 * parallelism.  Safe to call repeatedly with arbitrary-sized pieces.
 */
static void
Blake3_Update2(BLAKE3_CTX *ctx, const void *input, size_t input_len)
{
	/*
	 * Explicitly checking for zero avoids causing UB by passing a null
	 * pointer to memcpy. This comes up in practice with things like:
	 * std::vector<uint8_t> v;
	 * blake3_hasher_update(&hasher, v.data(), v.size());
	 */
	if (input_len == 0) {
		return;
	}

	const uint8_t *input_bytes = (const uint8_t *)input;

	/*
	 * If we have some partial chunk bytes in the internal chunk_state, we
	 * need to finish that chunk first.
	 */
	if (chunk_state_len(&ctx->chunk) > 0) {
		size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&ctx->chunk);
		if (take > input_len) {
			take = input_len;
		}
		chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, take);
		input_bytes += take;
		input_len -= take;
		/*
		 * If we've filled the current chunk and there's more coming,
		 * finalize this chunk and proceed. In this case we know it's
		 * not the root.
		 */
		if (input_len > 0) {
			output_t output = chunk_state_output(&ctx->chunk);
			uint8_t chunk_cv[32];
			output_chaining_value(ctx->ops, &output, chunk_cv);
			hasher_push_cv(ctx, chunk_cv, ctx->chunk.chunk_counter);
			chunk_state_reset(&ctx->chunk, ctx->key,
			    ctx->chunk.chunk_counter + 1);
		} else {
			return;
		}
	}

	/*
	 * Now the chunk_state is clear, and we have more input. If there's
	 * more than a single chunk (so, definitely not the root chunk), hash
	 * the largest whole subtree we can, with the full benefits of SIMD
	 * (and maybe in the future, multi-threading) parallelism. Two
	 * restrictions:
	 * - The subtree has to be a power-of-2 number of chunks. Only
	 * subtrees along the right edge can be incomplete, and we don't know
	 * where the right edge is going to be until we get to finalize().
	 * - The subtree must evenly divide the total number of chunks up
	 * until this point (if total is not 0). If the current incomplete
	 * subtree is only waiting for 1 more chunk, we can't hash a subtree
	 * of 4 chunks. We have to complete the current subtree first.
	 * Because we might need to break up the input to form powers of 2, or
	 * to evenly divide what we already have, this part runs in a loop.
	 */
	while (input_len > BLAKE3_CHUNK_LEN) {
		size_t subtree_len = round_down_to_power_of_2(input_len);
		uint64_t count_so_far =
		    ctx->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
		/*
		 * Shrink the subtree_len until it evenly divides the count so
		 * far. We know that subtree_len itself is a power of 2, so we
		 * can use a bitmasking trick instead of an actual remainder
		 * operation. (Note that if the caller consistently passes
		 * power-of-2 inputs of the same size, as is hopefully
		 * typical, this loop condition will always fail, and
		 * subtree_len will always be the full length of the input.)
		 *
		 * An aside: We don't have to shrink subtree_len quite this
		 * much. For example, if count_so_far is 1, we could pass 2
		 * chunks to compress_subtree_to_parent_node. Since we'll get
		 * 2 CVs back, we'll still get the right answer in the end,
		 * and we might get to use 2-way SIMD parallelism. The problem
		 * with this optimization, is that it gets us stuck always
		 * hashing 2 chunks. The total number of chunks will remain
		 * odd, and we'll never graduate to higher degrees of
		 * parallelism. See
		 * https://github.com/BLAKE3-team/BLAKE3/issues/69.
		 */
		while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
			subtree_len /= 2;
		}
		/*
		 * The shrunken subtree_len might now be 1 chunk long. If so,
		 * hash that one chunk by itself. Otherwise, compress the
		 * subtree into a pair of CVs.
		 */
		uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
		if (subtree_len <= BLAKE3_CHUNK_LEN) {
			blake3_chunk_state_t chunk_state;
			chunk_state_init(&chunk_state, ctx->key,
			    ctx->chunk.flags);
			chunk_state.chunk_counter = ctx->chunk.chunk_counter;
			chunk_state_update(ctx->ops, &chunk_state, input_bytes,
			    subtree_len);
			output_t output = chunk_state_output(&chunk_state);
			uint8_t cv[BLAKE3_OUT_LEN];
			output_chaining_value(ctx->ops, &output, cv);
			hasher_push_cv(ctx, cv, chunk_state.chunk_counter);
		} else {
			/*
			 * This is the high-performance happy path, though
			 * getting here depends on the caller giving us a long
			 * enough input.
			 */
			uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
			compress_subtree_to_parent_node(ctx->ops, input_bytes,
			    subtree_len, ctx->key, ctx-> chunk.chunk_counter,
			    ctx->chunk.flags, cv_pair);
			hasher_push_cv(ctx, cv_pair, ctx->chunk.chunk_counter);
			hasher_push_cv(ctx, &cv_pair[BLAKE3_OUT_LEN],
			    ctx->chunk.chunk_counter + (subtree_chunks / 2));
		}
		ctx->chunk.chunk_counter += subtree_chunks;
		input_bytes += subtree_len;
		input_len -= subtree_len;
	}

	/*
	 * If there's any remaining input less than a full chunk, add it to
	 * the chunk state. In that case, also do a final merge loop to make
	 * sure the subtree stack doesn't contain any unmerged pairs. The
	 * remaining input means we know these merges are non-root. This merge
	 * loop isn't strictly necessary here, because hasher_push_chunk_cv
	 * already does its own merge loop, but it simplifies
	 * blake3_hasher_finalize below.
	 */
	if (input_len > 0) {
		chunk_state_update(ctx->ops, &ctx->chunk, input_bytes,
		    input_len);
		hasher_merge_cv_stack(ctx, ctx->chunk.chunk_counter);
	}
}
+
+void
+Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t todo)
+{
+ size_t done = 0;
+ const uint8_t *data = input;
+ const size_t block_max = 1024 * 64;
+
+ /* max feed buffer to leave the stack size small */
+ while (todo != 0) {
+ size_t block = (todo >= block_max) ? block_max : todo;
+ Blake3_Update2(ctx, data + done, block);
+ done += block;
+ todo -= block;
+ }
+}
+
/*
 * Write the default BLAKE3_OUT_LEN (32-byte) digest to out.  The context
 * is taken const, so the caller may continue updating it afterwards.
 */
void
Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out)
{
	Blake3_FinalSeek(ctx, 0, out, BLAKE3_OUT_LEN);
}
+
/*
 * Extendable-output finalization: write out_len bytes of the BLAKE3 output
 * stream starting at byte offset `seek`.  ctx is read-only, so finalizing
 * does not prevent further updates.
 */
void
Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out,
    size_t out_len)
{
	/*
	 * Explicitly checking for zero avoids causing UB by passing a null
	 * pointer to memcpy. This comes up in practice with things like:
	 * std::vector<uint8_t> v;
	 * blake3_hasher_finalize(&hasher, v.data(), v.size());
	 */
	if (out_len == 0) {
		return;
	}
	/* If the subtree stack is empty, then the current chunk is the root. */
	if (ctx->cv_stack_len == 0) {
		output_t output = chunk_state_output(&ctx->chunk);
		output_root_bytes(ctx->ops, &output, seek, out, out_len);
		return;
	}
	/*
	 * If there are any bytes in the chunk state, finalize that chunk and
	 * do a roll-up merge between that chunk hash and every subtree in the
	 * stack. In this case, the extra merge loop at the end of
	 * blake3_hasher_update guarantees that none of the subtrees in the
	 * stack need to be merged with each other first. Otherwise, if there
	 * are no bytes in the chunk state, then the top of the stack is a
	 * chunk hash, and we start the merge from that.
	 */
	output_t output;
	size_t cvs_remaining;
	if (chunk_state_len(&ctx->chunk) > 0) {
		cvs_remaining = ctx->cv_stack_len;
		output = chunk_state_output(&ctx->chunk);
	} else {
		/* There are always at least 2 CVs in the stack in this case. */
		cvs_remaining = ctx->cv_stack_len - 2;
		output = parent_output(&ctx->cv_stack[cvs_remaining * 32],
		    ctx->key, ctx->chunk.flags);
	}
	/* Fold the remaining stack CVs into a chain of parent nodes. */
	while (cvs_remaining > 0) {
		cvs_remaining -= 1;
		uint8_t parent_block[BLAKE3_BLOCK_LEN];
		memcpy(parent_block, &ctx->cv_stack[cvs_remaining * 32], 32);
		output_chaining_value(ctx->ops, &output, &parent_block[32]);
		output = parent_output(parent_block, ctx->key,
		    ctx->chunk.flags);
	}
	output_root_bytes(ctx->ops, &output, seek, out, out_len);
}
diff --git a/sys/contrib/openzfs/module/icp/algs/blake3/blake3_generic.c b/sys/contrib/openzfs/module/icp/algs/blake3/blake3_generic.c
new file mode 100644
index 000000000000..6ff9a845ccdc
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/blake3/blake3_generic.c
@@ -0,0 +1,202 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
+ * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/zfs_context.h>
+#include "blake3_impl.h"
+
#define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

/*
 * The BLAKE3 quarter-round ("G" function): mixes message words x and y into
 * the four state words selected by a, b, c and d.  The indices are always
 * distinct (columns or diagonals of the 4x4 state matrix).
 */
static inline void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
    uint32_t x, uint32_t y)
{
	uint32_t va = state[a];
	uint32_t vb = state[b];
	uint32_t vc = state[c];
	uint32_t vd = state[d];

	va += vb + x;
	vd = rotr32(vd ^ va, 16);
	vc += vd;
	vb = rotr32(vb ^ vc, 12);
	va += vb + y;
	vd = rotr32(vd ^ va, 8);
	vc += vd;
	vb = rotr32(vb ^ vc, 7);

	state[a] = va;
	state[b] = vb;
	state[c] = vc;
	state[d] = vd;
}
+
+static inline void round_fn(uint32_t state[16], const uint32_t *msg,
+ size_t round)
+{
+ /* Select the message schedule based on the round. */
+ const uint8_t *schedule = BLAKE3_MSG_SCHEDULE[round];
+
+ /* Mix the columns. */
+ g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
+ g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
+ g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
+ g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
+
+ /* Mix the rows. */
+ g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
+ g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
+ g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
+ g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
+}
+
+static inline void compress_pre(uint32_t state[16], const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags)
+{
+ uint32_t block_words[16];
+ block_words[0] = load32(block + 4 * 0);
+ block_words[1] = load32(block + 4 * 1);
+ block_words[2] = load32(block + 4 * 2);
+ block_words[3] = load32(block + 4 * 3);
+ block_words[4] = load32(block + 4 * 4);
+ block_words[5] = load32(block + 4 * 5);
+ block_words[6] = load32(block + 4 * 6);
+ block_words[7] = load32(block + 4 * 7);
+ block_words[8] = load32(block + 4 * 8);
+ block_words[9] = load32(block + 4 * 9);
+ block_words[10] = load32(block + 4 * 10);
+ block_words[11] = load32(block + 4 * 11);
+ block_words[12] = load32(block + 4 * 12);
+ block_words[13] = load32(block + 4 * 13);
+ block_words[14] = load32(block + 4 * 14);
+ block_words[15] = load32(block + 4 * 15);
+
+ state[0] = cv[0];
+ state[1] = cv[1];
+ state[2] = cv[2];
+ state[3] = cv[3];
+ state[4] = cv[4];
+ state[5] = cv[5];
+ state[6] = cv[6];
+ state[7] = cv[7];
+ state[8] = BLAKE3_IV[0];
+ state[9] = BLAKE3_IV[1];
+ state[10] = BLAKE3_IV[2];
+ state[11] = BLAKE3_IV[3];
+ state[12] = counter_low(counter);
+ state[13] = counter_high(counter);
+ state[14] = (uint32_t)block_len;
+ state[15] = (uint32_t)flags;
+
+ round_fn(state, &block_words[0], 0);
+ round_fn(state, &block_words[0], 1);
+ round_fn(state, &block_words[0], 2);
+ round_fn(state, &block_words[0], 3);
+ round_fn(state, &block_words[0], 4);
+ round_fn(state, &block_words[0], 5);
+ round_fn(state, &block_words[0], 6);
+}
+
+static inline void blake3_compress_in_place_generic(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags)
+{
+ uint32_t state[16];
+ compress_pre(state, cv, block, block_len, counter, flags);
+ cv[0] = state[0] ^ state[8];
+ cv[1] = state[1] ^ state[9];
+ cv[2] = state[2] ^ state[10];
+ cv[3] = state[3] ^ state[11];
+ cv[4] = state[4] ^ state[12];
+ cv[5] = state[5] ^ state[13];
+ cv[6] = state[6] ^ state[14];
+ cv[7] = state[7] ^ state[15];
+}
+
+static inline void hash_one_generic(const uint8_t *input, size_t blocks,
+ const uint32_t key[8], uint64_t counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
+{
+ uint32_t cv[8];
+ memcpy(cv, key, BLAKE3_KEY_LEN);
+ uint8_t block_flags = flags | flags_start;
+ while (blocks > 0) {
+ if (blocks == 1) {
+ block_flags |= flags_end;
+ }
+ blake3_compress_in_place_generic(cv, input, BLAKE3_BLOCK_LEN,
+ counter, block_flags);
+ input = &input[BLAKE3_BLOCK_LEN];
+ blocks -= 1;
+ block_flags = flags;
+ }
+ store_cv_words(out, cv);
+}
+
+static inline void blake3_compress_xof_generic(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags, uint8_t out[64])
+{
+ uint32_t state[16];
+ compress_pre(state, cv, block, block_len, counter, flags);
+
+ store32(&out[0 * 4], state[0] ^ state[8]);
+ store32(&out[1 * 4], state[1] ^ state[9]);
+ store32(&out[2 * 4], state[2] ^ state[10]);
+ store32(&out[3 * 4], state[3] ^ state[11]);
+ store32(&out[4 * 4], state[4] ^ state[12]);
+ store32(&out[5 * 4], state[5] ^ state[13]);
+ store32(&out[6 * 4], state[6] ^ state[14]);
+ store32(&out[7 * 4], state[7] ^ state[15]);
+ store32(&out[8 * 4], state[8] ^ cv[0]);
+ store32(&out[9 * 4], state[9] ^ cv[1]);
+ store32(&out[10 * 4], state[10] ^ cv[2]);
+ store32(&out[11 * 4], state[11] ^ cv[3]);
+ store32(&out[12 * 4], state[12] ^ cv[4]);
+ store32(&out[13 * 4], state[13] ^ cv[5]);
+ store32(&out[14 * 4], state[14] ^ cv[6]);
+ store32(&out[15 * 4], state[15] ^ cv[7]);
+}
+
+static inline void blake3_hash_many_generic(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter,
+ boolean_t increment_counter, uint8_t flags, uint8_t flags_start,
+ uint8_t flags_end, uint8_t *out)
+{
+ while (num_inputs > 0) {
+ hash_one_generic(inputs[0], blocks, key, counter, flags,
+ flags_start, flags_end, out);
+ if (increment_counter) {
+ counter += 1;
+ }
+ inputs += 1;
+ num_inputs -= 1;
+ out = &out[BLAKE3_OUT_LEN];
+ }
+}
+
/* The portable C implementation runs everywhere. */
static inline boolean_t blake3_is_generic_supported(void)
{
	return (B_TRUE);
}

/*
 * Ops table for the portable implementation; always present and used as
 * the fallback when no SIMD variant is supported.
 */
const blake3_impl_ops_t blake3_generic_impl = {
	.compress_in_place = blake3_compress_in_place_generic,
	.compress_xof = blake3_compress_xof_generic,
	.hash_many = blake3_hash_many_generic,
	.is_supported = blake3_is_generic_supported,
	.degree = 4,
	.name = "generic"
};
diff --git a/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c b/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c
new file mode 100644
index 000000000000..c3809a2827be
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c
@@ -0,0 +1,284 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_checksum.h>
+
+#include "blake3_impl.h"
+
/*
 * Table of all compiled-in BLAKE3 implementations.  The portable generic
 * implementation comes first and is always usable; SIMD variants are
 * compiled in per architecture and checked at runtime via is_supported().
 */
static const blake3_impl_ops_t *const blake3_impls[] = {
	&blake3_generic_impl,
#if defined(__aarch64__) || \
	(defined(__x86_64) && defined(HAVE_SSE2)) || \
	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
	&blake3_sse2_impl,
#endif
#if defined(__aarch64__) || \
	(defined(__x86_64) && defined(HAVE_SSE4_1)) || \
	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
	&blake3_sse41_impl,
#endif
#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
	&blake3_avx2_impl,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
	&blake3_avx512_impl,
#endif
};
+
/* this pointer holds current ops for implementation */
static const blake3_impl_ops_t *blake3_selected_impl = &blake3_generic_impl;

/*
 * Special implementation selections stored in icp_blake3_impl; values
 * below IMPL_PARAM are indices into the supported-implementation list.
 */
#define IMPL_FASTEST (UINT32_MAX)
#define IMPL_CYCLE (UINT32_MAX-1)
#define IMPL_USER (UINT32_MAX-2)
#define IMPL_PARAM (UINT32_MAX-3)

/* volatile read of the selector (may be changed via module parameter) */
#define IMPL_READ(i) (*(volatile uint32_t *) &(i))
static uint32_t icp_blake3_impl = IMPL_FASTEST;

/* max length (incl. NUL) of an implementation name from a module param */
#define BLAKE3_IMPL_NAME_MAX 16

/* id of fastest implementation */
static uint32_t blake3_fastest_id = 0;

/* currently used id */
static uint32_t blake3_current_id = 0;

/* id of module parameter (-1 == unused) */
static int blake3_param_id = -1;
+
+/* return number of supported implementations */
+int
+blake3_get_impl_count(void)
+{
+ static int impls = 0;
+ int i;
+
+ if (impls)
+ return (impls);
+
+ for (i = 0; i < ARRAY_SIZE(blake3_impls); i++) {
+ if (!blake3_impls[i]->is_supported()) continue;
+ impls++;
+ }
+
+ return (impls);
+}
+
/* return id of selected implementation */
int
blake3_get_impl_id(void)
{
	return (blake3_current_id);
}

/* return name of selected implementation */
const char *
blake3_get_impl_name(void)
{
	return (blake3_selected_impl->name);
}

/* setup id as fastest implementation (chosen by the benchmark caller) */
void
blake3_set_impl_fastest(uint32_t id)
{
	blake3_fastest_id = id;
}
+
/*
 * Set the current implementation by id, where id counts only the supported
 * implementations (0..blake3_get_impl_count()-1).  The special ids
 * IMPL_FASTEST and IMPL_CYCLE are resolved first; an out-of-range id
 * leaves the selection unchanged.
 */
void
blake3_set_impl_id(uint32_t id)
{
	int i, cid;

	/* select fastest */
	if (id == IMPL_FASTEST)
		id = blake3_fastest_id;

	/* select next or first (wraps around the supported count) */
	if (id == IMPL_CYCLE)
		id = (++blake3_current_id) % blake3_get_impl_count();

	/* 0..N for the real impl: count only supported entries */
	for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) {
		if (!blake3_impls[i]->is_supported()) continue;
		if (cid == id) {
			blake3_current_id = cid;
			blake3_selected_impl = blake3_impls[i];
			return;
		}
		cid++;
	}
}
+
/*
 * Set the current implementation by name.  "fastest" and "cycle" map to
 * their special selectors; otherwise the supported implementations are
 * searched by name.  When called while the module parameter is being
 * parsed (IMPL_PARAM), only blake3_param_id is recorded and the actual
 * switch is deferred to blake3_setup_impl().  Returns 0 on success or
 * -EINVAL for an unknown name.
 */
int
blake3_set_impl_name(const char *name)
{
	int i, cid;

	if (strcmp(name, "fastest") == 0) {
		atomic_swap_32(&icp_blake3_impl, IMPL_FASTEST);
		blake3_set_impl_id(IMPL_FASTEST);
		return (0);
	} else if (strcmp(name, "cycle") == 0) {
		atomic_swap_32(&icp_blake3_impl, IMPL_CYCLE);
		blake3_set_impl_id(IMPL_CYCLE);
		return (0);
	}

	for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) {
		if (!blake3_impls[i]->is_supported()) continue;
		if (strcmp(name, blake3_impls[i]->name) == 0) {
			if (icp_blake3_impl == IMPL_PARAM) {
				blake3_param_id = cid;
				return (0);
			}
			blake3_selected_impl = blake3_impls[i];
			blake3_current_id = cid;
			return (0);
		}
		cid++;
	}

	return (-EINVAL);
}
+
+/* setup implementation */
+void
+blake3_setup_impl(void)
+{
+ switch (IMPL_READ(icp_blake3_impl)) {
+ case IMPL_PARAM:
+ blake3_set_impl_id(blake3_param_id);
+ atomic_swap_32(&icp_blake3_impl, IMPL_USER);
+ break;
+ case IMPL_FASTEST:
+ blake3_set_impl_id(IMPL_FASTEST);
+ break;
+ case IMPL_CYCLE:
+ blake3_set_impl_id(IMPL_CYCLE);
+ break;
+ default:
+ blake3_set_impl_id(blake3_current_id);
+ break;
+ }
+}
+
/*
 * Return the ops table of the currently selected implementation.  In
 * "cycle" mode every call advances to the next supported implementation
 * first (used by tests to exercise all variants).
 */
const blake3_impl_ops_t *
blake3_impl_get_ops(void)
{
	/* each call to ops will cycle */
	if (icp_blake3_impl == IMPL_CYCLE)
		blake3_set_impl_id(IMPL_CYCLE);

	return (blake3_selected_impl);
}
+
+#if defined(_KERNEL)
+void **blake3_per_cpu_ctx;
+
/*
 * Allocate an array of max_ncpus pointers plus one BLAKE3_CTX per possible
 * CPU.  Presumably each CPU then uses its own context so hashing needs no
 * locking -- confirm against the callers of blake3_per_cpu_ctx.
 */
void
blake3_per_cpu_ctx_init(void)
{
	/*
	 * Create "The Godfather" ptr to hold all blake3 ctx
	 */
	blake3_per_cpu_ctx = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP);
	for (int i = 0; i < max_ncpus; i++) {
		blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX),
		    KM_SLEEP);
	}
}
+
/*
 * Free all per-CPU contexts and the pointer array; each buffer is zeroed
 * before being freed.
 */
void
blake3_per_cpu_ctx_fini(void)
{
	for (int i = 0; i < max_ncpus; i++) {
		memset(blake3_per_cpu_ctx[i], 0, sizeof (BLAKE3_CTX));
		kmem_free(blake3_per_cpu_ctx[i], sizeof (BLAKE3_CTX));
	}
	memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *));
	kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *));
}
+#endif
+
+#if defined(_KERNEL) && defined(__linux__)
+static int
+icp_blake3_impl_set(const char *name, zfs_kernel_param_t *kp)
+{
+ char req_name[BLAKE3_IMPL_NAME_MAX];
+ size_t i;
+
+ /* sanitize input */
+ i = strnlen(name, BLAKE3_IMPL_NAME_MAX);
+ if (i == 0 || i >= BLAKE3_IMPL_NAME_MAX)
+ return (-EINVAL);
+
+ strlcpy(req_name, name, BLAKE3_IMPL_NAME_MAX);
+ while (i > 0 && isspace(req_name[i-1]))
+ i--;
+ req_name[i] = '\0';
+
+ atomic_swap_32(&icp_blake3_impl, IMPL_PARAM);
+ return (blake3_set_impl_name(req_name));
+}
+
+static int
+icp_blake3_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+ int i, cid, cnt = 0;
+ char *fmt;
+
+ /* cycling */
+ fmt = (icp_blake3_impl == IMPL_CYCLE) ? "[cycle] " : "cycle ";
+ cnt += sprintf(buffer + cnt, fmt);
+
+ /* fastest one */
+ fmt = (icp_blake3_impl == IMPL_FASTEST) ? "[fastest] " : "fastest ";
+ cnt += sprintf(buffer + cnt, fmt);
+
+ /* user selected */
+ for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) {
+ if (!blake3_impls[i]->is_supported()) continue;
+ fmt = (icp_blake3_impl == IMPL_USER &&
+ cid == blake3_current_id) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, blake3_impls[i]->name);
+ cid++;
+ }
+
+ buffer[cnt] = 0;
+
+ return (cnt);
+}
+
/* Expose the read/write "icp_blake3_impl" module parameter (mode 0644). */
module_param_call(icp_blake3_impl, icp_blake3_impl_set, icp_blake3_impl_get,
    NULL, 0644);
MODULE_PARM_DESC(icp_blake3_impl, "Select BLAKE3 implementation.");
+#endif
diff --git a/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.h b/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.h
new file mode 100644
index 000000000000..7b40cc4d3f02
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.h
@@ -0,0 +1,213 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
+ * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#ifndef BLAKE3_IMPL_H
+#define BLAKE3_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/blake3.h>
+#include <sys/simd.h>
+
/*
 * Methods used to define BLAKE3 assembler implementations
 */

/* Compress one 64-byte block, updating the chaining value cv in place. */
typedef void (*blake3_compress_in_place_f)(uint32_t cv[8],
	const uint8_t block[BLAKE3_BLOCK_LEN],
	uint8_t block_len, uint64_t counter,
	uint8_t flags);

/* Compress one block and write the 64-byte extended (XOF) output. */
typedef void (*blake3_compress_xof_f)(const uint32_t cv[8],
	const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
	uint64_t counter, uint8_t flags, uint8_t out[64]);

/*
 * Hash num_inputs equal-length inputs (blocks full blocks each), writing
 * one chaining value per input to out.
 */
typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs,
	size_t num_inputs, size_t blocks, const uint32_t key[8],
	uint64_t counter, boolean_t increment_counter, uint8_t flags,
	uint8_t flags_start, uint8_t flags_end, uint8_t *out);

/* Runtime test: can this implementation run on the current CPU? */
typedef boolean_t (*blake3_is_supported_f)(void);

typedef struct blake3_impl_ops {
	blake3_compress_in_place_f compress_in_place;
	blake3_compress_xof_f compress_xof;
	blake3_hash_many_f hash_many;
	blake3_is_supported_f is_supported;
	int degree;		/* number of inputs hashed in parallel */
	const char *name;	/* name shown via the module parameter */
} blake3_impl_ops_t;
+
+/* Return selected BLAKE3 implementation ops */
+extern const blake3_impl_ops_t *blake3_impl_get_ops(void);
+
+extern const blake3_impl_ops_t blake3_generic_impl;
+
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE2)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+extern const blake3_impl_ops_t blake3_sse2_impl;
+#endif
+
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE4_1)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+extern const blake3_impl_ops_t blake3_sse41_impl;
+#endif
+
+#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
+extern const blake3_impl_ops_t blake3_avx2_impl;
+#endif
+
+#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
+extern const blake3_impl_ops_t blake3_avx512_impl;
+#endif
+
+#if defined(__x86_64)
+#define MAX_SIMD_DEGREE 16
+#else
+#define MAX_SIMD_DEGREE 4
+#endif
+
+#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
+
/* BLAKE3 initialization vector (the same constants as the SHA-256 IV) */
static const uint32_t BLAKE3_IV[8] = {
	0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
	0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};
+
/*
 * Per-round message-word permutation: row r lists the order in which the
 * 16 message words are consumed in round r (7 rounds total).
 */
static const uint8_t BLAKE3_MSG_SCHEDULE[7][16] = {
	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
	{2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
	{3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
	{10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
	{12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
	{9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
	{11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
+
/*
 * Find index of the highest set bit.
 * Precondition: x != 0 -- with GCC/Clang, __builtin_clzll(0) is undefined
 * behavior.  Callers guard this by passing (x | 1), see
 * round_down_to_power_of_2() below.
 */
static inline unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
	return (63 ^ __builtin_clzll(x));
#elif defined(_MSC_VER) && defined(IS_X86_64)
	unsigned long index;
	_BitScanReverse64(&index, x);
	return (index);
#elif defined(_MSC_VER) && defined(IS_X86_32)
	if (x >> 32) {
		unsigned long index;
		_BitScanReverse(&index, x >> 32);
		return (32 + index);
	} else {
		unsigned long index;
		_BitScanReverse(&index, x);
		return (index);
	}
#else
	/* Portable fallback: binary search over the bit positions. */
	unsigned int c = 0;
	if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
	if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
	if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; }
	if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; }
	if (x & 0x000000000000000cULL) { x >>= 2; c += 2; }
	if (x & 0x0000000000000002ULL) { c += 1; }
	return (c);
#endif
}
+
/* Count the number of 1 bits (portable, no intrinsics). */
static inline unsigned int popcnt(uint64_t x) {
	unsigned int total = 0;

	/* Examine one bit per iteration, least significant first. */
	for (; x != 0; x >>= 1)
		total += (unsigned int)(x & 1);

	return (total);
}
+
/*
 * Largest power of two less than or equal to x.
 * As a special case, returns 1 when x is 0.
 * (The `| 1` keeps highest_one()'s argument nonzero, which it requires.)
 */
static inline uint64_t round_down_to_power_of_2(uint64_t x) {
	return (1ULL << highest_one(x | 1));
}
+
/* Low 32 bits of the 64-bit chunk counter. */
static inline uint32_t counter_low(uint64_t counter) {
	return ((uint32_t)counter);
}

/* High 32 bits of the 64-bit chunk counter. */
static inline uint32_t counter_high(uint64_t counter) {
	return ((uint32_t)(counter >> 32));
}
+
/* Load a 32-bit little-endian word from unaligned memory. */
static inline uint32_t load32(const void *src) {
	const uint8_t *bytes = (const uint8_t *)src;
	uint32_t value = 0;
	int i;

	/* Accumulate most-significant byte first, so bytes[0] ends lowest. */
	for (i = 3; i >= 0; i--)
		value = (value << 8) | (uint32_t)bytes[i];
	return (value);
}
+
+static inline void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
+ uint32_t key_words[8]) {
+ key_words[0] = load32(&key[0 * 4]);
+ key_words[1] = load32(&key[1 * 4]);
+ key_words[2] = load32(&key[2 * 4]);
+ key_words[3] = load32(&key[3 * 4]);
+ key_words[4] = load32(&key[4 * 4]);
+ key_words[5] = load32(&key[5 * 4]);
+ key_words[6] = load32(&key[6 * 4]);
+ key_words[7] = load32(&key[7 * 4]);
+}
+
/* Store a 32-bit word to unaligned memory in little-endian byte order. */
static inline void store32(void *dst, uint32_t w) {
	uint8_t *bytes = (uint8_t *)dst;
	int i;

	for (i = 0; i < 4; i++)
		bytes[i] = (uint8_t)(w >> (8 * i));
}
+
/* Serialize the eight 32-bit CV words as 32 little-endian bytes. */
static inline void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
	size_t i;

	for (i = 0; i < 8; i++)
		store32(&bytes_out[i * 4], cv_words[i]);
}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BLAKE3_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/algs/blake3/blake3_x86-64.c b/sys/contrib/openzfs/module/icp/algs/blake3/blake3_x86-64.c
new file mode 100644
index 000000000000..48715e2128d2
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/blake3/blake3_x86-64.c
@@ -0,0 +1,248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include "blake3_impl.h"
+
#if defined(__aarch64__) || \
	(defined(__x86_64) && defined(HAVE_SSE2)) || \
	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))

/*
 * SSE2 bindings (the same assembler entry points also back the aarch64
 * and little-endian ppc64 builds).  Each wrapper brackets the assembler
 * routine with kfpu_begin()/kfpu_end() so that SIMD register use is safe
 * in kernel context.
 */

extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
	const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
	uint64_t counter, uint8_t flags);

extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
	const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
	uint64_t counter, uint8_t flags, uint8_t out[64]);

extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
	size_t num_inputs, size_t blocks, const uint32_t key[8],
	uint64_t counter, boolean_t increment_counter, uint8_t flags,
	uint8_t flags_start, uint8_t flags_end, uint8_t *out);

static void blake3_compress_in_place_sse2(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags) {
	kfpu_begin();
	zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter,
	    flags);
	kfpu_end();
}

static void blake3_compress_xof_sse2(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags, uint8_t out[64]) {
	kfpu_begin();
	zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags,
	    out);
	kfpu_end();
}

static void blake3_hash_many_sse2(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
	kfpu_begin();
	zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
	    increment_counter, flags, flags_start, flags_end, out);
	kfpu_end();
}

/* Runtime support check for the per-architecture SIMD requirement. */
static boolean_t blake3_is_sse2_supported(void)
{
#if defined(__x86_64)
	return (kfpu_allowed() && zfs_sse2_available());
#elif defined(__PPC64__)
	return (kfpu_allowed() && zfs_vsx_available());
#else
	return (kfpu_allowed());
#endif
}

const blake3_impl_ops_t blake3_sse2_impl = {
	.compress_in_place = blake3_compress_in_place_sse2,
	.compress_xof = blake3_compress_xof_sse2,
	.hash_many = blake3_hash_many_sse2,
	.is_supported = blake3_is_sse2_supported,
	.degree = 4,
	.name = "sse2"
};
#endif
+
/*
 * SSE4.1 bindings.
 *
 * Fix: this section was guarded with HAVE_SSE2, while the declaration of
 * blake3_sse41_impl in blake3_impl.h and its entry in the blake3_impls
 * table in blake3_impl.c are both guarded with HAVE_SSE4_1.  Building
 * with SSE2 but without SSE4.1 would define an ops table referencing
 * assembler routines that are never assembled.  Guard with HAVE_SSE4_1
 * here as well, matching the header and the implementation table.
 */
#if defined(__aarch64__) || \
	(defined(__x86_64) && defined(HAVE_SSE4_1)) || \
	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))

extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
	const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
	uint64_t counter, uint8_t flags);

extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
	const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
	uint64_t counter, uint8_t flags, uint8_t out[64]);

extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
	size_t num_inputs, size_t blocks, const uint32_t key[8],
	uint64_t counter, boolean_t increment_counter, uint8_t flags,
	uint8_t flags_start, uint8_t flags_end, uint8_t *out);

/* Wrappers bracket the assembler with the kernel FPU guard. */
static void blake3_compress_in_place_sse41(uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags) {
	kfpu_begin();
	zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter,
	    flags);
	kfpu_end();
}

static void blake3_compress_xof_sse41(const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags, uint8_t out[64]) {
	kfpu_begin();
	zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags,
	    out);
	kfpu_end();
}

static void blake3_hash_many_sse41(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
	kfpu_begin();
	zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
	    increment_counter, flags, flags_start, flags_end, out);
	kfpu_end();
}

/* Runtime support check for the per-architecture SIMD requirement. */
static boolean_t blake3_is_sse41_supported(void)
{
#if defined(__x86_64)
	return (kfpu_allowed() && zfs_sse4_1_available());
#elif defined(__PPC64__)
	return (kfpu_allowed() && zfs_vsx_available());
#else
	return (kfpu_allowed());
#endif
}

const blake3_impl_ops_t blake3_sse41_impl = {
	.compress_in_place = blake3_compress_in_place_sse41,
	.compress_xof = blake3_compress_xof_sse41,
	.hash_many = blake3_hash_many_sse41,
	.is_supported = blake3_is_sse41_supported,
	.degree = 4,
	.name = "sse41"
};
#endif
+
+#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
+extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+static void blake3_hash_many_avx2(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+ kfpu_begin();
+ zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end, out);
+ kfpu_end();
+}
+
+static boolean_t blake3_is_avx2_supported(void)
+{
+ return (kfpu_allowed() && zfs_sse4_1_available() &&
+ zfs_avx2_available());
+}
+
+const blake3_impl_ops_t blake3_avx2_impl = {
+ .compress_in_place = blake3_compress_in_place_sse41,
+ .compress_xof = blake3_compress_xof_sse41,
+ .hash_many = blake3_hash_many_avx2,
+ .is_supported = blake3_is_avx2_supported,
+ .degree = 8,
+ .name = "avx2"
+};
+#endif
+
+#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
+extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags);
+
+extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags, uint8_t out[64]);
+
+extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+static void blake3_compress_in_place_avx512(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags) {
+ kfpu_begin();
+ zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter,
+ flags);
+ kfpu_end();
+}
+
+static void blake3_compress_xof_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags, uint8_t out[64]) {
+ kfpu_begin();
+ zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags,
+ out);
+ kfpu_end();
+}
+
+static void blake3_hash_many_avx512(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+ kfpu_begin();
+ zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end, out);
+ kfpu_end();
+}
+
+static boolean_t blake3_is_avx512_supported(void)
+{
+ return (kfpu_allowed() && zfs_avx512f_available() &&
+ zfs_avx512vl_available());
+}
+
+const blake3_impl_ops_t blake3_avx512_impl = {
+ .compress_in_place = blake3_compress_in_place_avx512,
+ .compress_xof = blake3_compress_xof_avx512,
+ .hash_many = blake3_hash_many_avx512,
+ .is_supported = blake3_is_avx512_supported,
+ .degree = 16,
+ .name = "avx512"
+};
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
new file mode 100644
index 000000000000..59a4d9afd437
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
@@ -0,0 +1,2450 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ *
+ * This is converted assembly: SSE2 -> ARMv8-A
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if defined(__aarch64__)
+ .text
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI0_0:
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
+.LCPI0_1:
+ .xword 0
+ .xword -4294967296
+.LCPI0_2:
+ .xword -1
+ .xword 4294967295
+ .text
+ .globl zfs_blake3_compress_in_place_sse2
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse2,@function
+zfs_blake3_compress_in_place_sse2:
+ .cfi_startproc
+ ldp q3, q2, [x0]
+ ldp q5, q6, [x1]
+ add x10, x1, #32
+ lsr x11, x3, #32
+ fmov s4, w3
+ ld2 { v17.4s, v18.4s }, [x10]
+ adrp x10, .LCPI0_2
+ and w8, w2, #0xff
+ mov v4.s[1], w11
+ ldr q1, [x10, :lo12:.LCPI0_2]
+ and w9, w4, #0xff
+ adrp x12, .LCPI0_0
+ mov v4.s[2], w8
+ uzp1 v19.4s, v5.4s, v6.4s
+ add v3.4s, v2.4s, v3.4s
+ ldr q7, [x12, :lo12:.LCPI0_0]
+ mov v4.s[3], w9
+ add v3.4s, v3.4s, v19.4s
+ uzp2 v5.4s, v5.4s, v6.4s
+ ext v21.16b, v18.16b, v18.16b, #12
+ uzp1 v6.4s, v19.4s, v19.4s
+ ext v22.16b, v19.16b, v19.16b, #12
+ eor v4.16b, v3.16b, v4.16b
+ ext v20.16b, v17.16b, v17.16b, #12
+ ext v6.16b, v6.16b, v19.16b, #8
+ ext v19.16b, v19.16b, v22.16b, #12
+ zip1 v22.2d, v21.2d, v5.2d
+ rev32 v24.8h, v4.8h
+ mov v4.16b, v1.16b
+ zip2 v23.4s, v5.4s, v21.4s
+ uzp2 v6.4s, v6.4s, v5.4s
+ bsl v4.16b, v22.16b, v20.16b
+ add v3.4s, v3.4s, v5.4s
+ zip1 v5.4s, v23.4s, v20.4s
+ zip1 v22.4s, v20.4s, v23.4s
+ add v23.4s, v24.4s, v7.4s
+ ext v7.16b, v6.16b, v6.16b, #4
+ ext v25.16b, v4.16b, v4.16b, #12
+ ext v5.16b, v22.16b, v5.16b, #8
+ eor v2.16b, v23.16b, v2.16b
+ uzp1 v4.4s, v4.4s, v25.4s
+ uzp1 v22.4s, v7.4s, v7.4s
+ ext v25.16b, v7.16b, v7.16b, #12
+ ext v22.16b, v22.16b, v7.16b, #8
+ ext v7.16b, v7.16b, v25.16b, #12
+ ushr v25.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v2.16b, v2.16b, v25.16b
+ add v3.4s, v3.4s, v2.4s
+ eor v24.16b, v3.16b, v24.16b
+ add v3.4s, v3.4s, v17.4s
+ ushr v17.4s, v24.4s, #8
+ shl v18.4s, v24.4s, #24
+ orr v17.16b, v18.16b, v17.16b
+ add v18.4s, v17.4s, v23.4s
+ eor v2.16b, v18.16b, v2.16b
+ ushr v23.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ext v3.16b, v3.16b, v3.16b, #12
+ orr v2.16b, v2.16b, v23.16b
+ ext v17.16b, v17.16b, v17.16b, #8
+ add v3.4s, v2.4s, v3.4s
+ adrp x11, .LCPI0_1
+ eor v17.16b, v3.16b, v17.16b
+ ldr q16, [x11, :lo12:.LCPI0_1]
+ ext v18.16b, v18.16b, v18.16b, #4
+ rev32 v24.8h, v17.8h
+ movi v0.2d, #0xffffffff00000000
+ add v23.4s, v3.4s, v21.4s
+ mov v21.s[1], v20.s[2]
+ add v20.4s, v18.4s, v24.4s
+ bit v19.16b, v21.16b, v0.16b
+ eor v3.16b, v20.16b, v2.16b
+ uzp2 v2.4s, v22.4s, v19.4s
+ zip1 v17.2d, v5.2d, v19.2d
+ zip2 v18.4s, v19.4s, v5.4s
+ ushr v21.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ ext v22.16b, v2.16b, v2.16b, #4
+ bsl v16.16b, v4.16b, v17.16b
+ zip1 v17.4s, v18.4s, v4.4s
+ zip1 v18.4s, v4.4s, v18.4s
+ orr v21.16b, v3.16b, v21.16b
+ ext v25.16b, v16.16b, v16.16b, #12
+ ext v3.16b, v18.16b, v17.16b, #8
+ uzp1 v18.4s, v22.4s, v22.4s
+ ext v26.16b, v22.16b, v22.16b, #12
+ add v23.4s, v23.4s, v21.4s
+ uzp1 v17.4s, v16.4s, v25.4s
+ ext v16.16b, v18.16b, v22.16b, #8
+ ext v18.16b, v22.16b, v26.16b, #12
+ eor v22.16b, v23.16b, v24.16b
+ add v6.4s, v23.4s, v6.4s
+ ushr v23.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ orr v22.16b, v22.16b, v23.16b
+ add v20.4s, v22.4s, v20.4s
+ eor v21.16b, v20.16b, v21.16b
+ ushr v23.4s, v21.4s, #7
+ shl v21.4s, v21.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v21.16b, v21.16b, v23.16b
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v6.4s, v21.4s, v6.4s
+ eor v22.16b, v6.16b, v22.16b
+ ext v20.16b, v20.16b, v20.16b, #12
+ add v6.4s, v6.4s, v19.4s
+ rev32 v19.8h, v22.8h
+ add v20.4s, v20.4s, v19.4s
+ eor v21.16b, v20.16b, v21.16b
+ ushr v22.4s, v21.4s, #12
+ shl v21.4s, v21.4s, #20
+ orr v21.16b, v21.16b, v22.16b
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ ushr v22.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v22.16b
+ add v20.4s, v19.4s, v20.4s
+ eor v21.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ ushr v22.4s, v21.4s, #7
+ shl v21.4s, v21.4s, #25
+ add v6.4s, v6.4s, v4.4s
+ orr v21.16b, v21.16b, v22.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ ext v20.16b, v20.16b, v20.16b, #4
+ rev32 v19.8h, v19.8h
+ add v20.4s, v20.4s, v19.4s
+ add v6.4s, v6.4s, v5.4s
+ mov v5.s[1], v4.s[2]
+ eor v4.16b, v20.16b, v21.16b
+ ushr v21.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ orr v21.16b, v4.16b, v21.16b
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ add v2.4s, v6.4s, v2.4s
+ ushr v6.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v6.16b, v19.16b, v6.16b
+ add v19.4s, v6.4s, v20.4s
+ eor v20.16b, v19.16b, v21.16b
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v20.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v2.4s, v20.4s, v2.4s
+ eor v6.16b, v2.16b, v6.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v6.4s
+ mov v22.16b, v0.16b
+ eor v20.16b, v19.16b, v20.16b
+ bsl v22.16b, v5.16b, v7.16b
+ ushr v21.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ add v2.4s, v2.4s, v22.4s
+ orr v20.16b, v20.16b, v21.16b
+ add v2.4s, v2.4s, v20.4s
+ eor v6.16b, v2.16b, v6.16b
+ ushr v21.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v6.16b, v6.16b, v21.16b
+ add v19.4s, v6.4s, v19.4s
+ eor v20.16b, v19.16b, v20.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ add v2.4s, v2.4s, v17.4s
+ orr v20.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v2.4s, v2.4s, v20.4s
+ eor v6.16b, v2.16b, v6.16b
+ uzp2 v5.4s, v16.4s, v22.4s
+ zip1 v7.2d, v3.2d, v22.2d
+ zip2 v16.4s, v22.4s, v3.4s
+ ext v19.16b, v19.16b, v19.16b, #4
+ rev32 v22.8h, v6.8h
+ ext v23.16b, v5.16b, v5.16b, #4
+ bif v7.16b, v17.16b, v1.16b
+ zip1 v24.4s, v16.4s, v17.4s
+ zip1 v16.4s, v17.4s, v16.4s
+ add v21.4s, v2.4s, v3.4s
+ mov v3.s[1], v17.s[2]
+ add v17.4s, v19.4s, v22.4s
+ mov v19.16b, v0.16b
+ ext v25.16b, v7.16b, v7.16b, #12
+ ext v4.16b, v16.16b, v24.16b, #8
+ uzp1 v16.4s, v23.4s, v23.4s
+ bsl v19.16b, v3.16b, v18.16b
+ eor v2.16b, v17.16b, v20.16b
+ uzp1 v7.4s, v7.4s, v25.4s
+ ext v25.16b, v16.16b, v23.16b, #8
+ zip1 v3.2d, v4.2d, v19.2d
+ ushr v20.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ext v24.16b, v23.16b, v23.16b, #12
+ uzp2 v6.4s, v25.4s, v19.4s
+ zip2 v18.4s, v19.4s, v4.4s
+ bif v3.16b, v7.16b, v1.16b
+ orr v20.16b, v2.16b, v20.16b
+ ext v16.16b, v23.16b, v24.16b, #12
+ ext v23.16b, v6.16b, v6.16b, #4
+ zip1 v24.4s, v18.4s, v7.4s
+ zip1 v18.4s, v7.4s, v18.4s
+ ext v25.16b, v3.16b, v3.16b, #12
+ add v21.4s, v21.4s, v20.4s
+ ext v2.16b, v18.16b, v24.16b, #8
+ uzp1 v18.4s, v23.4s, v23.4s
+ ext v24.16b, v23.16b, v23.16b, #12
+ uzp1 v3.4s, v3.4s, v25.4s
+ eor v22.16b, v21.16b, v22.16b
+ ext v25.16b, v18.16b, v23.16b, #8
+ dup v18.4s, v2.s[3]
+ ext v23.16b, v23.16b, v24.16b, #12
+ add v5.4s, v21.4s, v5.4s
+ trn1 v21.4s, v3.4s, v3.4s
+ ushr v24.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ ext v18.16b, v21.16b, v18.16b, #8
+ orr v21.16b, v22.16b, v24.16b
+ add v17.4s, v21.4s, v17.4s
+ eor v20.16b, v17.16b, v20.16b
+ ushr v22.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v20.16b, v20.16b, v22.16b
+ ext v21.16b, v21.16b, v21.16b, #8
+ add v5.4s, v20.4s, v5.4s
+ eor v21.16b, v5.16b, v21.16b
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v5.4s, v5.4s, v19.4s
+ rev32 v19.8h, v21.8h
+ add v17.4s, v17.4s, v19.4s
+ eor v20.16b, v17.16b, v20.16b
+ ushr v21.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v20.16b, v20.16b, v21.16b
+ add v5.4s, v5.4s, v20.4s
+ eor v19.16b, v5.16b, v19.16b
+ ushr v21.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v21.16b
+ add v17.4s, v19.4s, v17.4s
+ eor v20.16b, v17.16b, v20.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ add v5.4s, v5.4s, v7.4s
+ orr v20.16b, v20.16b, v21.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v5.4s, v5.4s, v20.4s
+ eor v19.16b, v5.16b, v19.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ rev32 v22.8h, v19.8h
+ add v21.4s, v5.4s, v4.4s
+ mov v4.s[1], v7.s[2]
+ add v19.4s, v17.4s, v22.4s
+ bit v16.16b, v4.16b, v0.16b
+ eor v5.16b, v19.16b, v20.16b
+ uzp2 v4.4s, v25.4s, v16.4s
+ zip1 v7.2d, v2.2d, v16.2d
+ zip2 v17.4s, v16.4s, v2.4s
+ ushr v20.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ext v24.16b, v4.16b, v4.16b, #4
+ bif v7.16b, v3.16b, v1.16b
+ zip1 v25.4s, v17.4s, v3.4s
+ zip1 v17.4s, v3.4s, v17.4s
+ orr v20.16b, v5.16b, v20.16b
+ ext v26.16b, v7.16b, v7.16b, #12
+ ext v5.16b, v17.16b, v25.16b, #8
+ uzp1 v17.4s, v24.4s, v24.4s
+ ext v25.16b, v24.16b, v24.16b, #12
+ bit v23.16b, v18.16b, v0.16b
+ add v21.4s, v21.4s, v20.4s
+ uzp1 v7.4s, v7.4s, v26.4s
+ ext v26.16b, v17.16b, v24.16b, #8
+ ext v17.16b, v24.16b, v25.16b, #12
+ eor v22.16b, v21.16b, v22.16b
+ add v6.4s, v21.4s, v6.4s
+ zip1 v21.2d, v5.2d, v23.2d
+ zip2 v24.4s, v23.4s, v5.4s
+ bif v21.16b, v7.16b, v1.16b
+ zip1 v1.4s, v24.4s, v7.4s
+ zip1 v24.4s, v7.4s, v24.4s
+ ext v1.16b, v24.16b, v1.16b, #8
+ ushr v24.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ orr v22.16b, v22.16b, v24.16b
+ add v19.4s, v22.4s, v19.4s
+ ext v24.16b, v21.16b, v21.16b, #12
+ eor v20.16b, v19.16b, v20.16b
+ uzp1 v21.4s, v21.4s, v24.4s
+ ushr v24.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ orr v20.16b, v20.16b, v24.16b
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v6.4s, v20.4s, v6.4s
+ eor v22.16b, v6.16b, v22.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ add v6.4s, v6.4s, v16.4s
+ rev32 v16.8h, v22.8h
+ add v19.4s, v19.4s, v16.4s
+ eor v20.16b, v19.16b, v20.16b
+ ushr v22.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v20.16b, v20.16b, v22.16b
+ add v6.4s, v6.4s, v20.4s
+ eor v16.16b, v6.16b, v16.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ add v3.4s, v6.4s, v3.4s
+ ushr v6.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ orr v6.16b, v16.16b, v6.16b
+ add v16.4s, v6.4s, v19.4s
+ eor v19.16b, v16.16b, v20.16b
+ ushr v20.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v19.16b, v19.16b, v20.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v3.4s, v3.4s, v19.4s
+ eor v6.16b, v3.16b, v6.16b
+ ext v16.16b, v16.16b, v16.16b, #4
+ add v2.4s, v3.4s, v2.4s
+ rev32 v3.8h, v6.8h
+ add v6.4s, v16.4s, v3.4s
+ eor v16.16b, v6.16b, v19.16b
+ ushr v19.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ orr v16.16b, v16.16b, v19.16b
+ add v2.4s, v2.4s, v16.4s
+ eor v3.16b, v2.16b, v3.16b
+ add v2.4s, v2.4s, v4.4s
+ ushr v4.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v4.16b
+ add v4.4s, v3.4s, v6.4s
+ eor v6.16b, v4.16b, v16.16b
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v6.16b, v6.16b, v16.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v6.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ rev32 v3.8h, v3.8h
+ add v4.4s, v4.4s, v3.4s
+ eor v6.16b, v4.16b, v6.16b
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ add v2.4s, v2.4s, v23.4s
+ orr v6.16b, v6.16b, v16.16b
+ add v2.4s, v2.4s, v6.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v16.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v16.16b
+ add v4.4s, v3.4s, v4.4s
+ eor v6.16b, v4.16b, v6.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ add v2.4s, v2.4s, v7.4s
+ orr v6.16b, v6.16b, v16.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v2.4s, v6.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ rev32 v3.8h, v3.8h
+ add v2.4s, v2.4s, v5.4s
+ mov v5.s[1], v7.s[2]
+ add v4.4s, v4.4s, v3.4s
+ bsl v0.16b, v5.16b, v17.16b
+ eor v5.16b, v4.16b, v6.16b
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v6.16b
+ add v2.4s, v2.4s, v5.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v6.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v6.16b
+ add v4.4s, v3.4s, v4.4s
+ uzp2 v18.4s, v26.4s, v18.4s
+ eor v5.16b, v4.16b, v5.16b
+ add v2.4s, v2.4s, v18.4s
+ ushr v6.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v5.16b, v5.16b, v6.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v5.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ add v0.4s, v2.4s, v0.4s
+ rev32 v2.8h, v3.8h
+ add v3.4s, v4.4s, v2.4s
+ eor v4.16b, v3.16b, v5.16b
+ ushr v5.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ orr v4.16b, v4.16b, v5.16b
+ add v0.4s, v0.4s, v4.4s
+ eor v2.16b, v0.16b, v2.16b
+ ushr v5.4s, v2.4s, #8
+ shl v2.4s, v2.4s, #24
+ orr v2.16b, v2.16b, v5.16b
+ add v3.4s, v2.4s, v3.4s
+ eor v4.16b, v3.16b, v4.16b
+ ext v0.16b, v0.16b, v0.16b, #12
+ ushr v5.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ add v0.4s, v0.4s, v21.4s
+ orr v4.16b, v4.16b, v5.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ add v0.4s, v0.4s, v4.4s
+ eor v2.16b, v0.16b, v2.16b
+ ext v3.16b, v3.16b, v3.16b, #4
+ add v0.4s, v0.4s, v1.4s
+ rev32 v1.8h, v2.8h
+ add v2.4s, v3.4s, v1.4s
+ eor v3.16b, v2.16b, v4.16b
+ ushr v4.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ orr v3.16b, v3.16b, v4.16b
+ add v0.4s, v0.4s, v3.4s
+ eor v1.16b, v0.16b, v1.16b
+ ushr v4.4s, v1.4s, #8
+ shl v1.4s, v1.4s, #24
+ orr v1.16b, v1.16b, v4.16b
+ add v2.4s, v1.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v1.16b, v1.16b, v1.16b, #8
+ eor v0.16b, v2.16b, v0.16b
+ orr v2.16b, v3.16b, v4.16b
+ eor v1.16b, v2.16b, v1.16b
+ stp q0, q1, [x0]
+ ret
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI1_0:
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
+.LCPI1_1:
+ .xword 0
+ .xword -4294967296
+.LCPI1_2:
+ .xword -1
+ .xword 4294967295
+ .text
+ .globl zfs_blake3_compress_xof_sse2
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse2,@function
+zfs_blake3_compress_xof_sse2:
+ .cfi_startproc
+ ldp q3, q2, [x0]
+ ldp q5, q6, [x1]
+ add x10, x1, #32
+ lsr x11, x3, #32
+ fmov s4, w3
+ ld2 { v17.4s, v18.4s }, [x10]
+ adrp x10, .LCPI1_2
+ and w8, w2, #0xff
+ mov v4.s[1], w11
+ ldr q1, [x10, :lo12:.LCPI1_2]
+ and w9, w4, #0xff
+ adrp x12, .LCPI1_0
+ mov v4.s[2], w8
+ uzp1 v19.4s, v5.4s, v6.4s
+ add v3.4s, v2.4s, v3.4s
+ ldr q7, [x12, :lo12:.LCPI1_0]
+ mov v4.s[3], w9
+ add v3.4s, v3.4s, v19.4s
+ uzp2 v5.4s, v5.4s, v6.4s
+ ext v21.16b, v18.16b, v18.16b, #12
+ uzp1 v6.4s, v19.4s, v19.4s
+ ext v22.16b, v19.16b, v19.16b, #12
+ eor v4.16b, v3.16b, v4.16b
+ ext v20.16b, v17.16b, v17.16b, #12
+ ext v6.16b, v6.16b, v19.16b, #8
+ ext v19.16b, v19.16b, v22.16b, #12
+ zip1 v22.2d, v21.2d, v5.2d
+ rev32 v24.8h, v4.8h
+ mov v4.16b, v1.16b
+ zip2 v23.4s, v5.4s, v21.4s
+ uzp2 v6.4s, v6.4s, v5.4s
+ bsl v4.16b, v22.16b, v20.16b
+ add v3.4s, v3.4s, v5.4s
+ zip1 v5.4s, v23.4s, v20.4s
+ zip1 v22.4s, v20.4s, v23.4s
+ add v23.4s, v24.4s, v7.4s
+ ext v7.16b, v6.16b, v6.16b, #4
+ ext v25.16b, v4.16b, v4.16b, #12
+ ext v5.16b, v22.16b, v5.16b, #8
+ eor v2.16b, v23.16b, v2.16b
+ uzp1 v4.4s, v4.4s, v25.4s
+ uzp1 v22.4s, v7.4s, v7.4s
+ ext v25.16b, v7.16b, v7.16b, #12
+ ext v22.16b, v22.16b, v7.16b, #8
+ ext v7.16b, v7.16b, v25.16b, #12
+ ushr v25.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v2.16b, v2.16b, v25.16b
+ add v3.4s, v3.4s, v2.4s
+ eor v24.16b, v3.16b, v24.16b
+ add v3.4s, v3.4s, v17.4s
+ ushr v17.4s, v24.4s, #8
+ shl v18.4s, v24.4s, #24
+ orr v17.16b, v18.16b, v17.16b
+ add v18.4s, v17.4s, v23.4s
+ eor v2.16b, v18.16b, v2.16b
+ ushr v23.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ext v3.16b, v3.16b, v3.16b, #12
+ orr v2.16b, v2.16b, v23.16b
+ ext v17.16b, v17.16b, v17.16b, #8
+ add v3.4s, v2.4s, v3.4s
+ adrp x11, .LCPI1_1
+ eor v17.16b, v3.16b, v17.16b
+ ldr q16, [x11, :lo12:.LCPI1_1]
+ ext v18.16b, v18.16b, v18.16b, #4
+ rev32 v24.8h, v17.8h
+ movi v0.2d, #0xffffffff00000000
+ add v23.4s, v3.4s, v21.4s
+ mov v21.s[1], v20.s[2]
+ add v20.4s, v18.4s, v24.4s
+ bit v19.16b, v21.16b, v0.16b
+ eor v3.16b, v20.16b, v2.16b
+ uzp2 v2.4s, v22.4s, v19.4s
+ zip1 v17.2d, v5.2d, v19.2d
+ zip2 v18.4s, v19.4s, v5.4s
+ ushr v21.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ ext v22.16b, v2.16b, v2.16b, #4
+ bsl v16.16b, v4.16b, v17.16b
+ zip1 v17.4s, v18.4s, v4.4s
+ zip1 v18.4s, v4.4s, v18.4s
+ orr v21.16b, v3.16b, v21.16b
+ ext v25.16b, v16.16b, v16.16b, #12
+ ext v3.16b, v18.16b, v17.16b, #8
+ uzp1 v18.4s, v22.4s, v22.4s
+ ext v26.16b, v22.16b, v22.16b, #12
+ add v23.4s, v23.4s, v21.4s
+ uzp1 v17.4s, v16.4s, v25.4s
+ ext v16.16b, v18.16b, v22.16b, #8
+ ext v18.16b, v22.16b, v26.16b, #12
+ eor v22.16b, v23.16b, v24.16b
+ add v6.4s, v23.4s, v6.4s
+ ushr v23.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ orr v22.16b, v22.16b, v23.16b
+ add v20.4s, v22.4s, v20.4s
+ eor v21.16b, v20.16b, v21.16b
+ ushr v23.4s, v21.4s, #7
+ shl v21.4s, v21.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v21.16b, v21.16b, v23.16b
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v6.4s, v21.4s, v6.4s
+ eor v22.16b, v6.16b, v22.16b
+ ext v20.16b, v20.16b, v20.16b, #12
+ add v6.4s, v6.4s, v19.4s
+ rev32 v19.8h, v22.8h
+ add v20.4s, v20.4s, v19.4s
+ eor v21.16b, v20.16b, v21.16b
+ ushr v22.4s, v21.4s, #12
+ shl v21.4s, v21.4s, #20
+ orr v21.16b, v21.16b, v22.16b
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ ushr v22.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v22.16b
+ add v20.4s, v19.4s, v20.4s
+ eor v21.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ ushr v22.4s, v21.4s, #7
+ shl v21.4s, v21.4s, #25
+ add v6.4s, v6.4s, v4.4s
+ orr v21.16b, v21.16b, v22.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ ext v20.16b, v20.16b, v20.16b, #4
+ rev32 v19.8h, v19.8h
+ add v20.4s, v20.4s, v19.4s
+ add v6.4s, v6.4s, v5.4s
+ mov v5.s[1], v4.s[2]
+ eor v4.16b, v20.16b, v21.16b
+ ushr v21.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ orr v21.16b, v4.16b, v21.16b
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ add v2.4s, v6.4s, v2.4s
+ ushr v6.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v6.16b, v19.16b, v6.16b
+ add v19.4s, v6.4s, v20.4s
+ eor v20.16b, v19.16b, v21.16b
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v20.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v2.4s, v20.4s, v2.4s
+ eor v6.16b, v2.16b, v6.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v6.4s
+ mov v22.16b, v0.16b
+ eor v20.16b, v19.16b, v20.16b
+ bsl v22.16b, v5.16b, v7.16b
+ ushr v21.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ add v2.4s, v2.4s, v22.4s
+ orr v20.16b, v20.16b, v21.16b
+ add v2.4s, v2.4s, v20.4s
+ eor v6.16b, v2.16b, v6.16b
+ ushr v21.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v6.16b, v6.16b, v21.16b
+ add v19.4s, v6.4s, v19.4s
+ eor v20.16b, v19.16b, v20.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ add v2.4s, v2.4s, v17.4s
+ orr v20.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v2.4s, v2.4s, v20.4s
+ eor v6.16b, v2.16b, v6.16b
+ uzp2 v5.4s, v16.4s, v22.4s
+ zip1 v7.2d, v3.2d, v22.2d
+ zip2 v16.4s, v22.4s, v3.4s
+ ext v19.16b, v19.16b, v19.16b, #4
+ rev32 v22.8h, v6.8h
+ ext v23.16b, v5.16b, v5.16b, #4
+ bif v7.16b, v17.16b, v1.16b
+ zip1 v24.4s, v16.4s, v17.4s
+ zip1 v16.4s, v17.4s, v16.4s
+ add v21.4s, v2.4s, v3.4s
+ mov v3.s[1], v17.s[2]
+ add v17.4s, v19.4s, v22.4s
+ mov v19.16b, v0.16b
+ ext v25.16b, v7.16b, v7.16b, #12
+ ext v4.16b, v16.16b, v24.16b, #8
+ uzp1 v16.4s, v23.4s, v23.4s
+ bsl v19.16b, v3.16b, v18.16b
+ eor v2.16b, v17.16b, v20.16b
+ uzp1 v7.4s, v7.4s, v25.4s
+ ext v25.16b, v16.16b, v23.16b, #8
+ zip1 v3.2d, v4.2d, v19.2d
+ ushr v20.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ext v24.16b, v23.16b, v23.16b, #12
+ uzp2 v6.4s, v25.4s, v19.4s
+ zip2 v18.4s, v19.4s, v4.4s
+ bif v3.16b, v7.16b, v1.16b
+ orr v20.16b, v2.16b, v20.16b
+ ext v16.16b, v23.16b, v24.16b, #12
+ ext v23.16b, v6.16b, v6.16b, #4
+ zip1 v24.4s, v18.4s, v7.4s
+ zip1 v18.4s, v7.4s, v18.4s
+ ext v25.16b, v3.16b, v3.16b, #12
+ add v21.4s, v21.4s, v20.4s
+ ext v2.16b, v18.16b, v24.16b, #8
+ uzp1 v18.4s, v23.4s, v23.4s
+ ext v24.16b, v23.16b, v23.16b, #12
+ uzp1 v3.4s, v3.4s, v25.4s
+ eor v22.16b, v21.16b, v22.16b
+ ext v25.16b, v18.16b, v23.16b, #8
+ dup v18.4s, v2.s[3]
+ ext v23.16b, v23.16b, v24.16b, #12
+ add v5.4s, v21.4s, v5.4s
+ trn1 v21.4s, v3.4s, v3.4s
+ ushr v24.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ ext v18.16b, v21.16b, v18.16b, #8
+ orr v21.16b, v22.16b, v24.16b
+ add v17.4s, v21.4s, v17.4s
+ eor v20.16b, v17.16b, v20.16b
+ ushr v22.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v20.16b, v20.16b, v22.16b
+ ext v21.16b, v21.16b, v21.16b, #8
+ add v5.4s, v20.4s, v5.4s
+ eor v21.16b, v5.16b, v21.16b
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v5.4s, v5.4s, v19.4s
+ rev32 v19.8h, v21.8h
+ add v17.4s, v17.4s, v19.4s
+ eor v20.16b, v17.16b, v20.16b
+ ushr v21.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v20.16b, v20.16b, v21.16b
+ add v5.4s, v5.4s, v20.4s
+ eor v19.16b, v5.16b, v19.16b
+ ushr v21.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v21.16b
+ add v17.4s, v19.4s, v17.4s
+ eor v20.16b, v17.16b, v20.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ add v5.4s, v5.4s, v7.4s
+ orr v20.16b, v20.16b, v21.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v5.4s, v5.4s, v20.4s
+ eor v19.16b, v5.16b, v19.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ rev32 v22.8h, v19.8h
+ add v21.4s, v5.4s, v4.4s
+ mov v4.s[1], v7.s[2]
+ add v19.4s, v17.4s, v22.4s
+ bit v16.16b, v4.16b, v0.16b
+ eor v5.16b, v19.16b, v20.16b
+ uzp2 v4.4s, v25.4s, v16.4s
+ zip1 v7.2d, v2.2d, v16.2d
+ zip2 v17.4s, v16.4s, v2.4s
+ ushr v20.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ext v24.16b, v4.16b, v4.16b, #4
+ bif v7.16b, v3.16b, v1.16b
+ zip1 v25.4s, v17.4s, v3.4s
+ zip1 v17.4s, v3.4s, v17.4s
+ orr v20.16b, v5.16b, v20.16b
+ ext v26.16b, v7.16b, v7.16b, #12
+ ext v5.16b, v17.16b, v25.16b, #8
+ uzp1 v17.4s, v24.4s, v24.4s
+ ext v25.16b, v24.16b, v24.16b, #12
+ bit v23.16b, v18.16b, v0.16b
+ add v21.4s, v21.4s, v20.4s
+ uzp1 v7.4s, v7.4s, v26.4s
+ ext v26.16b, v17.16b, v24.16b, #8
+ ext v17.16b, v24.16b, v25.16b, #12
+ eor v22.16b, v21.16b, v22.16b
+ add v6.4s, v21.4s, v6.4s
+ zip1 v21.2d, v5.2d, v23.2d
+ zip2 v24.4s, v23.4s, v5.4s
+ bif v21.16b, v7.16b, v1.16b
+ zip1 v1.4s, v24.4s, v7.4s
+ zip1 v24.4s, v7.4s, v24.4s
+ ext v1.16b, v24.16b, v1.16b, #8
+ ushr v24.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ orr v22.16b, v22.16b, v24.16b
+ add v19.4s, v22.4s, v19.4s
+ ext v24.16b, v21.16b, v21.16b, #12
+ eor v20.16b, v19.16b, v20.16b
+ uzp1 v21.4s, v21.4s, v24.4s
+ ushr v24.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ orr v20.16b, v20.16b, v24.16b
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v6.4s, v20.4s, v6.4s
+ eor v22.16b, v6.16b, v22.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ add v6.4s, v6.4s, v16.4s
+ rev32 v16.8h, v22.8h
+ add v19.4s, v19.4s, v16.4s
+ eor v20.16b, v19.16b, v20.16b
+ ushr v22.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v20.16b, v20.16b, v22.16b
+ add v6.4s, v6.4s, v20.4s
+ eor v16.16b, v6.16b, v16.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ add v3.4s, v6.4s, v3.4s
+ ushr v6.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ orr v6.16b, v16.16b, v6.16b
+ add v16.4s, v6.4s, v19.4s
+ eor v19.16b, v16.16b, v20.16b
+ ushr v20.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v19.16b, v19.16b, v20.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v3.4s, v3.4s, v19.4s
+ eor v6.16b, v3.16b, v6.16b
+ ext v16.16b, v16.16b, v16.16b, #4
+ add v2.4s, v3.4s, v2.4s
+ rev32 v3.8h, v6.8h
+ add v6.4s, v16.4s, v3.4s
+ eor v16.16b, v6.16b, v19.16b
+ ushr v19.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ orr v16.16b, v16.16b, v19.16b
+ add v2.4s, v2.4s, v16.4s
+ eor v3.16b, v2.16b, v3.16b
+ add v2.4s, v2.4s, v4.4s
+ ushr v4.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v4.16b
+ add v4.4s, v3.4s, v6.4s
+ eor v6.16b, v4.16b, v16.16b
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v6.16b, v6.16b, v16.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v6.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ rev32 v3.8h, v3.8h
+ add v4.4s, v4.4s, v3.4s
+ eor v6.16b, v4.16b, v6.16b
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ add v2.4s, v2.4s, v23.4s
+ orr v6.16b, v6.16b, v16.16b
+ add v2.4s, v2.4s, v6.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v16.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v16.16b
+ add v4.4s, v3.4s, v4.4s
+ eor v6.16b, v4.16b, v6.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ add v2.4s, v2.4s, v7.4s
+ orr v6.16b, v6.16b, v16.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v2.4s, v6.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ rev32 v3.8h, v3.8h
+ add v2.4s, v2.4s, v5.4s
+ mov v5.s[1], v7.s[2]
+ add v4.4s, v4.4s, v3.4s
+ bsl v0.16b, v5.16b, v17.16b
+ eor v5.16b, v4.16b, v6.16b
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v6.16b
+ add v2.4s, v2.4s, v5.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v6.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v6.16b
+ add v4.4s, v3.4s, v4.4s
+ uzp2 v18.4s, v26.4s, v18.4s
+ eor v5.16b, v4.16b, v5.16b
+ add v2.4s, v2.4s, v18.4s
+ ushr v6.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v5.16b, v5.16b, v6.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v5.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ add v0.4s, v2.4s, v0.4s
+ rev32 v2.8h, v3.8h
+ add v3.4s, v4.4s, v2.4s
+ eor v4.16b, v3.16b, v5.16b
+ ushr v5.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ orr v4.16b, v4.16b, v5.16b
+ add v0.4s, v0.4s, v4.4s
+ eor v2.16b, v0.16b, v2.16b
+ ushr v5.4s, v2.4s, #8
+ shl v2.4s, v2.4s, #24
+ orr v2.16b, v2.16b, v5.16b
+ add v3.4s, v2.4s, v3.4s
+ eor v4.16b, v3.16b, v4.16b
+ ext v0.16b, v0.16b, v0.16b, #12
+ ushr v5.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ add v0.4s, v0.4s, v21.4s
+ orr v4.16b, v4.16b, v5.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ add v0.4s, v0.4s, v4.4s
+ eor v2.16b, v0.16b, v2.16b
+ ext v3.16b, v3.16b, v3.16b, #4
+ add v0.4s, v0.4s, v1.4s
+ rev32 v1.8h, v2.8h
+ add v2.4s, v3.4s, v1.4s
+ eor v3.16b, v2.16b, v4.16b
+ ushr v4.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ orr v3.16b, v3.16b, v4.16b
+ add v0.4s, v0.4s, v3.4s
+ eor v1.16b, v0.16b, v1.16b
+ ushr v4.4s, v1.4s, #8
+ shl v1.4s, v1.4s, #24
+ orr v1.16b, v1.16b, v4.16b
+ add v2.4s, v1.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v1.16b, v1.16b, v1.16b, #8
+ ext v2.16b, v2.16b, v2.16b, #12
+ orr v3.16b, v3.16b, v4.16b
+ eor v0.16b, v2.16b, v0.16b
+ eor v3.16b, v3.16b, v1.16b
+ stp q0, q3, [x5]
+ ldr q0, [x0]
+ eor v0.16b, v0.16b, v2.16b
+ str q0, [x5, #32]
+ ldr q0, [x0, #16]
+ eor v0.16b, v0.16b, v1.16b
+ str q0, [x5, #48]
+ ret
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI2_0:
+ .word 0
+ .word 1
+ .word 2
+ .word 3
+ .text
+ .globl zfs_blake3_hash_many_sse2
+ .p2align 2
+ .type zfs_blake3_hash_many_sse2,@function
+zfs_blake3_hash_many_sse2:
+ .cfi_startproc
+ stp d15, d14, [sp, #-160]!
+ stp d13, d12, [sp, #16]
+ stp d11, d10, [sp, #32]
+ stp d9, d8, [sp, #48]
+ stp x29, x30, [sp, #64]
+ stp x28, x27, [sp, #80]
+ stp x26, x25, [sp, #96]
+ stp x24, x23, [sp, #112]
+ stp x22, x21, [sp, #128]
+ stp x20, x19, [sp, #144]
+ mov x29, sp
+ sub sp, sp, #384
+ .cfi_def_cfa w29, 160
+ .cfi_offset w19, -8
+ .cfi_offset w20, -16
+ .cfi_offset w21, -24
+ .cfi_offset w22, -32
+ .cfi_offset w23, -40
+ .cfi_offset w24, -48
+ .cfi_offset w25, -56
+ .cfi_offset w26, -64
+ .cfi_offset w27, -72
+ .cfi_offset w28, -80
+ .cfi_offset w30, -88
+ .cfi_offset w29, -96
+ .cfi_offset b8, -104
+ .cfi_offset b9, -112
+ .cfi_offset b10, -120
+ .cfi_offset b11, -128
+ .cfi_offset b12, -136
+ .cfi_offset b13, -144
+ .cfi_offset b14, -152
+ .cfi_offset b15, -160
+ ldr x26, [x29, #168]
+ ldrb w27, [x29, #160]
+ mov w19, w6
+ mov x20, x4
+ mov x22, x2
+ mov x28, x1
+ cmp x1, #4
+ mov x24, x0
+ str x3, [sp, #40]
+ b.lo .LBB2_8
+ adrp x9, .LCPI2_0
+ ldr q0, [x9, :lo12:.LCPI2_0]
+ sbfx w11, w5, #0, #1
+ dup v1.4s, w11
+ mov w9, #58983
+ mov w10, #44677
+ and v0.16b, v1.16b, v0.16b
+ mov w11, #62322
+ mov w12, #62778
+ orr w8, w7, w19
+ movk w9, #27145, lsl #16
+ movk w10, #47975, lsl #16
+ movk w11, #15470, lsl #16
+ str q0, [sp, #16]
+ orr v0.4s, #128, lsl #24
+ movk w12, #42319, lsl #16
+ str q0, [sp]
+.LBB2_2:
+ ldr x0, [sp, #40]
+ mov x13, x0
+ ld1r { v20.4s }, [x13], #4
+ add x14, x0, #8
+ add x15, x0, #12
+ add x16, x0, #16
+ add x17, x0, #20
+ add x18, x0, #24
+ add x0, x0, #28
+ ld1r { v17.4s }, [x14]
+ ld1r { v6.4s }, [x15]
+ ld1r { v8.4s }, [x16]
+ ld1r { v9.4s }, [x17]
+ ld1r { v31.4s }, [x18]
+ ld1r { v26.4s }, [x13]
+ ld1r { v15.4s }, [x0]
+ cbz x22, .LBB2_7
+ ldr q1, [sp, #16]
+ dup v0.4s, w20
+ ldp x13, x14, [x24]
+ ldp x15, x16, [x24, #16]
+ add v1.4s, v0.4s, v1.4s
+ movi v0.4s, #128, lsl #24
+ str q1, [sp, #64]
+ eor v0.16b, v1.16b, v0.16b
+ ldr q1, [sp]
+ lsr x18, x20, #32
+ mov x17, xzr
+ cmgt v0.4s, v1.4s, v0.4s
+ dup v1.4s, w18
+ sub v0.4s, v1.4s, v0.4s
+ mov w18, w8
+ str q0, [sp, #48]
+.LBB2_4:
+ mov w2, #16
+ bfi x2, x17, #6, #58
+ ldr q1, [x13, x2]
+ ldr q3, [x14, x2]
+ ldr q2, [x15, x2]
+ ldr q4, [x16, x2]
+ mov w2, #32
+ bfi x2, x17, #6, #58
+ ldr q5, [x13, x2]
+ ldr q18, [x14, x2]
+ ldr q19, [x15, x2]
+ ldr q23, [x16, x2]
+ mov w2, #48
+ lsl x3, x17, #6
+ bfi x2, x17, #6, #58
+ add x17, x17, #1
+ ldr q0, [x13, x3]
+ ldr q21, [x14, x3]
+ ldr q7, [x15, x3]
+ ldr q16, [x16, x3]
+ cmp x17, x22
+ ldr q13, [x13, x2]
+ ldr q14, [x14, x2]
+ ldr q29, [x15, x2]
+ ldr q10, [x16, x2]
+ csel w2, w27, wzr, eq
+ orr w18, w2, w18
+ mov x0, xzr
+ and w18, w18, #0xff
+ add x3, x3, #256
+.LBB2_5:
+ ldr x2, [x24, x0]
+ add x0, x0, #8
+ cmp x0, #32
+ add x2, x2, x3
+ prfm pldl1keep, [x2]
+ b.ne .LBB2_5
+ dup v22.4s, w18
+ str q22, [sp, #192]
+ zip1 v27.4s, v0.4s, v21.4s
+ zip2 v21.4s, v0.4s, v21.4s
+ zip1 v0.4s, v7.4s, v16.4s
+ zip2 v22.4s, v7.4s, v16.4s
+ zip1 v7.4s, v1.4s, v3.4s
+ zip1 v25.4s, v2.4s, v4.4s
+ zip2 v16.4s, v2.4s, v4.4s
+ zip1 v11.4s, v19.4s, v23.4s
+ zip2 v12.4s, v19.4s, v23.4s
+ zip1 v19.4s, v13.4s, v14.4s
+ zip2 v23.4s, v13.4s, v14.4s
+ zip1 v13.4s, v29.4s, v10.4s
+ zip2 v14.4s, v29.4s, v10.4s
+ add v10.4s, v20.4s, v8.4s
+ add v2.4s, v26.4s, v9.4s
+ ext v20.16b, v22.16b, v21.16b, #8
+ ext v26.16b, v25.16b, v7.16b, #8
+ zip2 v24.4s, v1.4s, v3.4s
+ add v1.4s, v6.4s, v15.4s
+ ext v6.16b, v0.16b, v27.16b, #8
+ ext v20.16b, v21.16b, v20.16b, #8
+ mov v21.d[1], v22.d[0]
+ ext v22.16b, v7.16b, v26.16b, #8
+ mov v7.d[1], v25.d[0]
+ add v3.4s, v17.4s, v31.4s
+ str q1, [sp, #144]
+ ext v1.16b, v27.16b, v6.16b, #8
+ mov v6.16b, v7.16b
+ zip1 v28.4s, v5.4s, v18.4s
+ stur q1, [x29, #-80]
+ mov v1.16b, v27.16b
+ mov v27.16b, v24.16b
+ add v3.4s, v3.4s, v6.4s
+ ldr q6, [sp, #64]
+ ext v29.16b, v16.16b, v24.16b, #8
+ mov v1.d[1], v0.d[0]
+ ext v0.16b, v11.16b, v28.16b, #8
+ mov v27.d[1], v16.d[0]
+ ext v16.16b, v14.16b, v23.16b, #8
+ stur q7, [x29, #-144]
+ ext v7.16b, v24.16b, v29.16b, #8
+ ext v29.16b, v28.16b, v0.16b, #8
+ ext v0.16b, v23.16b, v16.16b, #8
+ mov v23.d[1], v14.d[0]
+ stp q0, q23, [sp, #80]
+ add v0.4s, v10.4s, v1.4s
+ eor v16.16b, v0.16b, v6.16b
+ ldr q6, [sp, #48]
+ add v2.4s, v2.4s, v21.4s
+ mov v28.d[1], v11.d[0]
+ zip2 v18.4s, v5.4s, v18.4s
+ eor v10.16b, v2.16b, v6.16b
+ movi v6.4s, #64
+ eor v11.16b, v3.16b, v6.16b
+ ldr q6, [sp, #144]
+ dup v17.4s, w9
+ ext v30.16b, v12.16b, v18.16b, #8
+ rev32 v16.8h, v16.8h
+ dup v5.4s, w10
+ ext v25.16b, v18.16b, v30.16b, #8
+ mov v30.16b, v23.16b
+ mov v23.16b, v1.16b
+ str q1, [sp, #160]
+ rev32 v10.8h, v10.8h
+ add v1.4s, v16.4s, v17.4s
+ add v17.4s, v6.4s, v27.4s
+ ldr q6, [sp, #192]
+ dup v4.4s, w11
+ rev32 v11.8h, v11.8h
+ add v5.4s, v10.4s, v5.4s
+ eor v8.16b, v1.16b, v8.16b
+ stur q21, [x29, #-128]
+ mov v18.d[1], v12.d[0]
+ add v4.4s, v11.4s, v4.4s
+ eor v9.16b, v5.16b, v9.16b
+ ushr v12.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ ldur q21, [x29, #-80]
+ ext v26.16b, v13.16b, v19.16b, #8
+ eor v31.16b, v4.16b, v31.16b
+ orr v8.16b, v8.16b, v12.16b
+ ushr v12.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ ext v26.16b, v19.16b, v26.16b, #8
+ mov v19.d[1], v13.d[0]
+ orr v9.16b, v9.16b, v12.16b
+ ushr v12.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v17.16b, v6.16b
+ orr v31.16b, v31.16b, v12.16b
+ dup v12.4s, w12
+ rev32 v13.8h, v13.8h
+ add v12.4s, v13.4s, v12.4s
+ add v0.4s, v0.4s, v21.4s
+ eor v14.16b, v12.16b, v15.16b
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v22.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v7.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v13.16b, v17.16b, v13.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v13.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v28.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v18.4s
+ orr v14.16b, v14.16b, v15.16b
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v19.4s
+ rev32 v13.8h, v13.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v30.4s
+ add v4.4s, v4.4s, v13.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ mov v24.16b, v7.16b
+ stur q7, [x29, #-112]
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ mov v7.16b, v26.16b
+ add v3.4s, v3.4s, v26.4s
+ ldr q26, [sp, #80]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v5.4s, v11.4s
+ add v0.4s, v0.4s, v29.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v25.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v26.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v13.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ str q22, [sp, #128]
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ ldur q22, [x29, #-128]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ mov v6.16b, v18.16b
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ ldur q18, [x29, #-144]
+ orr v8.16b, v8.16b, v15.16b
+ add v0.4s, v0.4s, v22.4s
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v24.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v18.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v13.16b, v17.16b, v13.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v13.8h, v13.8h
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v13.4s
+ add v0.4s, v0.4s, v27.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v6.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v23.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v7.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v13.16b, v17.16b, v13.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v13.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v21.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v19.4s
+ orr v14.16b, v14.16b, v15.16b
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v29.4s
+ str q28, [sp, #112]
+ rev32 v13.8h, v13.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v26.4s
+ add v4.4s, v4.4s, v13.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ ldp q28, q23, [sp, #112]
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ldr q21, [sp, #96]
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v5.4s, v11.4s
+ add v0.4s, v0.4s, v25.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v23.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v21.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v28.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v13.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ mov v30.16b, v29.16b
+ mov v29.16b, v25.16b
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ ldur q25, [x29, #-112]
+ orr v8.16b, v8.16b, v15.16b
+ add v0.4s, v0.4s, v20.4s
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v7.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v25.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v13.16b, v17.16b, v13.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v13.8h, v13.8h
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v13.4s
+ add v0.4s, v0.4s, v18.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v19.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v22.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v21.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v13.16b, v17.16b, v13.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v13.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v27.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v30.4s
+ orr v14.16b, v14.16b, v15.16b
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v29.4s
+ rev32 v13.8h, v13.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v28.4s
+ add v4.4s, v4.4s, v13.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ ldr q24, [sp, #160]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v5.4s, v11.4s
+ stur q7, [x29, #-64]
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ mov v7.16b, v26.16b
+ add v3.4s, v3.4s, v26.4s
+ ldur q26, [x29, #-80]
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ add v0.4s, v0.4s, v23.4s
+ orr v8.16b, v8.16b, v15.16b
+ add v15.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v24.4s
+ eor v0.16b, v15.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ ushr v13.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v26.4s
+ orr v0.16b, v0.16b, v13.16b
+ ushr v13.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v13.16b
+ ushr v13.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v0.4s, v4.4s
+ orr v10.16b, v10.16b, v13.16b
+ ushr v13.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v13.16b
+ ushr v13.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ orr v9.16b, v9.16b, v13.16b
+ ushr v13.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ add v1.4s, v10.4s, v1.4s
+ orr v31.16b, v31.16b, v13.16b
+ eor v13.16b, v1.16b, v14.16b
+ add v5.4s, v11.4s, v5.4s
+ ushr v14.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v13.16b, v13.16b, v14.16b
+ ushr v14.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ stur q6, [x29, #-96]
+ orr v8.16b, v8.16b, v14.16b
+ add v14.4s, v15.4s, v6.4s
+ ldur q6, [x29, #-64]
+ mov v18.16b, v19.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v18.4s
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v21.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v6.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v0.16b, v17.16b, v0.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v0.8h, v0.8h
+ str q27, [sp, #176]
+ mov v27.16b, v30.16b
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v0.4s
+ add v14.4s, v14.4s, v25.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v27.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v20.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v7.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ mov v30.16b, v23.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v0.16b, v17.16b, v0.16b
+ add v1.4s, v16.4s, v1.4s
+ ldur q23, [x29, #-144]
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v0.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v23.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v29.4s
+ orr v13.16b, v13.16b, v15.16b
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v30.4s
+ rev32 v0.8h, v0.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v26.4s
+ add v4.4s, v4.4s, v0.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ ldur q22, [x29, #-128]
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ ldr q26, [sp, #176]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v5.4s, v11.4s
+ add v14.4s, v14.4s, v24.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v22.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v28.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v26.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v0.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ orr v8.16b, v8.16b, v15.16b
+ add v14.4s, v14.4s, v18.4s
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v27.4s
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v7.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v21.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v0.16b, v17.16b, v0.16b
+ add v14.4s, v14.4s, v6.4s
+ ldur q6, [x29, #-96]
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v0.8h, v0.8h
+ stur q20, [x29, #-160]
+ mov v20.16b, v29.16b
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v0.4s
+ mov v19.16b, v29.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ mov v19.16b, v28.16b
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v6.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v19.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v0.16b, v17.16b, v0.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v0.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v25.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v30.4s
+ orr v13.16b, v13.16b, v15.16b
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v24.4s
+ rev32 v0.8h, v0.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v26.4s
+ mov v29.16b, v27.16b
+ add v4.4s, v4.4s, v0.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ ldur q27, [x29, #-160]
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ldur q6, [x29, #-80]
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v5.4s, v11.4s
+ add v14.4s, v14.4s, v22.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v27.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v6.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v23.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v0.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ orr v8.16b, v8.16b, v15.16b
+ add v14.4s, v14.4s, v29.4s
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ mov v28.16b, v7.16b
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v19.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v28.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v0.16b, v17.16b, v0.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v0.8h, v0.8h
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v0.4s
+ add v14.4s, v14.4s, v21.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v30.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ add v3.4s, v3.4s, v18.4s
+ orr v10.16b, v10.16b, v15.16b
+ add v15.4s, v3.4s, v31.4s
+ eor v3.16b, v15.16b, v11.16b
+ ushr v11.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v11.16b, v3.16b, v11.16b
+ add v3.4s, v17.4s, v6.4s
+ add v17.4s, v3.4s, v13.4s
+ eor v0.16b, v17.16b, v0.16b
+ ushr v3.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ add v1.4s, v16.4s, v1.4s
+ orr v0.16b, v0.16b, v3.16b
+ eor v3.16b, v1.16b, v8.16b
+ ushr v8.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ add v5.4s, v10.4s, v5.4s
+ orr v8.16b, v3.16b, v8.16b
+ eor v3.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ ushr v9.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ mov v7.16b, v23.16b
+ mov v23.16b, v28.16b
+ mov v28.16b, v6.16b
+ orr v3.16b, v3.16b, v9.16b
+ ushr v9.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ ldur q6, [x29, #-64]
+ orr v31.16b, v31.16b, v9.16b
+ add v9.4s, v0.4s, v12.4s
+ eor v12.16b, v9.16b, v13.16b
+ ushr v13.4s, v12.4s, #7
+ shl v12.4s, v12.4s, #25
+ orr v12.16b, v12.16b, v13.16b
+ add v13.4s, v14.4s, v6.4s
+ add v13.4s, v13.4s, v3.4s
+ eor v0.16b, v13.16b, v0.16b
+ add v2.4s, v2.4s, v24.4s
+ rev32 v14.8h, v0.8h
+ add v0.4s, v2.4s, v31.4s
+ add v6.4s, v4.4s, v14.4s
+ eor v2.16b, v0.16b, v16.16b
+ eor v3.16b, v6.16b, v3.16b
+ rev32 v16.8h, v2.8h
+ ushr v4.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v2.4s, v9.4s, v16.4s
+ orr v4.16b, v3.16b, v4.16b
+ eor v3.16b, v2.16b, v31.16b
+ ushr v31.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ orr v3.16b, v3.16b, v31.16b
+ add v31.4s, v15.4s, v22.4s
+ add v31.4s, v31.4s, v12.4s
+ add v17.4s, v17.4s, v7.4s
+ eor v9.16b, v31.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ rev32 v9.8h, v9.8h
+ eor v11.16b, v17.16b, v11.16b
+ add v1.4s, v1.4s, v9.4s
+ rev32 v11.8h, v11.8h
+ eor v10.16b, v1.16b, v12.16b
+ add v5.4s, v5.4s, v11.4s
+ ushr v12.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ orr v10.16b, v10.16b, v12.16b
+ ushr v12.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ orr v8.16b, v8.16b, v12.16b
+ add v12.4s, v13.4s, v27.4s
+ add v12.4s, v12.4s, v4.4s
+ eor v13.16b, v12.16b, v14.16b
+ ldur q14, [x29, #-96]
+ mov v25.16b, v29.16b
+ add v29.4s, v12.4s, v20.4s
+ add v20.4s, v31.4s, v26.4s
+ add v0.4s, v0.4s, v14.4s
+ add v0.4s, v0.4s, v3.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v0.4s, v0.4s, v30.4s
+ ldur q30, [x29, #-112]
+ add v20.4s, v20.4s, v10.4s
+ eor v31.16b, v20.16b, v9.16b
+ add v20.4s, v20.4s, v28.4s
+ add v17.4s, v17.4s, v30.4s
+ add v17.4s, v17.4s, v8.4s
+ eor v9.16b, v17.16b, v11.16b
+ ushr v28.4s, v13.4s, #8
+ shl v11.4s, v13.4s, #24
+ orr v28.16b, v11.16b, v28.16b
+ ushr v11.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ orr v16.16b, v16.16b, v11.16b
+ ushr v11.4s, v31.4s, #8
+ shl v31.4s, v31.4s, #24
+ add v6.4s, v28.4s, v6.4s
+ orr v31.16b, v31.16b, v11.16b
+ ushr v11.4s, v9.4s, #8
+ shl v9.4s, v9.4s, #24
+ add v2.4s, v16.4s, v2.4s
+ eor v4.16b, v6.16b, v4.16b
+ orr v9.16b, v9.16b, v11.16b
+ add v1.4s, v31.4s, v1.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v11.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ add v5.4s, v9.4s, v5.4s
+ eor v10.16b, v1.16b, v10.16b
+ orr v4.16b, v4.16b, v11.16b
+ ushr v11.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v3.16b, v3.16b, v11.16b
+ ushr v11.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ orr v10.16b, v10.16b, v11.16b
+ ushr v11.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ orr v8.16b, v8.16b, v11.16b
+ add v29.4s, v29.4s, v8.4s
+ eor v16.16b, v29.16b, v16.16b
+ add v0.4s, v0.4s, v4.4s
+ mov v12.16b, v26.16b
+ add v17.4s, v17.4s, v19.4s
+ add v26.4s, v29.4s, v23.4s
+ eor v29.16b, v0.16b, v31.16b
+ add v20.4s, v20.4s, v3.4s
+ rev32 v16.8h, v16.8h
+ stur q18, [x29, #-176]
+ mov v18.16b, v27.16b
+ add v0.4s, v0.4s, v24.4s
+ eor v27.16b, v20.16b, v9.16b
+ add v17.4s, v17.4s, v10.4s
+ rev32 v24.8h, v29.8h
+ add v1.4s, v1.4s, v16.4s
+ add v20.4s, v20.4s, v25.4s
+ eor v25.16b, v17.16b, v28.16b
+ rev32 v27.8h, v27.8h
+ add v5.4s, v5.4s, v24.4s
+ eor v28.16b, v1.16b, v8.16b
+ rev32 v25.8h, v25.8h
+ add v6.4s, v6.4s, v27.4s
+ eor v4.16b, v5.16b, v4.16b
+ ushr v31.4s, v28.4s, #12
+ shl v28.4s, v28.4s, #20
+ add v2.4s, v2.4s, v25.4s
+ eor v3.16b, v6.16b, v3.16b
+ orr v28.16b, v28.16b, v31.16b
+ ushr v31.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ eor v29.16b, v2.16b, v10.16b
+ orr v4.16b, v4.16b, v31.16b
+ ushr v31.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v26.4s, v26.4s, v28.4s
+ orr v3.16b, v3.16b, v31.16b
+ ushr v31.4s, v29.4s, #12
+ shl v29.4s, v29.4s, #20
+ eor v16.16b, v26.16b, v16.16b
+ add v0.4s, v0.4s, v4.4s
+ add v17.4s, v17.4s, v12.4s
+ orr v29.16b, v29.16b, v31.16b
+ eor v24.16b, v0.16b, v24.16b
+ add v0.4s, v0.4s, v22.4s
+ add v20.4s, v20.4s, v3.4s
+ ushr v22.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ add v23.4s, v26.4s, v21.4s
+ eor v21.16b, v20.16b, v27.16b
+ add v17.4s, v17.4s, v29.4s
+ orr v16.16b, v16.16b, v22.16b
+ ushr v22.4s, v24.4s, #8
+ shl v24.4s, v24.4s, #24
+ eor v25.16b, v17.16b, v25.16b
+ orr v22.16b, v24.16b, v22.16b
+ ushr v24.4s, v21.4s, #8
+ shl v21.4s, v21.4s, #24
+ orr v21.16b, v21.16b, v24.16b
+ ushr v24.4s, v25.4s, #8
+ shl v25.4s, v25.4s, #24
+ add v1.4s, v16.4s, v1.4s
+ orr v24.16b, v25.16b, v24.16b
+ add v5.4s, v22.4s, v5.4s
+ eor v25.16b, v1.16b, v28.16b
+ add v6.4s, v21.4s, v6.4s
+ eor v4.16b, v5.16b, v4.16b
+ ushr v27.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ add v2.4s, v24.4s, v2.4s
+ eor v3.16b, v6.16b, v3.16b
+ orr v25.16b, v25.16b, v27.16b
+ ushr v27.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ ldur q19, [x29, #-176]
+ eor v26.16b, v2.16b, v29.16b
+ orr v4.16b, v4.16b, v27.16b
+ ushr v27.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ orr v3.16b, v3.16b, v27.16b
+ ushr v27.4s, v26.4s, #7
+ shl v26.4s, v26.4s, #25
+ add v20.4s, v20.4s, v18.4s
+ add v17.4s, v17.4s, v30.4s
+ orr v26.16b, v26.16b, v27.16b
+ add v0.4s, v0.4s, v3.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v0.4s, v0.4s, v19.4s
+ add v19.4s, v20.4s, v26.4s
+ add v17.4s, v17.4s, v25.4s
+ eor v20.16b, v19.16b, v22.16b
+ add v7.4s, v19.4s, v7.4s
+ eor v19.16b, v17.16b, v21.16b
+ ldur q21, [x29, #-64]
+ add v23.4s, v23.4s, v4.4s
+ eor v24.16b, v23.16b, v24.16b
+ rev32 v16.8h, v16.8h
+ add v17.4s, v17.4s, v21.4s
+ rev32 v21.8h, v24.8h
+ add v6.4s, v6.4s, v21.4s
+ rev32 v20.8h, v20.8h
+ add v2.4s, v2.4s, v16.4s
+ eor v4.16b, v6.16b, v4.16b
+ rev32 v19.8h, v19.8h
+ add v1.4s, v1.4s, v20.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v24.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ add v5.4s, v5.4s, v19.4s
+ eor v22.16b, v1.16b, v26.16b
+ orr v4.16b, v4.16b, v24.16b
+ ushr v24.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v18.4s, v23.4s, v14.4s
+ eor v23.16b, v5.16b, v25.16b
+ orr v3.16b, v3.16b, v24.16b
+ ushr v24.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ orr v22.16b, v22.16b, v24.16b
+ ushr v24.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ orr v23.16b, v23.16b, v24.16b
+ add v18.4s, v18.4s, v4.4s
+ add v0.4s, v0.4s, v3.4s
+ add v24.4s, v17.4s, v23.4s
+ eor v17.16b, v18.16b, v21.16b
+ add v7.4s, v7.4s, v22.4s
+ eor v16.16b, v0.16b, v16.16b
+ ushr v21.4s, v17.4s, #8
+ shl v17.4s, v17.4s, #24
+ eor v20.16b, v7.16b, v20.16b
+ orr v21.16b, v17.16b, v21.16b
+ ushr v17.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v19.16b, v24.16b, v19.16b
+ orr v16.16b, v16.16b, v17.16b
+ ushr v17.4s, v20.4s, #8
+ shl v20.4s, v20.4s, #24
+ orr v25.16b, v20.16b, v17.16b
+ ushr v17.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v17.16b
+ add v1.4s, v25.4s, v1.4s
+ eor v22.16b, v1.16b, v22.16b
+ eor v20.16b, v1.16b, v18.16b
+ add v1.4s, v19.4s, v5.4s
+ eor v26.16b, v1.16b, v0.16b
+ add v0.4s, v21.4s, v6.4s
+ eor v5.16b, v1.16b, v23.16b
+ eor v1.16b, v0.16b, v4.16b
+ eor v17.16b, v0.16b, v7.16b
+ add v0.4s, v16.4s, v2.4s
+ eor v2.16b, v0.16b, v3.16b
+ eor v6.16b, v0.16b, v24.16b
+ ushr v0.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v0.16b, v1.16b, v0.16b
+ ushr v1.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v2.16b, v1.16b
+ ushr v2.4s, v22.4s, #7
+ shl v3.4s, v22.4s, #25
+ orr v2.16b, v3.16b, v2.16b
+ ushr v3.4s, v5.4s, #7
+ shl v4.4s, v5.4s, #25
+ orr v3.16b, v4.16b, v3.16b
+ eor v8.16b, v16.16b, v3.16b
+ eor v9.16b, v25.16b, v0.16b
+ eor v31.16b, v1.16b, v19.16b
+ cmp x17, x22
+ eor v15.16b, v2.16b, v21.16b
+ mov w18, w19
+ b.ne .LBB2_4
+.LBB2_7:
+ zip1 v0.4s, v20.4s, v26.4s
+ zip2 v1.4s, v20.4s, v26.4s
+ zip1 v2.4s, v17.4s, v6.4s
+ zip2 v3.4s, v17.4s, v6.4s
+ zip1 v4.4s, v8.4s, v9.4s
+ zip2 v5.4s, v8.4s, v9.4s
+ zip1 v6.4s, v31.4s, v15.4s
+ zip2 v7.4s, v31.4s, v15.4s
+ add x13, x20, #4
+ tst w5, #0x1
+ sub x28, x28, #4
+ zip1 v16.2d, v0.2d, v2.2d
+ zip2 v0.2d, v0.2d, v2.2d
+ zip1 v2.2d, v1.2d, v3.2d
+ zip2 v1.2d, v1.2d, v3.2d
+ zip1 v3.2d, v4.2d, v6.2d
+ zip2 v4.2d, v4.2d, v6.2d
+ zip1 v6.2d, v5.2d, v7.2d
+ zip2 v5.2d, v5.2d, v7.2d
+ add x24, x24, #32
+ csel x20, x13, x20, ne
+ cmp x28, #3
+ stp q16, q3, [x26]
+ stp q0, q4, [x26, #32]
+ stp q2, q6, [x26, #64]
+ stp q1, q5, [x26, #96]
+ add x26, x26, #128
+ b.hi .LBB2_2
+.LBB2_8:
+ cbz x28, .LBB2_16
+ orr w8, w7, w19
+ and x21, x5, #0x1
+ stur w8, [x29, #-64]
+.LBB2_10:
+ ldr x8, [sp, #40]
+ ldr x25, [x24]
+ ldur w4, [x29, #-64]
+ ldp q1, q0, [x8]
+ mov x8, x22
+ stp q1, q0, [x29, #-48]
+.LBB2_11:
+ subs x23, x8, #1
+ b.eq .LBB2_13
+ cbnz x8, .LBB2_14
+ b .LBB2_15
+.LBB2_13:
+ orr w4, w4, w27
+.LBB2_14:
+ sub x0, x29, #48
+ mov w2, #64
+ mov x1, x25
+ mov x3, x20
+ bl zfs_blake3_compress_in_place_sse2
+ add x25, x25, #64
+ mov x8, x23
+ mov w4, w19
+ b .LBB2_11
+.LBB2_15:
+ ldp q0, q1, [x29, #-48]
+ add x20, x20, x21
+ add x24, x24, #8
+ subs x28, x28, #1
+ stp q0, q1, [x26], #32
+ b.ne .LBB2_10
+.LBB2_16:
+ add sp, sp, #384
+ ldp x20, x19, [sp, #144]
+ ldp x22, x21, [sp, #128]
+ ldp x24, x23, [sp, #112]
+ ldp x26, x25, [sp, #96]
+ ldp x28, x27, [sp, #80]
+ ldp x29, x30, [sp, #64]
+ ldp d9, d8, [sp, #48]
+ ldp d11, d10, [sp, #32]
+ ldp d13, d12, [sp, #16]
+ ldp d15, d14, [sp], #160
+ ret
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
new file mode 100644
index 000000000000..eb6946400b8a
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
@@ -0,0 +1,2463 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ *
+ * This is converted assembly: SSE4.1 -> ARMv8-A
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if defined(__aarch64__)
+	.text
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4
+.LCPI0_0:                               // tbl mask: rotate each 32-bit word right by 16 bits
+	.byte	2
+	.byte	3
+	.byte	0
+	.byte	1
+	.byte	6
+	.byte	7
+	.byte	4
+	.byte	5
+	.byte	10
+	.byte	11
+	.byte	8
+	.byte	9
+	.byte	14
+	.byte	15
+	.byte	12
+	.byte	13
+.LCPI0_1:                               // BLAKE3 IV[0..3]: 0x6A09E667 0xBB67AE85 0x3C6EF372 0xA54FF53A
+	.word	1779033703
+	.word	3144134277
+	.word	1013904242
+	.word	2773480762
+.LCPI0_2:                               // tbl mask: rotate each 32-bit word right by 8 bits
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	0
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	4
+	.byte	9
+	.byte	10
+	.byte	11
+	.byte	8
+	.byte	13
+	.byte	14
+	.byte	15
+	.byte	12
+.LCPI0_3:                               // tbl mask (2-reg): words 0,2 from first source, words 1,3 from second
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	20
+	.byte	21
+	.byte	22
+	.byte	23
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.byte	28
+	.byte	29
+	.byte	30
+	.byte	31
+.LCPI0_4:                               // tbl mask (2-reg): words 0-2 from first source, word 3 from second
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	4
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.byte	28
+	.byte	29
+	.byte	30
+	.byte	31
+	.text
+	.globl	zfs_blake3_compress_in_place_sse41
+	.p2align	2
+	.type	zfs_blake3_compress_in_place_sse41,@function
+zfs_blake3_compress_in_place_sse41:     // NOTE(review): args appear to be x0=32-byte cv (read+written),
+                                        // x1=64-byte block, w2=block_len, x3=64-bit counter, w4=flags
+                                        // (names per BLAKE3 compress API — TODO confirm against C prototype)
+	.cfi_startproc
+	ldp	q7, q6, [x0]                    // load the 32-byte state; stored back to [x0] at the end
+	ldp	q17, q18, [x1]                  // first 32 bytes of the message block
+	add	x12, x1, #32
+	ld2	{ v4.4s, v5.4s }, [x12]         // last 32 bytes, de-interleaved into even/odd word lanes
+	lsr	x10, x3, #32                    // split the 64-bit counter into two 32-bit halves
+	fmov	s16, w3                         // v16 = { counter_lo, counter_hi, len, flags } (4th state row)
+	adrp	x13, .LCPI0_0
+	adrp	x11, .LCPI0_1
+	and	w8, w2, #0xff                   // block length truncated to a byte
+	mov	v16.s[1], w10
+	ldr	q0, [x13, :lo12:.LCPI0_0]       // q0 = ror-16 byte-shuffle mask
+	ldr	q20, [x11, :lo12:.LCPI0_1]      // q20 = BLAKE3 IV[0..3] (3rd state row)
+	adrp	x11, .LCPI0_4
+	and	w9, w4, #0xff                   // flags truncated to a byte
+	ldr	q2, [x11, :lo12:.LCPI0_4]
+	mov	v16.s[2], w8
+	uzp1	v21.4s, v17.4s, v18.4s          // even/odd message-word split for the round schedule
+	add	v7.4s, v6.4s, v7.4s
+	adrp	x12, .LCPI0_3
+	mov	v16.s[3], w9
+	uzp2	v18.4s, v17.4s, v18.4s
+	add	v7.4s, v7.4s, v21.4s
+	ext	v17.16b, v5.16b, v5.16b, #12
+	ldr	q3, [x12, :lo12:.LCPI0_3]
+	ext	v24.16b, v4.16b, v4.16b, #12
+	eor	v16.16b, v7.16b, v16.16b
+	mov	v27.16b, v17.16b
+	uzp1	v19.4s, v21.4s, v21.4s
+	ext	v25.16b, v21.16b, v21.16b, #12
+	zip2	v28.4s, v18.4s, v17.4s
+	tbl	v29.16b, { v16.16b }, v0.16b    // rotate row right by 16 (byte shuffle)
+	mov	v27.s[1], v24.s[2]
+	zip1	v23.2d, v17.2d, v18.2d
+	ext	v19.16b, v19.16b, v21.16b, #8
+	add	v22.4s, v29.4s, v20.4s
+	ext	v26.16b, v21.16b, v25.16b, #12
+	tbl	v20.16b, { v23.16b, v24.16b }, v2.16b
+	zip1	v21.4s, v28.4s, v24.4s
+	zip1	v23.4s, v24.4s, v28.4s
+	uzp2	v19.4s, v19.4s, v18.4s
+	eor	v24.16b, v22.16b, v6.16b
+	ext	v25.16b, v20.16b, v20.16b, #12
+	ext	v6.16b, v23.16b, v21.16b, #8
+	add	v7.4s, v7.4s, v18.4s
+	ext	v18.16b, v19.16b, v19.16b, #4
+	tbl	v16.16b, { v26.16b, v27.16b }, v3.16b
+	uzp1	v21.4s, v20.4s, v25.4s
+	mov	v26.16b, v6.16b
+	ext	v23.16b, v18.16b, v18.16b, #12
+	mov	v26.s[1], v21.s[2]
+	adrp	x10, .LCPI0_2
+	ext	v25.16b, v18.16b, v23.16b, #12
+	uzp1	v23.4s, v18.4s, v18.4s
+	ldr	q1, [x10, :lo12:.LCPI0_2]       // q1 = ror-8 byte-shuffle mask
+	ext	v18.16b, v23.16b, v18.16b, #8
+	ushr	v23.4s, v24.4s, #12             // rotate right by 12, emulated as ushr+shl+orr
+	shl	v24.4s, v24.4s, #20
+	orr	v23.16b, v24.16b, v23.16b
+	add	v7.4s, v7.4s, v23.4s
+	eor	v27.16b, v29.16b, v7.16b
+	add	v4.4s, v7.4s, v4.4s
+	tbl	v7.16b, { v25.16b, v26.16b }, v3.16b
+	tbl	v26.16b, { v27.16b }, v1.16b    // rotate right by 8 (byte shuffle)
+	add	v22.4s, v22.4s, v26.4s
+	uzp2	v18.4s, v18.4s, v16.4s
+	eor	v23.16b, v23.16b, v22.16b
+	ext	v5.16b, v18.16b, v18.16b, #4
+	ushr	v27.4s, v23.4s, #7              // rotate right by 7 (ushr+shl+orr)
+	shl	v23.4s, v23.4s, #25
+	uzp1	v25.4s, v5.4s, v5.4s
+	orr	v23.16b, v23.16b, v27.16b
+	ext	v28.16b, v4.16b, v4.16b, #12    // ext #4/#8/#12 cycles row lanes to (un)diagonalize the state
+	ext	v4.16b, v25.16b, v5.16b, #8
+	ext	v25.16b, v26.16b, v26.16b, #8
+	add	v26.4s, v28.4s, v23.4s
+	eor	v25.16b, v26.16b, v25.16b
+	ext	v22.16b, v22.16b, v22.16b, #4
+	tbl	v25.16b, { v25.16b }, v0.16b
+	add	v22.4s, v22.4s, v25.4s
+	eor	v23.16b, v23.16b, v22.16b
+	add	v17.4s, v26.4s, v17.4s
+	ushr	v26.4s, v23.4s, #12
+	shl	v23.4s, v23.4s, #20
+	orr	v23.16b, v23.16b, v26.16b
+	add	v17.4s, v17.4s, v23.4s
+	eor	v25.16b, v25.16b, v17.16b
+	add	v17.4s, v17.4s, v19.4s
+	tbl	v19.16b, { v25.16b }, v1.16b
+	add	v22.4s, v22.4s, v19.4s
+	eor	v23.16b, v23.16b, v22.16b
+	ushr	v25.4s, v23.4s, #7
+	shl	v23.4s, v23.4s, #25
+	ext	v17.16b, v17.16b, v17.16b, #4
+	orr	v23.16b, v23.16b, v25.16b
+	ext	v19.16b, v19.16b, v19.16b, #8
+	add	v17.4s, v17.4s, v23.4s
+	eor	v19.16b, v17.16b, v19.16b
+	ext	v22.16b, v22.16b, v22.16b, #12
+	tbl	v19.16b, { v19.16b }, v0.16b
+	add	v22.4s, v22.4s, v19.4s
+	eor	v23.16b, v23.16b, v22.16b
+	ushr	v25.4s, v23.4s, #12
+	shl	v23.4s, v23.4s, #20
+	add	v17.4s, v17.4s, v16.4s
+	orr	v23.16b, v23.16b, v25.16b
+	add	v17.4s, v17.4s, v23.4s
+	ext	v25.16b, v17.16b, v17.16b, #12
+	eor	v17.16b, v19.16b, v17.16b
+	tbl	v17.16b, { v17.16b }, v1.16b
+	add	v19.4s, v22.4s, v17.4s
+	eor	v22.16b, v23.16b, v19.16b
+	add	v25.4s, v25.4s, v21.4s
+	zip1	v20.2d, v6.2d, v16.2d           // zip/uzp/ext/tbl cluster = BLAKE3 message-word permutation
+	ushr	v23.4s, v22.4s, #7
+	shl	v22.4s, v22.4s, #25
+	zip2	v24.4s, v16.4s, v6.4s
+	tbl	v26.16b, { v20.16b, v21.16b }, v2.16b
+	orr	v22.16b, v22.16b, v23.16b
+	zip1	v16.4s, v24.4s, v21.4s
+	zip1	v20.4s, v21.4s, v24.4s
+	ext	v21.16b, v26.16b, v26.16b, #12
+	ext	v17.16b, v17.16b, v17.16b, #8
+	add	v25.4s, v25.4s, v22.4s
+	ext	v16.16b, v20.16b, v16.16b, #8
+	uzp1	v21.4s, v26.4s, v21.4s
+	eor	v26.16b, v25.16b, v17.16b
+	ext	v19.16b, v19.16b, v19.16b, #4
+	tbl	v26.16b, { v26.16b }, v0.16b
+	mov	v29.16b, v16.16b
+	add	v19.4s, v19.4s, v26.4s
+	ext	v27.16b, v5.16b, v5.16b, #12
+	mov	v29.s[1], v21.s[2]
+	eor	v22.16b, v22.16b, v19.16b
+	ext	v28.16b, v5.16b, v27.16b, #12
+	ushr	v27.4s, v22.4s, #12
+	shl	v22.4s, v22.4s, #20
+	add	v6.4s, v25.4s, v6.4s
+	orr	v22.16b, v22.16b, v27.16b
+	add	v6.4s, v6.4s, v22.4s
+	eor	v26.16b, v26.16b, v6.16b
+	add	v6.4s, v6.4s, v18.4s
+	tbl	v18.16b, { v26.16b }, v1.16b
+	add	v19.4s, v19.4s, v18.4s
+	eor	v22.16b, v22.16b, v19.16b
+	ushr	v26.4s, v22.4s, #7
+	shl	v22.4s, v22.4s, #25
+	ext	v6.16b, v6.16b, v6.16b, #4
+	orr	v22.16b, v22.16b, v26.16b
+	ext	v18.16b, v18.16b, v18.16b, #8
+	add	v6.4s, v6.4s, v22.4s
+	eor	v18.16b, v6.16b, v18.16b
+	ext	v19.16b, v19.16b, v19.16b, #12
+	tbl	v18.16b, { v18.16b }, v0.16b
+	add	v19.4s, v19.4s, v18.4s
+	eor	v22.16b, v22.16b, v19.16b
+	ushr	v26.4s, v22.4s, #12
+	shl	v22.4s, v22.4s, #20
+	add	v6.4s, v6.4s, v7.4s
+	orr	v22.16b, v22.16b, v26.16b
+	add	v6.4s, v6.4s, v22.4s
+	ext	v26.16b, v6.16b, v6.16b, #12
+	eor	v6.16b, v18.16b, v6.16b
+	uzp2	v4.4s, v4.4s, v7.4s
+	zip2	v25.4s, v7.4s, v16.4s
+	add	v26.4s, v26.4s, v21.4s
+	zip1	v20.2d, v16.2d, v7.2d
+	tbl	v6.16b, { v6.16b }, v1.16b
+	ext	v24.16b, v4.16b, v4.16b, #4
+	tbl	v27.16b, { v20.16b, v21.16b }, v2.16b
+	zip1	v7.4s, v25.4s, v21.4s
+	zip1	v20.4s, v21.4s, v25.4s
+	add	v18.4s, v19.4s, v6.4s
+	uzp1	v5.4s, v24.4s, v24.4s
+	ext	v21.16b, v27.16b, v27.16b, #12
+	ext	v7.16b, v20.16b, v7.16b, #8
+	eor	v19.16b, v22.16b, v18.16b
+	ext	v5.16b, v5.16b, v24.16b, #8
+	tbl	v17.16b, { v28.16b, v29.16b }, v3.16b
+	uzp1	v21.4s, v27.4s, v21.4s
+	mov	v28.16b, v7.16b
+	ushr	v22.4s, v19.4s, #7
+	shl	v19.4s, v19.4s, #25
+	ext	v23.16b, v24.16b, v24.16b, #12
+	uzp2	v5.4s, v5.4s, v17.4s
+	mov	v28.s[1], v21.s[2]
+	orr	v19.16b, v19.16b, v22.16b
+	ext	v27.16b, v24.16b, v23.16b, #12
+	ext	v23.16b, v5.16b, v5.16b, #4
+	ext	v6.16b, v6.16b, v6.16b, #8
+	ext	v25.16b, v18.16b, v18.16b, #4
+	add	v18.4s, v26.4s, v19.4s
+	uzp1	v24.4s, v23.4s, v23.4s
+	eor	v6.16b, v18.16b, v6.16b
+	ext	v24.16b, v24.16b, v23.16b, #8
+	add	v16.4s, v18.4s, v16.4s
+	tbl	v18.16b, { v27.16b, v28.16b }, v3.16b
+	tbl	v27.16b, { v6.16b }, v0.16b
+	uzp2	v6.4s, v24.4s, v18.4s
+	add	v24.4s, v25.4s, v27.4s
+	eor	v19.16b, v19.16b, v24.16b
+	ushr	v25.4s, v19.4s, #12
+	shl	v19.4s, v19.4s, #20
+	orr	v19.16b, v19.16b, v25.16b
+	add	v16.4s, v16.4s, v19.4s
+	eor	v25.16b, v27.16b, v16.16b
+	add	v4.4s, v16.4s, v4.4s
+	tbl	v16.16b, { v25.16b }, v1.16b
+	add	v24.4s, v24.4s, v16.4s
+	eor	v19.16b, v19.16b, v24.16b
+	ushr	v25.4s, v19.4s, #7
+	shl	v19.4s, v19.4s, #25
+	ext	v4.16b, v4.16b, v4.16b, #4
+	orr	v19.16b, v19.16b, v25.16b
+	ext	v16.16b, v16.16b, v16.16b, #8
+	add	v4.4s, v4.4s, v19.4s
+	eor	v16.16b, v4.16b, v16.16b
+	ext	v24.16b, v24.16b, v24.16b, #12
+	tbl	v25.16b, { v16.16b }, v0.16b
+	add	v24.4s, v24.4s, v25.4s
+	eor	v16.16b, v19.16b, v24.16b
+	ushr	v19.4s, v16.4s, #12
+	shl	v16.4s, v16.4s, #20
+	add	v4.4s, v4.4s, v17.4s
+	orr	v19.16b, v16.16b, v19.16b
+	add	v27.4s, v4.4s, v19.4s
+	eor	v25.16b, v25.16b, v27.16b
+	tbl	v25.16b, { v25.16b }, v1.16b
+	add	v24.4s, v24.4s, v25.4s
+	zip2	v26.4s, v17.4s, v7.4s
+	ext	v4.16b, v27.16b, v27.16b, #12
+	eor	v19.16b, v19.16b, v24.16b
+	add	v28.4s, v4.4s, v21.4s
+	zip1	v20.2d, v7.2d, v17.2d
+	zip1	v4.4s, v26.4s, v21.4s
+	zip1	v17.4s, v21.4s, v26.4s
+	ushr	v26.4s, v19.4s, #7
+	shl	v19.4s, v19.4s, #25
+	orr	v19.16b, v19.16b, v26.16b
+	ext	v25.16b, v25.16b, v25.16b, #8
+	add	v27.4s, v28.4s, v19.4s
+	eor	v25.16b, v27.16b, v25.16b
+	ext	v24.16b, v24.16b, v24.16b, #4
+	tbl	v25.16b, { v25.16b }, v0.16b
+	add	v24.4s, v24.4s, v25.4s
+	eor	v19.16b, v19.16b, v24.16b
+	add	v7.4s, v27.4s, v7.4s
+	ushr	v27.4s, v19.4s, #12
+	shl	v19.4s, v19.4s, #20
+	orr	v19.16b, v19.16b, v27.16b
+	add	v7.4s, v7.4s, v19.4s
+	eor	v25.16b, v25.16b, v7.16b
+	add	v5.4s, v7.4s, v5.4s
+	tbl	v7.16b, { v25.16b }, v1.16b
+	add	v24.4s, v24.4s, v7.4s
+	eor	v19.16b, v19.16b, v24.16b
+	ushr	v25.4s, v19.4s, #7
+	shl	v19.4s, v19.4s, #25
+	ext	v5.16b, v5.16b, v5.16b, #4
+	orr	v19.16b, v19.16b, v25.16b
+	ext	v7.16b, v7.16b, v7.16b, #8
+	add	v5.4s, v5.4s, v19.4s
+	eor	v7.16b, v5.16b, v7.16b
+	ext	v24.16b, v24.16b, v24.16b, #12
+	tbl	v7.16b, { v7.16b }, v0.16b
+	add	v24.4s, v24.4s, v7.4s
+	eor	v19.16b, v19.16b, v24.16b
+	ushr	v25.4s, v19.4s, #12
+	shl	v19.4s, v19.4s, #20
+	tbl	v16.16b, { v20.16b, v21.16b }, v2.16b
+	add	v5.4s, v5.4s, v18.4s
+	orr	v19.16b, v19.16b, v25.16b
+	ext	v20.16b, v16.16b, v16.16b, #12
+	ext	v4.16b, v17.16b, v4.16b, #8
+	add	v5.4s, v5.4s, v19.4s
+	uzp1	v21.4s, v16.4s, v20.4s
+	mov	v17.16b, v4.16b
+	ext	v25.16b, v5.16b, v5.16b, #12
+	mov	v17.s[1], v21.s[2]
+	add	v25.4s, v25.4s, v21.4s
+	zip1	v20.2d, v4.2d, v18.2d
+	ext	v22.16b, v23.16b, v23.16b, #12
+	zip2	v26.4s, v18.4s, v4.4s
+	tbl	v18.16b, { v20.16b, v21.16b }, v2.16b
+	eor	v5.16b, v7.16b, v5.16b
+	ext	v16.16b, v23.16b, v22.16b, #12
+	ext	v22.16b, v6.16b, v6.16b, #4
+	zip1	v27.4s, v26.4s, v21.4s
+	zip1	v20.4s, v21.4s, v26.4s
+	ext	v21.16b, v18.16b, v18.16b, #12
+	tbl	v5.16b, { v5.16b }, v1.16b
+	ext	v20.16b, v20.16b, v27.16b, #8
+	uzp1	v27.4s, v18.4s, v21.4s
+	uzp1	v18.4s, v22.4s, v22.4s
+	add	v21.4s, v24.4s, v5.4s
+	ext	v18.16b, v18.16b, v22.16b, #8
+	eor	v19.16b, v19.16b, v21.16b
+	tbl	v7.16b, { v16.16b, v17.16b }, v3.16b
+	uzp2	v18.4s, v18.4s, v17.4s
+	zip2	v16.4s, v16.4s, v20.4s
+	ushr	v17.4s, v19.4s, #7
+	shl	v19.4s, v19.4s, #25
+	orr	v17.16b, v19.16b, v17.16b
+	ext	v5.16b, v5.16b, v5.16b, #8
+	add	v19.4s, v25.4s, v17.4s
+	eor	v5.16b, v19.16b, v5.16b
+	ext	v21.16b, v21.16b, v21.16b, #4
+	tbl	v5.16b, { v5.16b }, v0.16b
+	add	v4.4s, v19.4s, v4.4s
+	add	v19.4s, v21.4s, v5.4s
+	eor	v17.16b, v17.16b, v19.16b
+	ushr	v21.4s, v17.4s, #12
+	shl	v17.4s, v17.4s, #20
+	orr	v17.16b, v17.16b, v21.16b
+	add	v4.4s, v4.4s, v17.4s
+	eor	v5.16b, v5.16b, v4.16b
+	tbl	v5.16b, { v5.16b }, v1.16b
+	add	v4.4s, v4.4s, v6.4s
+	add	v6.4s, v19.4s, v5.4s
+	eor	v17.16b, v17.16b, v6.16b
+	ushr	v19.4s, v17.4s, #7
+	shl	v17.4s, v17.4s, #25
+	ext	v4.16b, v4.16b, v4.16b, #4
+	orr	v17.16b, v17.16b, v19.16b
+	ext	v5.16b, v5.16b, v5.16b, #8
+	add	v4.4s, v4.4s, v17.4s
+	eor	v5.16b, v4.16b, v5.16b
+	ext	v6.16b, v6.16b, v6.16b, #12
+	tbl	v5.16b, { v5.16b }, v0.16b
+	add	v6.4s, v6.4s, v5.4s
+	eor	v17.16b, v17.16b, v6.16b
+	ushr	v19.4s, v17.4s, #12
+	shl	v17.4s, v17.4s, #20
+	add	v4.4s, v4.4s, v7.4s
+	orr	v17.16b, v17.16b, v19.16b
+	add	v4.4s, v4.4s, v17.4s
+	eor	v5.16b, v5.16b, v4.16b
+	tbl	v5.16b, { v5.16b }, v1.16b
+	mov	v29.16b, v20.16b
+	ext	v4.16b, v4.16b, v4.16b, #12
+	add	v6.4s, v6.4s, v5.4s
+	mov	v29.s[1], v27.s[2]
+	add	v4.4s, v4.4s, v27.4s
+	zip1	v26.2d, v20.2d, v7.2d
+	zip1	v7.4s, v16.4s, v27.4s
+	zip1	v16.4s, v27.4s, v16.4s
+	eor	v17.16b, v17.16b, v6.16b
+	ext	v7.16b, v16.16b, v7.16b, #8
+	ushr	v16.4s, v17.4s, #7
+	shl	v17.4s, v17.4s, #25
+	orr	v16.16b, v17.16b, v16.16b
+	ext	v5.16b, v5.16b, v5.16b, #8
+	add	v4.4s, v4.4s, v16.4s
+	eor	v5.16b, v4.16b, v5.16b
+	ext	v6.16b, v6.16b, v6.16b, #4
+	tbl	v5.16b, { v5.16b }, v0.16b
+	add	v6.4s, v6.4s, v5.4s
+	eor	v16.16b, v16.16b, v6.16b
+	ushr	v17.4s, v16.4s, #12
+	shl	v16.4s, v16.4s, #20
+	add	v4.4s, v4.4s, v20.4s
+	orr	v16.16b, v16.16b, v17.16b
+	add	v4.4s, v4.4s, v16.4s
+	eor	v5.16b, v5.16b, v4.16b
+	tbl	v5.16b, { v5.16b }, v1.16b
+	add	v6.4s, v6.4s, v5.4s
+	eor	v16.16b, v16.16b, v6.16b
+	add	v4.4s, v4.4s, v18.4s
+	ushr	v17.4s, v16.4s, #7
+	shl	v16.4s, v16.4s, #25
+	ext	v23.16b, v22.16b, v22.16b, #12
+	ext	v4.16b, v4.16b, v4.16b, #4
+	orr	v16.16b, v16.16b, v17.16b
+	ext	v28.16b, v22.16b, v23.16b, #12
+	ext	v5.16b, v5.16b, v5.16b, #8
+	add	v4.4s, v16.4s, v4.4s
+	tbl	v3.16b, { v28.16b, v29.16b }, v3.16b
+	eor	v5.16b, v4.16b, v5.16b
+	ext	v6.16b, v6.16b, v6.16b, #12
+	add	v3.4s, v4.4s, v3.4s
+	tbl	v4.16b, { v5.16b }, v0.16b
+	add	v5.4s, v6.4s, v4.4s
+	eor	v6.16b, v16.16b, v5.16b
+	ushr	v16.4s, v6.4s, #12
+	shl	v6.4s, v6.4s, #20
+	orr	v6.16b, v6.16b, v16.16b
+	tbl	v2.16b, { v26.16b, v27.16b }, v2.16b
+	add	v3.4s, v3.4s, v6.4s
+	ext	v19.16b, v2.16b, v2.16b, #12
+	eor	v4.16b, v4.16b, v3.16b
+	uzp1	v2.4s, v2.4s, v19.4s
+	ext	v3.16b, v3.16b, v3.16b, #12
+	tbl	v4.16b, { v4.16b }, v1.16b
+	add	v2.4s, v3.4s, v2.4s
+	add	v3.4s, v5.4s, v4.4s
+	eor	v5.16b, v6.16b, v3.16b
+	ushr	v6.4s, v5.4s, #7
+	shl	v5.4s, v5.4s, #25
+	orr	v5.16b, v5.16b, v6.16b
+	ext	v4.16b, v4.16b, v4.16b, #8
+	add	v2.4s, v2.4s, v5.4s
+	eor	v4.16b, v2.16b, v4.16b
+	ext	v3.16b, v3.16b, v3.16b, #4
+	tbl	v0.16b, { v4.16b }, v0.16b
+	add	v3.4s, v3.4s, v0.4s
+	eor	v4.16b, v5.16b, v3.16b
+	ushr	v5.4s, v4.4s, #12
+	shl	v4.4s, v4.4s, #20
+	add	v2.4s, v2.4s, v7.4s
+	orr	v4.16b, v4.16b, v5.16b
+	add	v2.4s, v2.4s, v4.4s
+	eor	v0.16b, v0.16b, v2.16b
+	tbl	v0.16b, { v0.16b }, v1.16b
+	add	v1.4s, v3.4s, v0.4s
+	eor	v3.16b, v4.16b, v1.16b
+	ext	v2.16b, v2.16b, v2.16b, #4
+	ext	v1.16b, v1.16b, v1.16b, #12
+	ushr	v4.4s, v3.4s, #7
+	shl	v3.4s, v3.4s, #25
+	ext	v0.16b, v0.16b, v0.16b, #8
+	eor	v1.16b, v2.16b, v1.16b          // final feed-forward: new cv = top half XOR bottom half
+	orr	v2.16b, v3.16b, v4.16b
+	eor	v0.16b, v2.16b, v0.16b
+	stp	q1, q0, [x0]                    // write the updated 32-byte chaining value back in place
+	ret
+.Lfunc_end0:
+	.size	zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41
+	.cfi_endproc
+
+	.section	.rodata.cst16,"aM",@progbits,16
+	.p2align	4
+.LCPI1_0:                               // tbl mask: rotate each 32-bit word right by 16 bits (same table as .LCPI0_0)
+	.byte	2
+	.byte	3
+	.byte	0
+	.byte	1
+	.byte	6
+	.byte	7
+	.byte	4
+	.byte	5
+	.byte	10
+	.byte	11
+	.byte	8
+	.byte	9
+	.byte	14
+	.byte	15
+	.byte	12
+	.byte	13
+.LCPI1_1:                               // BLAKE3 IV[0..3]: 0x6A09E667 0xBB67AE85 0x3C6EF372 0xA54FF53A
+	.word	1779033703
+	.word	3144134277
+	.word	1013904242
+	.word	2773480762
+.LCPI1_2:                               // tbl mask: rotate each 32-bit word right by 8 bits
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	0
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	4
+	.byte	9
+	.byte	10
+	.byte	11
+	.byte	8
+	.byte	13
+	.byte	14
+	.byte	15
+	.byte	12
+.LCPI1_3:                               // tbl mask (2-reg): words 0,2 from first source, words 1,3 from second
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	20
+	.byte	21
+	.byte	22
+	.byte	23
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.byte	28
+	.byte	29
+	.byte	30
+	.byte	31
+.LCPI1_4:                               // tbl mask (2-reg): words 0-2 from first source, word 3 from second
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	4
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.byte	28
+	.byte	29
+	.byte	30
+	.byte	31
+	.text
+ .globl zfs_blake3_compress_xof_sse41
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse41,@function
+zfs_blake3_compress_xof_sse41:
+ .cfi_startproc
+ ldp q7, q6, [x0]
+ ldp q17, q18, [x1]
+ add x12, x1, #32
+ ld2 { v4.4s, v5.4s }, [x12]
+ lsr x10, x3, #32
+ fmov s16, w3
+ adrp x13, .LCPI1_0
+ adrp x11, .LCPI1_1
+ and w8, w2, #0xff
+ mov v16.s[1], w10
+ ldr q0, [x13, :lo12:.LCPI1_0]
+ ldr q20, [x11, :lo12:.LCPI1_1]
+ adrp x11, .LCPI1_4
+ and w9, w4, #0xff
+ ldr q2, [x11, :lo12:.LCPI1_4]
+ mov v16.s[2], w8
+ uzp1 v21.4s, v17.4s, v18.4s
+ add v7.4s, v6.4s, v7.4s
+ adrp x12, .LCPI1_3
+ mov v16.s[3], w9
+ uzp2 v18.4s, v17.4s, v18.4s
+ add v7.4s, v7.4s, v21.4s
+ ext v17.16b, v5.16b, v5.16b, #12
+ ldr q3, [x12, :lo12:.LCPI1_3]
+ ext v24.16b, v4.16b, v4.16b, #12
+ eor v16.16b, v7.16b, v16.16b
+ mov v27.16b, v17.16b
+ uzp1 v19.4s, v21.4s, v21.4s
+ ext v25.16b, v21.16b, v21.16b, #12
+ zip2 v28.4s, v18.4s, v17.4s
+ tbl v29.16b, { v16.16b }, v0.16b
+ mov v27.s[1], v24.s[2]
+ zip1 v23.2d, v17.2d, v18.2d
+ ext v19.16b, v19.16b, v21.16b, #8
+ add v22.4s, v29.4s, v20.4s
+ ext v26.16b, v21.16b, v25.16b, #12
+ tbl v20.16b, { v23.16b, v24.16b }, v2.16b
+ zip1 v21.4s, v28.4s, v24.4s
+ zip1 v23.4s, v24.4s, v28.4s
+ uzp2 v19.4s, v19.4s, v18.4s
+ eor v24.16b, v22.16b, v6.16b
+ ext v25.16b, v20.16b, v20.16b, #12
+ ext v6.16b, v23.16b, v21.16b, #8
+ add v7.4s, v7.4s, v18.4s
+ ext v18.16b, v19.16b, v19.16b, #4
+ tbl v16.16b, { v26.16b, v27.16b }, v3.16b
+ uzp1 v21.4s, v20.4s, v25.4s
+ mov v26.16b, v6.16b
+ ext v23.16b, v18.16b, v18.16b, #12
+ mov v26.s[1], v21.s[2]
+ adrp x10, .LCPI1_2
+ ext v25.16b, v18.16b, v23.16b, #12
+ uzp1 v23.4s, v18.4s, v18.4s
+ ldr q1, [x10, :lo12:.LCPI1_2]
+ ext v18.16b, v23.16b, v18.16b, #8
+ ushr v23.4s, v24.4s, #12
+ shl v24.4s, v24.4s, #20
+ orr v23.16b, v24.16b, v23.16b
+ add v7.4s, v7.4s, v23.4s
+ eor v27.16b, v29.16b, v7.16b
+ add v4.4s, v7.4s, v4.4s
+ tbl v7.16b, { v25.16b, v26.16b }, v3.16b
+ tbl v26.16b, { v27.16b }, v1.16b
+ add v22.4s, v22.4s, v26.4s
+ uzp2 v18.4s, v18.4s, v16.4s
+ eor v23.16b, v23.16b, v22.16b
+ ext v5.16b, v18.16b, v18.16b, #4
+ ushr v27.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ uzp1 v25.4s, v5.4s, v5.4s
+ orr v23.16b, v23.16b, v27.16b
+ ext v28.16b, v4.16b, v4.16b, #12
+ ext v4.16b, v25.16b, v5.16b, #8
+ ext v25.16b, v26.16b, v26.16b, #8
+ add v26.4s, v28.4s, v23.4s
+ eor v25.16b, v26.16b, v25.16b
+ ext v22.16b, v22.16b, v22.16b, #4
+ tbl v25.16b, { v25.16b }, v0.16b
+ add v22.4s, v22.4s, v25.4s
+ eor v23.16b, v23.16b, v22.16b
+ add v17.4s, v26.4s, v17.4s
+ ushr v26.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ orr v23.16b, v23.16b, v26.16b
+ add v17.4s, v17.4s, v23.4s
+ eor v25.16b, v25.16b, v17.16b
+ add v17.4s, v17.4s, v19.4s
+ tbl v19.16b, { v25.16b }, v1.16b
+ add v22.4s, v22.4s, v19.4s
+ eor v23.16b, v23.16b, v22.16b
+ ushr v25.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ ext v17.16b, v17.16b, v17.16b, #4
+ orr v23.16b, v23.16b, v25.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v17.4s, v17.4s, v23.4s
+ eor v19.16b, v17.16b, v19.16b
+ ext v22.16b, v22.16b, v22.16b, #12
+ tbl v19.16b, { v19.16b }, v0.16b
+ add v22.4s, v22.4s, v19.4s
+ eor v23.16b, v23.16b, v22.16b
+ ushr v25.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ add v17.4s, v17.4s, v16.4s
+ orr v23.16b, v23.16b, v25.16b
+ add v17.4s, v17.4s, v23.4s
+ ext v25.16b, v17.16b, v17.16b, #12
+ eor v17.16b, v19.16b, v17.16b
+ tbl v17.16b, { v17.16b }, v1.16b
+ add v19.4s, v22.4s, v17.4s
+ eor v22.16b, v23.16b, v19.16b
+ add v25.4s, v25.4s, v21.4s
+ zip1 v20.2d, v6.2d, v16.2d
+ ushr v23.4s, v22.4s, #7
+ shl v22.4s, v22.4s, #25
+ zip2 v24.4s, v16.4s, v6.4s
+ tbl v26.16b, { v20.16b, v21.16b }, v2.16b
+ orr v22.16b, v22.16b, v23.16b
+ zip1 v16.4s, v24.4s, v21.4s
+ zip1 v20.4s, v21.4s, v24.4s
+ ext v21.16b, v26.16b, v26.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #8
+ add v25.4s, v25.4s, v22.4s
+ ext v16.16b, v20.16b, v16.16b, #8
+ uzp1 v21.4s, v26.4s, v21.4s
+ eor v26.16b, v25.16b, v17.16b
+ ext v19.16b, v19.16b, v19.16b, #4
+ tbl v26.16b, { v26.16b }, v0.16b
+ mov v29.16b, v16.16b
+ add v19.4s, v19.4s, v26.4s
+ ext v27.16b, v5.16b, v5.16b, #12
+ mov v29.s[1], v21.s[2]
+ eor v22.16b, v22.16b, v19.16b
+ ext v28.16b, v5.16b, v27.16b, #12
+ ushr v27.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ add v6.4s, v25.4s, v6.4s
+ orr v22.16b, v22.16b, v27.16b
+ add v6.4s, v6.4s, v22.4s
+ eor v26.16b, v26.16b, v6.16b
+ add v6.4s, v6.4s, v18.4s
+ tbl v18.16b, { v26.16b }, v1.16b
+ add v19.4s, v19.4s, v18.4s
+ eor v22.16b, v22.16b, v19.16b
+ ushr v26.4s, v22.4s, #7
+ shl v22.4s, v22.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v22.16b, v22.16b, v26.16b
+ ext v18.16b, v18.16b, v18.16b, #8
+ add v6.4s, v6.4s, v22.4s
+ eor v18.16b, v6.16b, v18.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ tbl v18.16b, { v18.16b }, v0.16b
+ add v19.4s, v19.4s, v18.4s
+ eor v22.16b, v22.16b, v19.16b
+ ushr v26.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ add v6.4s, v6.4s, v7.4s
+ orr v22.16b, v22.16b, v26.16b
+ add v6.4s, v6.4s, v22.4s
+ ext v26.16b, v6.16b, v6.16b, #12
+ eor v6.16b, v18.16b, v6.16b
+ uzp2 v4.4s, v4.4s, v7.4s
+ zip2 v25.4s, v7.4s, v16.4s
+ add v26.4s, v26.4s, v21.4s
+ zip1 v20.2d, v16.2d, v7.2d
+ tbl v6.16b, { v6.16b }, v1.16b
+ ext v24.16b, v4.16b, v4.16b, #4
+ tbl v27.16b, { v20.16b, v21.16b }, v2.16b
+ zip1 v7.4s, v25.4s, v21.4s
+ zip1 v20.4s, v21.4s, v25.4s
+ add v18.4s, v19.4s, v6.4s
+ uzp1 v5.4s, v24.4s, v24.4s
+ ext v21.16b, v27.16b, v27.16b, #12
+ ext v7.16b, v20.16b, v7.16b, #8
+ eor v19.16b, v22.16b, v18.16b
+ ext v5.16b, v5.16b, v24.16b, #8
+ tbl v17.16b, { v28.16b, v29.16b }, v3.16b
+ uzp1 v21.4s, v27.4s, v21.4s
+ mov v28.16b, v7.16b
+ ushr v22.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v23.16b, v24.16b, v24.16b, #12
+ uzp2 v5.4s, v5.4s, v17.4s
+ mov v28.s[1], v21.s[2]
+ orr v19.16b, v19.16b, v22.16b
+ ext v27.16b, v24.16b, v23.16b, #12
+ ext v23.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v25.16b, v18.16b, v18.16b, #4
+ add v18.4s, v26.4s, v19.4s
+ uzp1 v24.4s, v23.4s, v23.4s
+ eor v6.16b, v18.16b, v6.16b
+ ext v24.16b, v24.16b, v23.16b, #8
+ add v16.4s, v18.4s, v16.4s
+ tbl v18.16b, { v27.16b, v28.16b }, v3.16b
+ tbl v27.16b, { v6.16b }, v0.16b
+ uzp2 v6.4s, v24.4s, v18.4s
+ add v24.4s, v25.4s, v27.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v19.16b, v19.16b, v25.16b
+ add v16.4s, v16.4s, v19.4s
+ eor v25.16b, v27.16b, v16.16b
+ add v4.4s, v16.4s, v4.4s
+ tbl v16.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v16.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v19.16b, v19.16b, v25.16b
+ ext v16.16b, v16.16b, v16.16b, #8
+ add v4.4s, v4.4s, v19.4s
+ eor v16.16b, v4.16b, v16.16b
+ ext v24.16b, v24.16b, v24.16b, #12
+ tbl v25.16b, { v16.16b }, v0.16b
+ add v24.4s, v24.4s, v25.4s
+ eor v16.16b, v19.16b, v24.16b
+ ushr v19.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ add v4.4s, v4.4s, v17.4s
+ orr v19.16b, v16.16b, v19.16b
+ add v27.4s, v4.4s, v19.4s
+ eor v25.16b, v25.16b, v27.16b
+ tbl v25.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v25.4s
+ zip2 v26.4s, v17.4s, v7.4s
+ ext v4.16b, v27.16b, v27.16b, #12
+ eor v19.16b, v19.16b, v24.16b
+ add v28.4s, v4.4s, v21.4s
+ zip1 v20.2d, v7.2d, v17.2d
+ zip1 v4.4s, v26.4s, v21.4s
+ zip1 v17.4s, v21.4s, v26.4s
+ ushr v26.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v19.16b, v19.16b, v26.16b
+ ext v25.16b, v25.16b, v25.16b, #8
+ add v27.4s, v28.4s, v19.4s
+ eor v25.16b, v27.16b, v25.16b
+ ext v24.16b, v24.16b, v24.16b, #4
+ tbl v25.16b, { v25.16b }, v0.16b
+ add v24.4s, v24.4s, v25.4s
+ eor v19.16b, v19.16b, v24.16b
+ add v7.4s, v27.4s, v7.4s
+ ushr v27.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v19.16b, v19.16b, v27.16b
+ add v7.4s, v7.4s, v19.4s
+ eor v25.16b, v25.16b, v7.16b
+ add v5.4s, v7.4s, v5.4s
+ tbl v7.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v7.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v19.16b, v19.16b, v25.16b
+ ext v7.16b, v7.16b, v7.16b, #8
+ add v5.4s, v5.4s, v19.4s
+ eor v7.16b, v5.16b, v7.16b
+ ext v24.16b, v24.16b, v24.16b, #12
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v24.4s, v24.4s, v7.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ tbl v16.16b, { v20.16b, v21.16b }, v2.16b
+ add v5.4s, v5.4s, v18.4s
+ orr v19.16b, v19.16b, v25.16b
+ ext v20.16b, v16.16b, v16.16b, #12
+ ext v4.16b, v17.16b, v4.16b, #8
+ add v5.4s, v5.4s, v19.4s
+ uzp1 v21.4s, v16.4s, v20.4s
+ mov v17.16b, v4.16b
+ ext v25.16b, v5.16b, v5.16b, #12
+ mov v17.s[1], v21.s[2]
+ add v25.4s, v25.4s, v21.4s
+ zip1 v20.2d, v4.2d, v18.2d
+ ext v22.16b, v23.16b, v23.16b, #12
+ zip2 v26.4s, v18.4s, v4.4s
+ tbl v18.16b, { v20.16b, v21.16b }, v2.16b
+ eor v5.16b, v7.16b, v5.16b
+ ext v16.16b, v23.16b, v22.16b, #12
+ ext v22.16b, v6.16b, v6.16b, #4
+ zip1 v27.4s, v26.4s, v21.4s
+ zip1 v20.4s, v21.4s, v26.4s
+ ext v21.16b, v18.16b, v18.16b, #12
+ tbl v5.16b, { v5.16b }, v1.16b
+ ext v20.16b, v20.16b, v27.16b, #8
+ uzp1 v27.4s, v18.4s, v21.4s
+ uzp1 v18.4s, v22.4s, v22.4s
+ add v21.4s, v24.4s, v5.4s
+ ext v18.16b, v18.16b, v22.16b, #8
+ eor v19.16b, v19.16b, v21.16b
+ tbl v7.16b, { v16.16b, v17.16b }, v3.16b
+ uzp2 v18.4s, v18.4s, v17.4s
+ zip2 v16.4s, v16.4s, v20.4s
+ ushr v17.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v17.16b, v19.16b, v17.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v19.4s, v25.4s, v17.4s
+ eor v5.16b, v19.16b, v5.16b
+ ext v21.16b, v21.16b, v21.16b, #4
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v4.4s, v19.4s, v4.4s
+ add v19.4s, v21.4s, v5.4s
+ eor v17.16b, v17.16b, v19.16b
+ ushr v21.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ orr v17.16b, v17.16b, v21.16b
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ add v4.4s, v4.4s, v6.4s
+ add v6.4s, v19.4s, v5.4s
+ eor v17.16b, v17.16b, v6.16b
+ ushr v19.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v17.16b, v17.16b, v19.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v17.16b, v17.16b, v6.16b
+ ushr v19.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ add v4.4s, v4.4s, v7.4s
+ orr v17.16b, v17.16b, v19.16b
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ mov v29.16b, v20.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ add v6.4s, v6.4s, v5.4s
+ mov v29.s[1], v27.s[2]
+ add v4.4s, v4.4s, v27.4s
+ zip1 v26.2d, v20.2d, v7.2d
+ zip1 v7.4s, v16.4s, v27.4s
+ zip1 v16.4s, v27.4s, v16.4s
+ eor v17.16b, v17.16b, v6.16b
+ ext v7.16b, v16.16b, v7.16b, #8
+ ushr v16.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ orr v16.16b, v17.16b, v16.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v4.4s, v16.4s
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #4
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v16.16b, v16.16b, v6.16b
+ ushr v17.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ add v4.4s, v4.4s, v20.4s
+ orr v16.16b, v16.16b, v17.16b
+ add v4.4s, v4.4s, v16.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v16.16b, v16.16b, v6.16b
+ add v4.4s, v4.4s, v18.4s
+ ushr v17.4s, v16.4s, #7
+ shl v16.4s, v16.4s, #25
+ ext v23.16b, v22.16b, v22.16b, #12
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v16.16b, v16.16b, v17.16b
+ ext v28.16b, v22.16b, v23.16b, #12
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v16.4s, v4.4s
+ tbl v3.16b, { v28.16b, v29.16b }, v3.16b
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ add v3.4s, v4.4s, v3.4s
+ tbl v4.16b, { v5.16b }, v0.16b
+ add v5.4s, v6.4s, v4.4s
+ eor v6.16b, v16.16b, v5.16b
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ orr v6.16b, v6.16b, v16.16b
+ tbl v2.16b, { v26.16b, v27.16b }, v2.16b
+ add v3.4s, v3.4s, v6.4s
+ ext v19.16b, v2.16b, v2.16b, #12
+ eor v4.16b, v4.16b, v3.16b
+ uzp1 v2.4s, v2.4s, v19.4s
+ ext v3.16b, v3.16b, v3.16b, #12
+ tbl v4.16b, { v4.16b }, v1.16b
+ add v2.4s, v3.4s, v2.4s
+ add v3.4s, v5.4s, v4.4s
+ eor v5.16b, v6.16b, v3.16b
+ ushr v6.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v6.16b
+ ext v4.16b, v4.16b, v4.16b, #8
+ add v2.4s, v2.4s, v5.4s
+ eor v4.16b, v2.16b, v4.16b
+ ext v3.16b, v3.16b, v3.16b, #4
+ tbl v0.16b, { v4.16b }, v0.16b
+ add v3.4s, v3.4s, v0.4s
+ eor v4.16b, v5.16b, v3.16b
+ ushr v5.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ add v2.4s, v2.4s, v7.4s
+ orr v4.16b, v4.16b, v5.16b
+ add v2.4s, v2.4s, v4.4s
+ eor v0.16b, v0.16b, v2.16b
+ tbl v0.16b, { v0.16b }, v1.16b
+ add v1.4s, v3.4s, v0.4s
+ eor v3.16b, v4.16b, v1.16b
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ ext v0.16b, v0.16b, v0.16b, #8
+ ext v1.16b, v1.16b, v1.16b, #12
+ orr v3.16b, v3.16b, v4.16b
+ eor v2.16b, v2.16b, v1.16b
+ eor v3.16b, v3.16b, v0.16b
+ stp q2, q3, [x5]
+ ldr q2, [x0]
+ eor v1.16b, v2.16b, v1.16b
+ str q1, [x5, #32]
+ ldr q1, [x0, #16]
+ eor v0.16b, v1.16b, v0.16b
+ str q0, [x5, #48]
+ ret
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41
+ .cfi_endproc
+
+	// Constant pool (16-byte entries, mergeable) used by the
+	// zfs_blake3_hash_many_sse41 kernel that follows.
+	.section .rodata.cst16,"aM",@progbits,16
+	.p2align 4
+	// .LCPI2_0: lane-index vector {0,1,2,3}. Loaded via
+	// `ldr q0, [x11, :lo12:.LCPI2_0]`, AND-masked with the
+	// increment_counter flag and added to the base counter to form
+	// per-lane block counters for the 4-way parallel hash.
+.LCPI2_0:
+	.word 0
+	.word 1
+	.word 2
+	.word 3
+	// .LCPI2_1: TBL byte-shuffle that swaps the two 16-bit halves of
+	// each 32-bit lane — a 32-bit rotate by 16. Applied (as q16 below)
+	// after the first/third G-function additions; together with the
+	// ushr #12 / shl #20 and ushr #7 / shl #25 pairs this implements
+	// the BLAKE3 rotation set {16, 12, 8, 7}.
+.LCPI2_1:
+	.byte 2
+	.byte 3
+	.byte 0
+	.byte 1
+	.byte 6
+	.byte 7
+	.byte 4
+	.byte 5
+	.byte 10
+	.byte 11
+	.byte 8
+	.byte 9
+	.byte 14
+	.byte 15
+	.byte 12
+	.byte 13
+	// .LCPI2_2: TBL byte-shuffle rotating each 32-bit lane right by
+	// 8 bits — the BLAKE3 G-function "rot 8" step (loaded as q19 via
+	// `ldr q19, [x14, :lo12:.LCPI2_2]` in the loop below).
+.LCPI2_2:
+	.byte 1
+	.byte 2
+	.byte 3
+	.byte 0
+	.byte 5
+	.byte 6
+	.byte 7
+	.byte 4
+	.byte 9
+	.byte 10
+	.byte 11
+	.byte 8
+	.byte 13
+	.byte 14
+	.byte 15
+	.byte 12
+ .text
+ .globl zfs_blake3_hash_many_sse41
+ .p2align 2
+ .type zfs_blake3_hash_many_sse41,@function
+zfs_blake3_hash_many_sse41:
+ .cfi_startproc
+ stp d15, d14, [sp, #-160]!
+ stp d13, d12, [sp, #16]
+ stp d11, d10, [sp, #32]
+ stp d9, d8, [sp, #48]
+ stp x29, x30, [sp, #64]
+ stp x28, x27, [sp, #80]
+ stp x26, x25, [sp, #96]
+ stp x24, x23, [sp, #112]
+ stp x22, x21, [sp, #128]
+ stp x20, x19, [sp, #144]
+ mov x29, sp
+ sub sp, sp, #448
+ .cfi_def_cfa w29, 160
+ .cfi_offset w19, -8
+ .cfi_offset w20, -16
+ .cfi_offset w21, -24
+ .cfi_offset w22, -32
+ .cfi_offset w23, -40
+ .cfi_offset w24, -48
+ .cfi_offset w25, -56
+ .cfi_offset w26, -64
+ .cfi_offset w27, -72
+ .cfi_offset w28, -80
+ .cfi_offset w30, -88
+ .cfi_offset w29, -96
+ .cfi_offset b8, -104
+ .cfi_offset b9, -112
+ .cfi_offset b10, -120
+ .cfi_offset b11, -128
+ .cfi_offset b12, -136
+ .cfi_offset b13, -144
+ .cfi_offset b14, -152
+ .cfi_offset b15, -160
+ ldr x26, [x29, #168]
+ ldrb w27, [x29, #160]
+ mov w19, w6
+ mov x20, x4
+ mov x22, x2
+ mov x28, x1
+ cmp x1, #4
+ mov x24, x0
+ str x3, [sp, #40]
+ b.lo .LBB2_8
+ adrp x11, .LCPI2_0
+ ldr q0, [x11, :lo12:.LCPI2_0]
+ sbfx w13, w5, #0, #1
+ dup v1.4s, w13
+ mov w10, #58983
+ mov w11, #44677
+ mov w12, #62322
+ and v0.16b, v1.16b, v0.16b
+ mov w13, #62778
+ orr w8, w7, w19
+ adrp x9, .LCPI2_1
+ movk w10, #27145, lsl #16
+ movk w11, #47975, lsl #16
+ movk w12, #15470, lsl #16
+ movk w13, #42319, lsl #16
+ str q0, [sp, #16]
+ orr v0.4s, #128, lsl #24
+ adrp x14, .LCPI2_2
+ str q0, [sp]
+.LBB2_2:
+ ldr x2, [sp, #40]
+ mov x15, x2
+ ld1r { v7.4s }, [x15], #4
+ add x16, x2, #8
+ add x17, x2, #12
+ add x18, x2, #16
+ add x0, x2, #20
+ add x3, x2, #24
+ add x2, x2, #28
+ ld1r { v6.4s }, [x16]
+ ld1r { v17.4s }, [x17]
+ ld1r { v10.4s }, [x18]
+ ld1r { v11.4s }, [x0]
+ ld1r { v19.4s }, [x3]
+ ld1r { v18.4s }, [x15]
+ ld1r { v16.4s }, [x2]
+ cbz x22, .LBB2_7
+ ldr q1, [sp, #16]
+ dup v0.4s, w20
+ ldp x15, x16, [x24]
+ ldp x17, x18, [x24, #16]
+ add v1.4s, v0.4s, v1.4s
+ movi v0.4s, #128, lsl #24
+ str q1, [sp, #64]
+ eor v0.16b, v1.16b, v0.16b
+ ldr q1, [sp]
+ lsr x2, x20, #32
+ mov x0, xzr
+ mov w6, w8
+ cmgt v0.4s, v1.4s, v0.4s
+ dup v1.4s, w2
+ sub v0.4s, v1.4s, v0.4s
+ str q0, [sp, #48]
+.LBB2_4:
+ mov w4, #16
+ stp q16, q17, [sp, #192]
+ bfi x4, x0, #6, #58
+ ldr q1, [x15, x4]
+ ldr q3, [x16, x4]
+ ldr q2, [x17, x4]
+ ldr q4, [x18, x4]
+ mov w4, #32
+ bfi x4, x0, #6, #58
+ ldr q5, [x15, x4]
+ ldr q20, [x16, x4]
+ ldr q21, [x17, x4]
+ ldr q22, [x18, x4]
+ mov w4, #48
+ lsl x3, x0, #6
+ bfi x4, x0, #6, #58
+ add x0, x0, #1
+ ldr q0, [x15, x3]
+ ldr q23, [x16, x3]
+ ldr q16, [x17, x3]
+ ldr q17, [x18, x3]
+ cmp x0, x22
+ ldr q25, [x15, x4]
+ ldr q14, [x16, x4]
+ ldr q28, [x17, x4]
+ ldr q31, [x18, x4]
+ csel w4, w27, wzr, eq
+ orr w4, w4, w6
+ mov x2, xzr
+ and w6, w4, #0xff
+ add x3, x3, #256
+.LBB2_5:
+ ldr x4, [x24, x2]
+ add x2, x2, #8
+ cmp x2, #32
+ add x4, x4, x3
+ prfm pldl1keep, [x4]
+ b.ne .LBB2_5
+ zip1 v29.4s, v0.4s, v23.4s
+ zip2 v23.4s, v0.4s, v23.4s
+ zip1 v0.4s, v16.4s, v17.4s
+ zip2 v24.4s, v16.4s, v17.4s
+ zip1 v9.4s, v1.4s, v3.4s
+ zip2 v26.4s, v1.4s, v3.4s
+ zip1 v27.4s, v2.4s, v4.4s
+ zip2 v17.4s, v2.4s, v4.4s
+ zip1 v12.4s, v21.4s, v22.4s
+ zip2 v13.4s, v21.4s, v22.4s
+ add v2.4s, v7.4s, v10.4s
+ add v1.4s, v18.4s, v11.4s
+ ext v7.16b, v0.16b, v29.16b, #8
+ ext v22.16b, v24.16b, v23.16b, #8
+ zip1 v30.4s, v5.4s, v20.4s
+ zip2 v20.4s, v5.4s, v20.4s
+ stp q1, q2, [sp, #112]
+ ext v2.16b, v29.16b, v7.16b, #8
+ mov v29.d[1], v0.d[0]
+ ext v18.16b, v23.16b, v22.16b, #8
+ mov v23.d[1], v24.d[0]
+ zip1 v21.4s, v25.4s, v14.4s
+ zip2 v4.4s, v25.4s, v14.4s
+ zip1 v14.4s, v28.4s, v31.4s
+ zip2 v15.4s, v28.4s, v31.4s
+ add v8.4s, v6.4s, v19.4s
+ ext v28.16b, v27.16b, v9.16b, #8
+ ext v31.16b, v17.16b, v26.16b, #8
+ stur q2, [x29, #-208]
+ mov v7.16b, v29.16b
+ ext v0.16b, v12.16b, v30.16b, #8
+ stp q23, q29, [x29, #-80]
+ mov v2.16b, v19.16b
+ ext v19.16b, v13.16b, v20.16b, #8
+ mov v29.16b, v9.16b
+ ext v25.16b, v9.16b, v28.16b, #8
+ mov v29.d[1], v27.d[0]
+ ext v24.16b, v26.16b, v31.16b, #8
+ mov v26.d[1], v17.d[0]
+ ext v17.16b, v15.16b, v4.16b, #8
+ ext v27.16b, v30.16b, v0.16b, #8
+ ext v0.16b, v20.16b, v19.16b, #8
+ stp q0, q25, [sp, #80]
+ ext v0.16b, v4.16b, v17.16b, #8
+ str q0, [sp, #224]
+ ldr q0, [sp, #128]
+ mov v6.16b, v23.16b
+ mov v22.16b, v4.16b
+ ldr q16, [x9, :lo12:.LCPI2_1]
+ add v17.4s, v0.4s, v7.4s
+ ldr q0, [sp, #112]
+ mov v30.d[1], v12.d[0]
+ add v7.4s, v8.4s, v29.4s
+ mov v20.d[1], v13.d[0]
+ add v4.4s, v0.4s, v6.4s
+ ldr q0, [sp, #64]
+ dup v3.4s, w12
+ ext v28.16b, v14.16b, v21.16b, #8
+ dup v1.4s, w10
+ eor v19.16b, v17.16b, v0.16b
+ ldr q0, [sp, #48]
+ ext v23.16b, v21.16b, v28.16b, #8
+ mov v21.d[1], v14.d[0]
+ tbl v14.16b, { v19.16b }, v16.16b
+ eor v12.16b, v4.16b, v0.16b
+ movi v0.4s, #64
+ eor v13.16b, v7.16b, v0.16b
+ tbl v13.16b, { v13.16b }, v16.16b
+ add v6.4s, v13.4s, v3.4s
+ dup v5.4s, w11
+ tbl v12.16b, { v12.16b }, v16.16b
+ add v1.4s, v14.4s, v1.4s
+ eor v9.16b, v6.16b, v2.16b
+ ldp q2, q0, [sp, #192]
+ add v5.4s, v12.4s, v5.4s
+ eor v19.16b, v1.16b, v10.16b
+ eor v10.16b, v5.16b, v11.16b
+ ushr v11.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v11.16b, v19.16b, v11.16b
+ ushr v19.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ mov v22.d[1], v15.d[0]
+ orr v10.16b, v10.16b, v19.16b
+ ushr v19.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ add v15.4s, v0.4s, v2.4s
+ orr v9.16b, v9.16b, v19.16b
+ dup v19.4s, w6
+ add v15.4s, v15.4s, v26.4s
+ eor v19.16b, v15.16b, v19.16b
+ tbl v3.16b, { v19.16b }, v16.16b
+ dup v19.4s, w13
+ add v8.4s, v3.4s, v19.4s
+ ldur q31, [x29, #-208]
+ eor v19.16b, v8.16b, v2.16b
+ ushr v0.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v2.16b, v19.16b, v0.16b
+ ldr q19, [x14, :lo12:.LCPI2_2]
+ add v17.4s, v17.4s, v31.4s
+ add v17.4s, v17.4s, v11.4s
+ eor v14.16b, v14.16b, v17.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ add v1.4s, v1.4s, v14.4s
+ eor v11.16b, v1.16b, v11.16b
+ add v4.4s, v4.4s, v18.4s
+ ushr v0.4s, v11.4s, #7
+ shl v11.4s, v11.4s, #25
+ add v4.4s, v4.4s, v10.4s
+ orr v0.16b, v11.16b, v0.16b
+ eor v11.16b, v12.16b, v4.16b
+ tbl v11.16b, { v11.16b }, v19.16b
+ add v5.4s, v5.4s, v11.4s
+ eor v10.16b, v5.16b, v10.16b
+ add v7.4s, v7.4s, v25.4s
+ ushr v12.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ add v7.4s, v7.4s, v9.4s
+ orr v10.16b, v10.16b, v12.16b
+ eor v12.16b, v13.16b, v7.16b
+ tbl v12.16b, { v12.16b }, v19.16b
+ add v6.4s, v6.4s, v12.4s
+ eor v9.16b, v6.16b, v9.16b
+ ushr v13.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ orr v9.16b, v9.16b, v13.16b
+ add v13.4s, v15.4s, v24.4s
+ add v13.4s, v13.4s, v2.4s
+ eor v3.16b, v3.16b, v13.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ add v8.4s, v8.4s, v3.4s
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v30.4s
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v20.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v21.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v22.4s
+ mov v28.16b, v26.16b
+ stur q26, [x29, #-112]
+ mov v26.16b, v18.16b
+ mov v18.16b, v24.16b
+ stur q24, [x29, #-160]
+ add v6.4s, v6.4s, v3.4s
+ mov v24.16b, v20.16b
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldr q20, [sp, #80]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ stp q30, q22, [x29, #-192]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ mov v30.16b, v27.16b
+ add v17.4s, v17.4s, v27.4s
+ ldr q27, [sp, #224]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v20.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v23.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v27.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ stur q21, [x29, #-144]
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ ldur q21, [x29, #-80]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v21.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v26.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v18.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v29.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q22, [x29, #-64]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v28.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v24.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v22.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v23.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q22, [x29, #-144]
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v31.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v22.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v30.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v27.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldr q27, [sp, #96]
+ mov v21.16b, v26.16b
+ stur q26, [x29, #-96]
+ mov v28.16b, v31.16b
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldp q31, q26, [x29, #-192]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v20.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v27.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v26.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v31.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ mov v18.16b, v24.16b
+ mov v24.16b, v20.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ldur q20, [x29, #-160]
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v21.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v18.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v23.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v20.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q25, [x29, #-80]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v29.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v25.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v26.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ ldur q25, [x29, #-112]
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v25.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v30.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v24.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v31.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldur q25, [x29, #-64]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldr q31, [sp, #224]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v27.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v25.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v31.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v28.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v18.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v26.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v23.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ mov v21.16b, v29.16b
+ stur q29, [x29, #-128]
+ mov v29.16b, v30.16b
+ mov v30.16b, v27.16b
+ mov v27.16b, v18.16b
+ str q18, [sp, #176]
+ eor v0.16b, v0.16b, v1.16b
+ mov v18.16b, v22.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q22, [x29, #-96]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v20.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v29.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v22.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v31.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v21.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v24.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v30.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v28.4s
+ add v6.4s, v6.4s, v3.4s
+ mov v22.16b, v24.16b
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldur q24, [x29, #-80]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ mov v21.16b, v30.16b
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldur q30, [x29, #-192]
+ mov v20.16b, v29.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ ldur q29, [x29, #-112]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v25.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v24.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v30.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v29.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v18.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v20.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v31.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v26.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v23.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v27.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v30.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ ldur q27, [x29, #-160]
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v27.4s
+ mov v28.16b, v25.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v21.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v28.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v29.4s
+ mov v25.16b, v31.16b
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldur q31, [x29, #-96]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldur q28, [x29, #-208]
+ mov v18.16b, v20.16b
+ str q20, [sp, #144]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ ldur q20, [x29, #-128]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v24.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v31.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v28.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v20.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v18.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v30.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v25.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ add v17.4s, v17.4s, v26.4s
+ mov v26.16b, v21.16b
+ add v4.4s, v4.4s, v21.4s
+ ldur q21, [x29, #-144]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v21.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v28.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ str q23, [sp, #160]
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ add v17.4s, v17.4s, v23.4s
+ ldur q23, [x29, #-64]
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v23.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v24.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v20.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ add v13.4s, v13.4s, v0.4s
+ ldr q20, [sp, #176]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ eor v12.16b, v12.16b, v13.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ tbl v12.16b, { v12.16b }, v16.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v31.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v20.4s
+ add v7.4s, v7.4s, v29.4s
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v2.4s
+ orr v0.16b, v0.16b, v15.16b
+ mov v15.16b, v31.16b
+ add v17.4s, v17.4s, v22.4s
+ eor v31.16b, v14.16b, v4.16b
+ eor v22.16b, v11.16b, v7.16b
+ add v11.4s, v13.4s, v27.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ add v11.4s, v11.4s, v0.4s
+ tbl v31.16b, { v31.16b }, v19.16b
+ add v6.4s, v6.4s, v3.4s
+ eor v12.16b, v12.16b, v11.16b
+ tbl v22.16b, { v22.16b }, v19.16b
+ add v8.4s, v8.4s, v31.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v30.4s, v11.4s, v30.4s
+ tbl v11.16b, { v12.16b }, v19.16b
+ add v1.4s, v1.4s, v22.4s
+ eor v9.16b, v8.16b, v9.16b
+ ushr v12.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ add v5.4s, v5.4s, v11.4s
+ eor v2.16b, v1.16b, v2.16b
+ orr v10.16b, v10.16b, v12.16b
+ ushr v12.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v9.16b, v9.16b, v12.16b
+ ushr v12.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v2.16b, v2.16b, v12.16b
+ ushr v12.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v12.16b
+ add v4.4s, v4.4s, v26.4s
+ add v17.4s, v17.4s, v0.4s
+ add v7.4s, v7.4s, v28.4s
+ mov v18.16b, v27.16b
+ eor v31.16b, v31.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v27.4s, v30.4s, v2.4s
+ eor v22.16b, v22.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ eor v3.16b, v3.16b, v27.16b
+ add v26.4s, v27.4s, v29.4s
+ tbl v27.16b, { v31.16b }, v16.16b
+ eor v28.16b, v11.16b, v7.16b
+ tbl v22.16b, { v22.16b }, v16.16b
+ add v1.4s, v1.4s, v27.4s
+ add v4.4s, v4.4s, v23.4s
+ ldr q23, [sp, #144]
+ tbl v28.16b, { v28.16b }, v16.16b
+ tbl v3.16b, { v3.16b }, v16.16b
+ add v5.4s, v5.4s, v22.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v6.4s, v6.4s, v28.4s
+ add v29.4s, v8.4s, v3.4s
+ eor v30.16b, v5.16b, v10.16b
+ ushr v8.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v31.16b, v6.16b, v9.16b
+ orr v0.16b, v0.16b, v8.16b
+ ushr v8.4s, v30.4s, #12
+ shl v30.4s, v30.4s, #20
+ eor v2.16b, v29.16b, v2.16b
+ orr v30.16b, v30.16b, v8.16b
+ ushr v8.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ add v17.4s, v17.4s, v25.4s
+ add v7.4s, v7.4s, v23.4s
+ orr v31.16b, v31.16b, v8.16b
+ ushr v8.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ldur q23, [x29, #-176]
+ orr v2.16b, v2.16b, v8.16b
+ add v17.4s, v17.4s, v0.4s
+ eor v27.16b, v27.16b, v17.16b
+ add v4.4s, v4.4s, v30.4s
+ add v25.4s, v26.4s, v2.4s
+ eor v22.16b, v22.16b, v4.16b
+ add v4.4s, v4.4s, v24.4s
+ add v7.4s, v7.4s, v31.4s
+ eor v3.16b, v3.16b, v25.16b
+ add v24.4s, v25.4s, v18.4s
+ tbl v25.16b, { v27.16b }, v19.16b
+ add v17.4s, v17.4s, v23.4s
+ eor v23.16b, v28.16b, v7.16b
+ tbl v22.16b, { v22.16b }, v19.16b
+ add v1.4s, v1.4s, v25.4s
+ tbl v23.16b, { v23.16b }, v19.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ add v5.4s, v5.4s, v22.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v6.4s, v6.4s, v23.4s
+ add v26.4s, v29.4s, v3.4s
+ eor v27.16b, v5.16b, v30.16b
+ ushr v29.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v28.16b, v6.16b, v31.16b
+ orr v0.16b, v0.16b, v29.16b
+ ushr v29.4s, v27.4s, #7
+ shl v27.4s, v27.4s, #25
+ eor v2.16b, v26.16b, v2.16b
+ orr v27.16b, v27.16b, v29.16b
+ ushr v29.4s, v28.4s, #7
+ shl v28.4s, v28.4s, #25
+ ldur q18, [x29, #-128]
+ orr v28.16b, v28.16b, v29.16b
+ ushr v29.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v7.4s, v7.4s, v15.4s
+ orr v2.16b, v2.16b, v29.16b
+ add v17.4s, v17.4s, v27.4s
+ add v4.4s, v4.4s, v28.4s
+ add v7.4s, v7.4s, v2.4s
+ eor v3.16b, v3.16b, v17.16b
+ add v17.4s, v17.4s, v20.4s
+ eor v20.16b, v25.16b, v4.16b
+ add v4.4s, v4.4s, v21.4s
+ eor v21.16b, v22.16b, v7.16b
+ add v7.4s, v7.4s, v18.4s
+ add v18.4s, v24.4s, v0.4s
+ eor v22.16b, v23.16b, v18.16b
+ ldr q23, [sp, #160]
+ tbl v3.16b, { v3.16b }, v16.16b
+ tbl v20.16b, { v20.16b }, v16.16b
+ add v6.4s, v6.4s, v3.4s
+ add v18.4s, v18.4s, v23.4s
+ tbl v21.16b, { v21.16b }, v16.16b
+ tbl v16.16b, { v22.16b }, v16.16b
+ add v22.4s, v26.4s, v20.4s
+ eor v23.16b, v6.16b, v27.16b
+ add v1.4s, v1.4s, v21.4s
+ eor v24.16b, v22.16b, v28.16b
+ ushr v25.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ add v5.4s, v5.4s, v16.4s
+ eor v2.16b, v1.16b, v2.16b
+ orr v23.16b, v23.16b, v25.16b
+ ushr v25.4s, v24.4s, #12
+ shl v24.4s, v24.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ orr v24.16b, v24.16b, v25.16b
+ ushr v25.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v2.16b, v2.16b, v25.16b
+ ushr v25.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ orr v0.16b, v0.16b, v25.16b
+ add v25.4s, v7.4s, v2.4s
+ add v26.4s, v18.4s, v0.4s
+ eor v18.16b, v21.16b, v25.16b
+ add v17.4s, v17.4s, v23.4s
+ add v4.4s, v4.4s, v24.4s
+ eor v16.16b, v16.16b, v26.16b
+ tbl v21.16b, { v18.16b }, v19.16b
+ eor v3.16b, v3.16b, v17.16b
+ eor v7.16b, v20.16b, v4.16b
+ tbl v16.16b, { v16.16b }, v19.16b
+ add v1.4s, v1.4s, v21.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ tbl v20.16b, { v7.16b }, v19.16b
+ eor v2.16b, v1.16b, v2.16b
+ eor v7.16b, v1.16b, v17.16b
+ add v1.4s, v5.4s, v16.4s
+ eor v0.16b, v1.16b, v0.16b
+ eor v18.16b, v1.16b, v4.16b
+ add v1.4s, v6.4s, v3.4s
+ eor v4.16b, v1.16b, v23.16b
+ eor v6.16b, v25.16b, v1.16b
+ add v1.4s, v22.4s, v20.4s
+ eor v5.16b, v1.16b, v24.16b
+ eor v17.16b, v26.16b, v1.16b
+ ushr v1.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ orr v1.16b, v4.16b, v1.16b
+ ushr v4.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v4.16b, v5.16b, v4.16b
+ ushr v5.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v2.16b, v2.16b, v5.16b
+ ushr v5.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v5.16b
+ eor v10.16b, v0.16b, v20.16b
+ eor v11.16b, v1.16b, v21.16b
+ eor v19.16b, v4.16b, v16.16b
+ cmp x0, x22
+ eor v16.16b, v2.16b, v3.16b
+ mov w6, w19
+ b.ne .LBB2_4
+.LBB2_7:
+ zip1 v0.4s, v7.4s, v18.4s
+ zip2 v1.4s, v7.4s, v18.4s
+ zip1 v2.4s, v6.4s, v17.4s
+ zip2 v3.4s, v6.4s, v17.4s
+ zip1 v4.4s, v10.4s, v11.4s
+ zip2 v5.4s, v10.4s, v11.4s
+ zip1 v6.4s, v19.4s, v16.4s
+ zip2 v7.4s, v19.4s, v16.4s
+ add x15, x20, #4
+ tst w5, #0x1
+ sub x28, x28, #4
+ zip1 v16.2d, v0.2d, v2.2d
+ zip2 v0.2d, v0.2d, v2.2d
+ zip1 v2.2d, v1.2d, v3.2d
+ zip2 v1.2d, v1.2d, v3.2d
+ zip1 v3.2d, v4.2d, v6.2d
+ zip2 v4.2d, v4.2d, v6.2d
+ zip1 v6.2d, v5.2d, v7.2d
+ zip2 v5.2d, v5.2d, v7.2d
+ add x24, x24, #32
+ csel x20, x15, x20, ne
+ cmp x28, #3
+ stp q16, q3, [x26]
+ stp q0, q4, [x26, #32]
+ stp q2, q6, [x26, #64]
+ stp q1, q5, [x26, #96]
+ add x26, x26, #128
+ b.hi .LBB2_2
+.LBB2_8:
+ cbz x28, .LBB2_16
+ orr w8, w7, w19
+ and x21, x5, #0x1
+ stur w8, [x29, #-64]
+.LBB2_10:
+ ldr x8, [sp, #40]
+ ldr x25, [x24]
+ ldur w4, [x29, #-64]
+ ldp q1, q0, [x8]
+ mov x8, x22
+ stp q1, q0, [x29, #-48]
+.LBB2_11:
+ subs x23, x8, #1
+ b.eq .LBB2_13
+ cbnz x8, .LBB2_14
+ b .LBB2_15
+.LBB2_13:
+ orr w4, w4, w27
+.LBB2_14:
+ sub x0, x29, #48
+ mov w2, #64
+ mov x1, x25
+ mov x3, x20
+ bl zfs_blake3_compress_in_place_sse41
+ add x25, x25, #64
+ mov x8, x23
+ mov w4, w19
+ b .LBB2_11
+.LBB2_15:
+ ldp q0, q1, [x29, #-48]
+ add x20, x20, x21
+ add x24, x24, #8
+ subs x28, x28, #1
+ stp q0, q1, [x26], #32
+ b.ne .LBB2_10
+.LBB2_16:
+ add sp, sp, #448
+ ldp x20, x19, [sp, #144]
+ ldp x22, x21, [sp, #128]
+ ldp x24, x23, [sp, #112]
+ ldp x26, x25, [sp, #96]
+ ldp x28, x27, [sp, #80]
+ ldp x29, x30, [sp, #64]
+ ldp d9, d8, [sp, #48]
+ ldp d11, d10, [sp, #32]
+ ldp d13, d12, [sp, #16]
+ ldp d15, d14, [sp], #160
+ ret
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S b/sys/contrib/openzfs/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S
new file mode 100644
index 000000000000..9deba202fde8
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S
@@ -0,0 +1,2823 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ *
+ * This is converted assembly: SSE2 -> POWER8 PPC64 Little Endian
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+ .text
+ .abiversion 2
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI0_0:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI0_1:
+ .long 1779033703
+ .long 3144134277
+ .long 1013904242
+ .long 2773480762
+.LCPI0_2:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_3:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_4:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_5:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_6:
+ .short 1
+ .short 2
+ .short 4
+ .short 8
+ .short 16
+ .short 32
+ .short 64
+ .short 128
+.LCPI0_7:
+ .short 0
+ .short 0
+ .short 4
+ .short 8
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI0_8:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI0_9:
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI0_10:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 7
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_11:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_12:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI0_13:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_14:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .text
+ .globl zfs_blake3_compress_in_place_sse2
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse2,@function
+zfs_blake3_compress_in_place_sse2:
+# Compress one 64-byte BLAKE3 block in place: the 32-byte chaining value is
+# read from the buffer at r3 (two lxvd2x loads at r3 and r3+16) and the
+# message block from r4; the updated chaining value is written back to r3
+# by the two stxvd2x stores just before the epilogue.
+# NOTE(review): this is machine-converted code (x86 SSE2 -> POWER8 LE via
+# SIMDe, per the file header); presumably r5/r6/r7 carry block_len, counter
+# and flags as in the C prototype -- confirm against blake3_impl.h.
+.Lfunc_begin0:
+ .cfi_startproc
+.Lfunc_gep0:
+# ELFv2 global entry: materialize the TOC pointer in r2 so the .LCPI0_*
+# constant tables below can be addressed @toc@ha/@toc@l.
+ addis 2, 12, .TOC.-.Lfunc_gep0@ha
+ addi 2, 2, .TOC.-.Lfunc_gep0@l
+.Lfunc_lep0:
+ .localentry zfs_blake3_compress_in_place_sse2, .Lfunc_lep0-.Lfunc_gep0
+# Prologue: spill non-volatile VSX registers vs60-vs63 below the stack
+# pointer (offsets -64..-16 from r1), interleaved with the first loads of
+# the message block (r4) and scalar arguments into vector registers.
+ li 8, -64
+ mtvsrd 35, 5
+ li 5, 16
+ lfdx 0, 0, 4
+ vspltisw 12, 9
+ stxvd2x 60, 1, 8
+ li 8, -48
+ mtvsrd 36, 7
+ lfd 2, 16(4)
+ stxvd2x 61, 1, 8
+ li 8, -32
+ lfd 1, 8(4)
+ mtvsrwz 37, 6
+ rldicl 6, 6, 32, 32
+ addis 7, 2, .LCPI0_2@toc@ha
+ stxvd2x 62, 1, 8
+ li 8, -16
+ addi 7, 7, .LCPI0_2@toc@l
+ stxvd2x 63, 1, 8
+ li 8, 0
+ lvx 9, 0, 7
+ li 7, 48
+ mtvsrd 34, 8
+ xxmrghd 32, 1, 0
+# Load the 32-byte chaining value from the output buffer (r3, r3+16) and
+# the IV/permutation tables (.LCPI0_*) via the TOC; the interleaved vperm/
+# vmrg*/xxswapd traffic assembles the byte-order-corrected message words.
+ lxvd2x 0, 0, 3
+ lxvd2x 1, 3, 5
+ lfd 3, 24(4)
+ addis 8, 2, .LCPI0_5@toc@ha
+ vmrghb 3, 2, 3
+ addi 8, 8, .LCPI0_5@toc@l
+ vmrghb 4, 2, 4
+ vspltb 2, 2, 7
+ xxmrghd 33, 3, 2
+ vpkudum 7, 1, 0
+ vmrglh 3, 2, 3
+ vmrglh 2, 2, 4
+ mtvsrwz 36, 6
+ addis 6, 2, .LCPI0_0@toc@ha
+ addi 6, 6, .LCPI0_0@toc@l
+ vperm 10, 1, 0, 9
+ vmrghw 4, 4, 5
+ xxswapd 37, 1
+ lxvd2x 1, 4, 7
+ addis 7, 2, .LCPI0_8@toc@ha
+ addi 7, 7, .LCPI0_8@toc@l
+ vmrglw 2, 2, 3
+ xxswapd 35, 0
+ xxswapd 41, 1
+ xxspltd 62, 42, 1
+ vadduwm 3, 7, 3
+ vadduwm 6, 3, 5
+ xxmrgld 36, 34, 36
+ lvx 2, 0, 6
+ addis 6, 2, .LCPI0_1@toc@ha
+ addi 6, 6, .LCPI0_1@toc@l
+ xxlxor 35, 38, 36
+ lvx 4, 0, 6
+ li 6, 32
+ lxvd2x 0, 4, 6
+ addis 4, 2, .LCPI0_3@toc@ha
+ addis 6, 2, .LCPI0_7@toc@ha
+ vperm 8, 3, 3, 2
+ vspltisw 3, 10
+ addi 4, 4, .LCPI0_3@toc@l
+ addi 6, 6, .LCPI0_7@toc@l
+ vadduwm 3, 3, 3
+ vadduwm 11, 8, 4
+ xxlxor 36, 43, 37
+ vadduwm 5, 6, 10
+ vrlw 0, 4, 3
+ vspltisw 4, 12
+ vadduwm 4, 4, 4
+ vadduwm 1, 0, 5
+ xxlxor 37, 33, 40
+ xxswapd 40, 0
+ vrlw 6, 5, 4
+ vspltisw 5, -16
+ vpkudum 13, 9, 8
+ vsubuwm 5, 12, 5
+ lvx 12, 0, 4
+ addis 4, 2, .LCPI0_4@toc@ha
+ addi 4, 4, .LCPI0_4@toc@l
+# Main mixing sequence: fully unrolled rounds built from vadduwm (add),
+# xxlxor (xor), vrlw (rotate by the splatted constants in v3/v4/v5) and
+# vperm/xxsldwi/xxswapd shuffles implementing the message-word schedule
+# and diagonal rotations. Do not reorder -- register lifetimes overlap
+# tightly and constants in r4/r6/r7 are reloaded mid-stream.
+ vadduwm 11, 6, 11
+ xxswapd 0, 38
+ vadduwm 1, 1, 13
+ xxsldwi 50, 45, 45, 1
+ xxlxor 32, 43, 32
+ xxsldwi 43, 43, 43, 3
+ xxsldwi 33, 33, 33, 1
+ vperm 12, 8, 9, 12
+ vrlw 0, 0, 5
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 0
+ vadduwm 1, 1, 12
+ vperm 6, 6, 6, 2
+ vadduwm 15, 6, 11
+ lvx 11, 0, 4
+ addis 4, 2, .LCPI0_6@toc@ha
+ addi 4, 4, .LCPI0_6@toc@l
+ xxlxor 32, 47, 32
+ lvx 17, 0, 4
+ addis 4, 2, .LCPI0_9@toc@ha
+ vperm 14, 10, 7, 11
+ addi 4, 4, .LCPI0_9@toc@l
+ vrlw 0, 0, 3
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 38
+ vrlw 6, 6, 4
+ vadduwm 8, 6, 15
+ xxswapd 0, 38
+ lvx 6, 0, 8
+ xxlxor 32, 40, 32
+ xxsldwi 40, 40, 40, 1
+ vperm 13, 12, 18, 6
+ vrlw 9, 0, 5
+ vadduwm 0, 1, 14
+ lvx 1, 0, 7
+ xxsldwi 46, 46, 46, 3
+ xxsldwi 32, 32, 32, 3
+ vperm 7, 7, 7, 1
+ vadduwm 15, 9, 0
+ xxlxor 32, 47, 0
+ vperm 16, 0, 0, 2
+ lvx 0, 0, 6
+ addis 6, 2, .LCPI0_10@toc@ha
+ vcmpequh 0, 0, 17
+ vadduwm 19, 16, 8
+ xxlxor 40, 51, 41
+ xxsel 45, 39, 45, 32
+ vrlw 31, 8, 3
+ lvx 8, 0, 4
+ addis 4, 2, .LCPI0_11@toc@ha
+ addi 4, 4, .LCPI0_11@toc@l
+ vcmpequh 7, 8, 17
+ vadduwm 8, 15, 13
+ vadduwm 15, 31, 8
+ lvx 8, 0, 4
+ addi 4, 6, .LCPI0_10@toc@l
+ lvx 17, 0, 4
+ addis 4, 2, .LCPI0_12@toc@ha
+ xxlxor 41, 47, 48
+ xxsldwi 47, 47, 47, 1
+ addi 4, 4, .LCPI0_12@toc@l
+ xxlnor 48, 39, 39
+ vrlw 29, 9, 4
+ vperm 9, 16, 16, 8
+ xxland 48, 50, 39
+ vperm 17, 30, 12, 17
+ vperm 16, 16, 16, 8
+ vmrghw 12, 12, 10
+ lvx 10, 0, 4
+ addis 4, 2, .LCPI0_13@toc@ha
+ vadduwm 19, 29, 19
+ addi 4, 4, .LCPI0_13@toc@l
+ xxlxor 63, 51, 63
+ xxsldwi 51, 51, 51, 3
+ xxland 0, 49, 41
+ vrlw 17, 31, 5
+ xxlor 48, 0, 48
+ xxswapd 0, 61
+ vperm 18, 12, 18, 10
+ vadduwm 15, 15, 16
+ xxland 60, 48, 39
+ vadduwm 15, 17, 15
+ vperm 28, 28, 28, 8
+ xxlxor 63, 47, 0
+ vadduwm 15, 15, 18
+ vperm 31, 31, 31, 2
+ vperm 30, 18, 16, 6
+ vadduwm 19, 31, 19
+ xxlxor 44, 51, 49
+ vrlw 12, 12, 3
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 63
+ vperm 31, 13, 14, 11
+ vrlw 17, 17, 4
+ vperm 14, 14, 14, 1
+ vadduwm 15, 15, 31
+ vadduwm 19, 17, 19
+ xxswapd 0, 49
+ xxsldwi 47, 47, 47, 3
+ xxsel 46, 46, 62, 32
+ xxlxor 44, 51, 44
+ xxsldwi 51, 51, 51, 1
+ vrlw 12, 12, 5
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 19, 17, 19
+ xxlxor 44, 51, 44
+ vrlw 29, 12, 3
+ vadduwm 12, 15, 14
+ vadduwm 15, 29, 12
+ lvx 12, 0, 4
+ addis 4, 2, .LCPI0_14@toc@ha
+ addi 4, 4, .LCPI0_14@toc@l
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ vperm 30, 13, 18, 12
+ vrlw 17, 17, 4
+ vmrghw 13, 18, 13
+ xxland 0, 62, 41
+ vadduwm 19, 17, 19
+ vperm 16, 13, 16, 10
+ xxlxor 61, 51, 61
+ xxsldwi 50, 51, 51, 3
+ xxsldwi 51, 63, 63, 3
+ vrlw 30, 29, 5
+ xxlor 61, 60, 0
+ xxswapd 0, 49
+ vperm 31, 14, 19, 11
+ vadduwm 15, 15, 29
+ vperm 19, 19, 19, 1
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 16
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 62
+ vperm 30, 16, 29, 6
+ vrlw 13, 13, 3
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 31
+ xxsldwi 63, 63, 63, 3
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 45, 50, 45
+ xxsldwi 50, 50, 50, 1
+ vrlw 13, 13, 5
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 45
+ vrlw 28, 13, 3
+ xxsel 45, 51, 62, 32
+ xxland 51, 61, 39
+ vperm 30, 14, 16, 12
+ vadduwm 15, 15, 13
+ vperm 19, 19, 19, 8
+ vmrghw 14, 16, 14
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 62, 41
+ vrlw 17, 17, 4
+ xxlor 51, 51, 0
+ vadduwm 15, 15, 19
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 60, 50, 60
+ xxsldwi 48, 50, 50, 3
+ vperm 18, 14, 29, 10
+ vrlw 30, 28, 5
+ vperm 29, 18, 19, 6
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 18
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 62
+ vperm 30, 13, 31, 11
+ vrlw 14, 14, 3
+ vperm 31, 31, 31, 1
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 46, 48, 46
+ xxsldwi 48, 48, 48, 1
+ vrlw 14, 14, 5
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 46
+ vrlw 28, 14, 3
+ xxsel 46, 63, 61, 32
+ xxland 63, 51, 39
+ vperm 29, 13, 18, 12
+ vadduwm 15, 15, 14
+ vperm 31, 31, 31, 8
+ vmrghw 13, 18, 13
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 61, 41
+ vrlw 17, 17, 4
+ xxlor 63, 63, 0
+ vperm 13, 13, 19, 10
+ xxsldwi 51, 62, 62, 3
+ vadduwm 15, 15, 31
+ vperm 30, 14, 19, 11
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 60, 48, 60
+ xxsldwi 48, 48, 48, 3
+ vrlw 29, 28, 5
+ vadduwm 15, 29, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 50, 48, 61
+ vrlw 18, 18, 3
+ vadduwm 15, 18, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 11, 17, 16
+ xxswapd 0, 49
+ xxlxor 48, 43, 50
+ xxsldwi 43, 43, 43, 1
+ vperm 18, 19, 19, 1
+ vrlw 16, 16, 5
+ vperm 19, 13, 31, 6
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 29, 17, 11
+ xxlxor 43, 61, 48
+ vrlw 16, 11, 3
+ xxsel 43, 50, 51, 32
+ xxland 50, 63, 39
+ vperm 19, 14, 13, 12
+ vadduwm 15, 15, 11
+ vperm 18, 18, 18, 8
+ vmrghw 13, 13, 14
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 51, 41
+ lvx 19, 0, 4
+ vrlw 17, 17, 4
+ xxlor 50, 50, 0
+ vperm 13, 13, 31, 10
+ xxsldwi 63, 62, 62, 3
+ vadduwm 15, 15, 18
+ vperm 19, 11, 31, 19
+ vadduwm 29, 17, 29
+ xxswapd 0, 49
+ vperm 1, 31, 31, 1
+ xxlxor 48, 61, 48
+ xxsldwi 46, 61, 61, 3
+ vperm 6, 13, 18, 6
+ vrlw 16, 16, 5
+ xxsel 32, 33, 38, 32
+ xxland 38, 50, 39
+ vadduwm 15, 16, 15
+ vperm 7, 11, 13, 12
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vperm 6, 6, 6, 8
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 3
+ vrlw 17, 17, 4
+ vadduwm 15, 15, 19
+ vadduwm 14, 17, 14
+ xxswapd 0, 49
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 5
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vadduwm 0, 15, 0
+ vperm 17, 17, 17, 2
+ xxland 0, 39, 41
+ xxlor 38, 38, 0
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 0, 16, 0
+ xxlxor 33, 32, 49
+ xxsldwi 32, 32, 32, 1
+ vrlw 1, 1, 4
+ vadduwm 0, 0, 6
+ vadduwm 8, 1, 14
+ xxswapd 0, 33
+ xxlxor 44, 40, 48
+ xxsldwi 38, 40, 40, 3
+ vrlw 7, 12, 5
+ vadduwm 0, 7, 0
+ xxlxor 33, 32, 0
+ vperm 2, 1, 1, 2
+ vmrghw 1, 13, 11
+ vadduwm 6, 2, 6
+ vperm 1, 1, 18, 10
+ xxlxor 39, 38, 39
+ vrlw 3, 7, 3
+ vadduwm 0, 0, 1
+ vadduwm 0, 3, 0
+ xxlxor 34, 32, 34
+ xxsldwi 0, 32, 32, 3
+ vrlw 2, 2, 4
+ vadduwm 4, 2, 6
+ xxswapd 2, 34
+ xxlxor 35, 36, 35
+ xxsldwi 1, 36, 36, 1
+ vrlw 3, 3, 5
+# Finalization: xor the two state halves into the output chaining value
+# and store both 16-byte halves back to the buffer at r3 (in-place update;
+# r5 still holds 16 from the prologue).
+ xxlxor 0, 1, 0
+ xxswapd 0, 0
+ xxlxor 1, 35, 2
+ stxvd2x 0, 0, 3
+ xxswapd 1, 1
+ stxvd2x 1, 3, 5
+# Epilogue: restore the spilled non-volatile VSX registers vs60-vs63 and
+# return. r3 is clobbered here, which is fine -- the result was already
+# stored through it above.
+ li 3, -16
+ lxvd2x 63, 1, 3
+ li 3, -32
+ lxvd2x 62, 1, 3
+ li 3, -48
+ lxvd2x 61, 1, 3
+ li 3, -64
+ lxvd2x 60, 1, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-.Lfunc_begin0
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI1_0:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI1_1:
+ .long 1779033703
+ .long 3144134277
+ .long 1013904242
+ .long 2773480762
+.LCPI1_2:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_3:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_4:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_5:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_6:
+ .short 1
+ .short 2
+ .short 4
+ .short 8
+ .short 16
+ .short 32
+ .short 64
+ .short 128
+.LCPI1_7:
+ .short 0
+ .short 0
+ .short 4
+ .short 8
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI1_8:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI1_9:
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI1_10:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 7
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_11:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_12:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI1_13:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_14:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .text
+ .globl zfs_blake3_compress_xof_sse2
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse2,@function
+zfs_blake3_compress_xof_sse2:
+.Lfunc_begin1:
+ .cfi_startproc
+.Lfunc_gep1:
+ addis 2, 12, .TOC.-.Lfunc_gep1@ha
+ addi 2, 2, .TOC.-.Lfunc_gep1@l
+.Lfunc_lep1:
+ .localentry zfs_blake3_compress_xof_sse2, .Lfunc_lep1-.Lfunc_gep1
+ li 9, -80
+ mtvsrd 35, 5
+ li 5, 16
+ lfdx 0, 0, 4
+ addis 10, 2, .LCPI1_2@toc@ha
+ vspltisw 12, 9
+ std 30, -16(1)
+ addis 12, 2, .LCPI1_8@toc@ha
+ addis 30, 2, .LCPI1_5@toc@ha
+ addis 11, 2, .LCPI1_7@toc@ha
+ stxvd2x 60, 1, 9
+ li 9, -64
+ mtvsrd 36, 7
+ lfd 2, 16(4)
+ addi 10, 10, .LCPI1_2@toc@l
+ addi 12, 12, .LCPI1_8@toc@l
+ addi 11, 11, .LCPI1_7@toc@l
+ stxvd2x 61, 1, 9
+ li 9, -48
+ lfd 3, 24(4)
+ mtvsrwz 37, 6
+ rldicl 6, 6, 32, 32
+ lvx 9, 0, 10
+ stxvd2x 62, 1, 9
+ li 9, -32
+ li 10, 32
+ stxvd2x 63, 1, 9
+ li 9, 0
+ mtvsrd 34, 9
+ xxmrghd 33, 3, 2
+ lfd 1, 8(4)
+ vmrghb 3, 2, 3
+ vmrghb 4, 2, 4
+ vspltb 2, 2, 7
+ xxmrghd 32, 1, 0
+ lxvd2x 0, 0, 3
+ lxvd2x 1, 3, 5
+ vpkudum 7, 1, 0
+ vmrglh 3, 2, 3
+ vmrglh 2, 2, 4
+ mtvsrwz 36, 6
+ addis 6, 2, .LCPI1_0@toc@ha
+ addi 6, 6, .LCPI1_0@toc@l
+ vperm 10, 1, 0, 9
+ vmrghw 4, 4, 5
+ xxswapd 37, 1
+ vmrglw 2, 2, 3
+ xxswapd 35, 0
+ lxvd2x 0, 4, 10
+ xxspltd 62, 42, 1
+ vadduwm 3, 7, 3
+ vadduwm 6, 3, 5
+ xxmrgld 36, 34, 36
+ lvx 2, 0, 6
+ addis 6, 2, .LCPI1_1@toc@ha
+ addi 6, 6, .LCPI1_1@toc@l
+ xxlxor 35, 38, 36
+ lvx 4, 0, 6
+ li 6, 48
+ lxvd2x 1, 4, 6
+ addis 4, 2, .LCPI1_3@toc@ha
+ vperm 8, 3, 3, 2
+ vspltisw 3, 10
+ addi 4, 4, .LCPI1_3@toc@l
+ xxswapd 41, 1
+ vadduwm 3, 3, 3
+ vadduwm 11, 8, 4
+ xxlxor 36, 43, 37
+ vadduwm 5, 6, 10
+ vrlw 0, 4, 3
+ vspltisw 4, 12
+ vadduwm 4, 4, 4
+ vadduwm 1, 0, 5
+ xxlxor 37, 33, 40
+ xxswapd 40, 0
+ vrlw 6, 5, 4
+ vspltisw 5, -16
+ vpkudum 13, 9, 8
+ vsubuwm 5, 12, 5
+ lvx 12, 0, 4
+ addis 4, 2, .LCPI1_4@toc@ha
+ addi 4, 4, .LCPI1_4@toc@l
+ vadduwm 11, 6, 11
+ xxswapd 0, 38
+ vadduwm 1, 1, 13
+ xxsldwi 50, 45, 45, 1
+ xxlxor 32, 43, 32
+ xxsldwi 43, 43, 43, 3
+ xxsldwi 33, 33, 33, 1
+ vperm 12, 8, 9, 12
+ vrlw 0, 0, 5
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 0
+ vadduwm 1, 1, 12
+ vperm 6, 6, 6, 2
+ vadduwm 15, 6, 11
+ lvx 11, 0, 4
+ addis 4, 2, .LCPI1_6@toc@ha
+ addi 4, 4, .LCPI1_6@toc@l
+ xxlxor 32, 47, 32
+ lvx 17, 0, 4
+ addi 4, 30, .LCPI1_5@toc@l
+ vperm 14, 10, 7, 11
+ vrlw 0, 0, 3
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 38
+ vrlw 6, 6, 4
+ vadduwm 8, 6, 15
+ xxswapd 0, 38
+ lvx 6, 0, 4
+ addis 4, 2, .LCPI1_9@toc@ha
+ addi 4, 4, .LCPI1_9@toc@l
+ xxlxor 32, 40, 32
+ xxsldwi 40, 40, 40, 1
+ vperm 13, 12, 18, 6
+ vrlw 9, 0, 5
+ vadduwm 0, 1, 14
+ lvx 1, 0, 12
+ xxsldwi 46, 46, 46, 3
+ xxsldwi 32, 32, 32, 3
+ vperm 7, 7, 7, 1
+ vadduwm 15, 9, 0
+ xxlxor 32, 47, 0
+ vperm 16, 0, 0, 2
+ lvx 0, 0, 11
+ addis 11, 2, .LCPI1_10@toc@ha
+ vcmpequh 0, 0, 17
+ vadduwm 19, 16, 8
+ xxlxor 40, 51, 41
+ xxsel 45, 39, 45, 32
+ vrlw 31, 8, 3
+ lvx 8, 0, 4
+ addis 4, 2, .LCPI1_11@toc@ha
+ addi 4, 4, .LCPI1_11@toc@l
+ vcmpequh 7, 8, 17
+ vadduwm 8, 15, 13
+ vadduwm 15, 31, 8
+ lvx 8, 0, 4
+ addi 4, 11, .LCPI1_10@toc@l
+ lvx 17, 0, 4
+ addis 4, 2, .LCPI1_12@toc@ha
+ xxlxor 41, 47, 48
+ xxsldwi 47, 47, 47, 1
+ addi 4, 4, .LCPI1_12@toc@l
+ xxlnor 48, 39, 39
+ vrlw 29, 9, 4
+ vperm 9, 16, 16, 8
+ xxland 48, 50, 39
+ vperm 17, 30, 12, 17
+ vperm 16, 16, 16, 8
+ vmrghw 12, 12, 10
+ lvx 10, 0, 4
+ addis 4, 2, .LCPI1_13@toc@ha
+ vadduwm 19, 29, 19
+ addi 4, 4, .LCPI1_13@toc@l
+ xxlxor 63, 51, 63
+ xxsldwi 51, 51, 51, 3
+ xxland 0, 49, 41
+ vrlw 17, 31, 5
+ xxlor 48, 0, 48
+ xxswapd 0, 61
+ vperm 18, 12, 18, 10
+ vadduwm 15, 15, 16
+ xxland 60, 48, 39
+ vadduwm 15, 17, 15
+ vperm 28, 28, 28, 8
+ xxlxor 63, 47, 0
+ vadduwm 15, 15, 18
+ vperm 31, 31, 31, 2
+ vperm 30, 18, 16, 6
+ vadduwm 19, 31, 19
+ xxlxor 44, 51, 49
+ vrlw 12, 12, 3
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 63
+ vperm 31, 13, 14, 11
+ vrlw 17, 17, 4
+ vperm 14, 14, 14, 1
+ vadduwm 15, 15, 31
+ vadduwm 19, 17, 19
+ xxswapd 0, 49
+ xxsldwi 47, 47, 47, 3
+ xxsel 46, 46, 62, 32
+ xxlxor 44, 51, 44
+ xxsldwi 51, 51, 51, 1
+ vrlw 12, 12, 5
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 19, 17, 19
+ xxlxor 44, 51, 44
+ vrlw 29, 12, 3
+ vadduwm 12, 15, 14
+ vadduwm 15, 29, 12
+ lvx 12, 0, 4
+ addis 4, 2, .LCPI1_14@toc@ha
+ addi 4, 4, .LCPI1_14@toc@l
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ vperm 30, 13, 18, 12
+ vrlw 17, 17, 4
+ vmrghw 13, 18, 13
+ xxland 0, 62, 41
+ vadduwm 19, 17, 19
+ vperm 16, 13, 16, 10
+ xxlxor 61, 51, 61
+ xxsldwi 50, 51, 51, 3
+ xxsldwi 51, 63, 63, 3
+ vrlw 30, 29, 5
+ xxlor 61, 60, 0
+ xxswapd 0, 49
+ vperm 31, 14, 19, 11
+ vadduwm 15, 15, 29
+ vperm 19, 19, 19, 1
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 16
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 62
+ vperm 30, 16, 29, 6
+ vrlw 13, 13, 3
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 31
+ xxsldwi 63, 63, 63, 3
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 45, 50, 45
+ xxsldwi 50, 50, 50, 1
+ vrlw 13, 13, 5
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 45
+ vrlw 28, 13, 3
+ xxsel 45, 51, 62, 32
+ xxland 51, 61, 39
+ vperm 30, 14, 16, 12
+ vadduwm 15, 15, 13
+ vperm 19, 19, 19, 8
+ vmrghw 14, 16, 14
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 62, 41
+ vrlw 17, 17, 4
+ xxlor 51, 51, 0
+ vadduwm 15, 15, 19
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 60, 50, 60
+ xxsldwi 48, 50, 50, 3
+ vperm 18, 14, 29, 10
+ vrlw 30, 28, 5
+ vperm 29, 18, 19, 6
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 18
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 62
+ vperm 30, 13, 31, 11
+ vrlw 14, 14, 3
+ vperm 31, 31, 31, 1
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 46, 48, 46
+ xxsldwi 48, 48, 48, 1
+ vrlw 14, 14, 5
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 46
+ vrlw 28, 14, 3
+ xxsel 46, 63, 61, 32
+ xxland 63, 51, 39
+ vperm 29, 13, 18, 12
+ vadduwm 15, 15, 14
+ vperm 31, 31, 31, 8
+ vmrghw 13, 18, 13
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 61, 41
+ vrlw 17, 17, 4
+ xxlor 63, 63, 0
+ vperm 13, 13, 19, 10
+ xxsldwi 51, 62, 62, 3
+ vadduwm 15, 15, 31
+ vperm 30, 14, 19, 11
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 60, 48, 60
+ xxsldwi 48, 48, 48, 3
+ vrlw 29, 28, 5
+ vadduwm 15, 29, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 50, 48, 61
+ vrlw 18, 18, 3
+ vadduwm 15, 18, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 11, 17, 16
+ xxswapd 0, 49
+ xxlxor 48, 43, 50
+ xxsldwi 43, 43, 43, 1
+ vperm 18, 19, 19, 1
+ vrlw 16, 16, 5
+ vperm 19, 13, 31, 6
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 29, 17, 11
+ xxlxor 43, 61, 48
+ vrlw 16, 11, 3
+ xxsel 43, 50, 51, 32
+ xxland 50, 63, 39
+ vperm 19, 14, 13, 12
+ vadduwm 15, 15, 11
+ vperm 18, 18, 18, 8
+ vmrghw 13, 13, 14
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 51, 41
+ lvx 19, 0, 4
+ vrlw 17, 17, 4
+ xxlor 50, 50, 0
+ vperm 13, 13, 31, 10
+ xxsldwi 63, 62, 62, 3
+ vadduwm 15, 15, 18
+ vperm 19, 11, 31, 19
+ vadduwm 29, 17, 29
+ xxswapd 0, 49
+ vperm 1, 31, 31, 1
+ xxlxor 48, 61, 48
+ xxsldwi 46, 61, 61, 3
+ vperm 6, 13, 18, 6
+ vrlw 16, 16, 5
+ xxsel 32, 33, 38, 32
+ xxland 38, 50, 39
+ vadduwm 15, 16, 15
+ vperm 7, 11, 13, 12
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vperm 6, 6, 6, 8
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 3
+ vrlw 17, 17, 4
+ vadduwm 15, 15, 19
+ vadduwm 14, 17, 14
+ xxswapd 0, 49
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 5
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vadduwm 0, 15, 0
+ vperm 17, 17, 17, 2
+ xxland 0, 39, 41
+ xxlor 38, 38, 0
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 0, 16, 0
+ xxlxor 33, 32, 49
+ xxsldwi 32, 32, 32, 1
+ vrlw 1, 1, 4
+ vadduwm 0, 0, 6
+ vadduwm 8, 1, 14
+ xxswapd 0, 33
+ xxlxor 44, 40, 48
+ xxsldwi 38, 40, 40, 3
+ vrlw 7, 12, 5
+ vadduwm 0, 7, 0
+ xxlxor 33, 32, 0
+ vperm 2, 1, 1, 2
+ vmrghw 1, 13, 11
+ vadduwm 6, 2, 6
+ vperm 1, 1, 18, 10
+ xxlxor 39, 38, 39
+ vrlw 3, 7, 3
+ vadduwm 0, 0, 1
+ vadduwm 0, 3, 0
+ xxlxor 34, 32, 34
+ xxsldwi 0, 32, 32, 3
+ vrlw 2, 2, 4
+ vadduwm 4, 2, 6
+ xxswapd 2, 34
+ xxlxor 35, 36, 35
+ xxsldwi 1, 36, 36, 1
+ vrlw 3, 3, 5
+ xxlxor 0, 1, 0
+ xxswapd 0, 0
+ xxlxor 3, 35, 2
+ stxvd2x 0, 0, 8
+ xxswapd 3, 3
+ stxvd2x 3, 8, 5
+ lfdx 0, 0, 3
+ lfd 3, 8(3)
+ xxmrghd 34, 3, 0
+ xxlxor 0, 1, 34
+ xxswapd 0, 0
+ stxvd2x 0, 8, 10
+ lfd 0, 16(3)
+ lfd 1, 24(3)
+ li 3, -32
+ xxmrghd 34, 1, 0
+ xxlxor 0, 2, 34
+ xxswapd 0, 0
+ stxvd2x 0, 8, 6
+ lxvd2x 63, 1, 3
+ li 3, -48
+ ld 30, -16(1)
+ lxvd2x 62, 1, 3
+ li 3, -64
+ lxvd2x 61, 1, 3
+ li 3, -80
+ lxvd2x 60, 1, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-.Lfunc_begin1
+ .cfi_endproc
+
+ .globl zfs_blake3_hash_many_sse2
+ .p2align 2
+ .type zfs_blake3_hash_many_sse2,@function
+zfs_blake3_hash_many_sse2:
+.Lfunc_begin2:
+ .cfi_startproc
+.Lfunc_gep2:
+ addis 2, 12, .TOC.-.Lfunc_gep2@ha
+ addi 2, 2, .TOC.-.Lfunc_gep2@l
+.Lfunc_lep2:
+ .localentry zfs_blake3_hash_many_sse2, .Lfunc_lep2-.Lfunc_gep2
+ mfocrf 12, 32
+ mflr 0
+ std 0, 16(1)
+ stw 12, 8(1)
+ stdu 1, -256(1)
+ .cfi_def_cfa_offset 256
+ .cfi_offset lr, 16
+ .cfi_offset r17, -120
+ .cfi_offset r18, -112
+ .cfi_offset r19, -104
+ .cfi_offset r20, -96
+ .cfi_offset r21, -88
+ .cfi_offset r22, -80
+ .cfi_offset r23, -72
+ .cfi_offset r24, -64
+ .cfi_offset r25, -56
+ .cfi_offset r26, -48
+ .cfi_offset r27, -40
+ .cfi_offset r28, -32
+ .cfi_offset r29, -24
+ .cfi_offset r30, -16
+ .cfi_offset cr2, 8
+ std 26, 208(1)
+ mr 26, 4
+ cmpldi 1, 4, 4
+ andi. 4, 8, 1
+ std 18, 144(1)
+ std 19, 152(1)
+ crmove 8, 1
+ ld 19, 360(1)
+ lwz 18, 352(1)
+ std 24, 192(1)
+ std 25, 200(1)
+ std 27, 216(1)
+ std 28, 224(1)
+ mr 24, 10
+ mr 28, 6
+ mr 27, 5
+ mr 25, 3
+ std 29, 232(1)
+ std 30, 240(1)
+ mr 30, 9
+ mr 29, 7
+ std 17, 136(1)
+ std 20, 160(1)
+ std 21, 168(1)
+ std 22, 176(1)
+ std 23, 184(1)
+ blt 1, .LBB2_3
+ li 3, 0
+ li 4, 1
+ clrldi 23, 30, 32
+ isel 22, 4, 3, 8
+ clrldi 21, 24, 32
+ clrldi 20, 18, 32
+.LBB2_2:
+ mr 3, 25
+ mr 4, 27
+ mr 5, 28
+ mr 6, 29
+ mr 7, 22
+ mr 8, 23
+ mr 9, 21
+ mr 10, 20
+ std 19, 32(1)
+ bl blake3_hash4_sse2
+ addi 26, 26, -4
+ addi 3, 29, 4
+ addi 25, 25, 32
+ addi 19, 19, 128
+ cmpldi 26, 3
+ isel 29, 3, 29, 8
+ bgt 0, .LBB2_2
+.LBB2_3:
+ cmpldi 26, 0
+ beq 0, .LBB2_11
+ li 3, 0
+ li 4, 1
+ or 21, 24, 30
+ li 20, 16
+ addi 24, 1, 96
+ isel 22, 4, 3, 8
+.LBB2_5:
+ lxvd2x 0, 28, 20
+ ld 23, 0(25)
+ mr 17, 27
+ mr 3, 21
+ stxvd2x 0, 24, 20
+ lxvd2x 0, 0, 28
+ stxvd2x 0, 0, 24
+.LBB2_6:
+ cmpldi 17, 1
+ beq 0, .LBB2_8
+ cmpldi 17, 0
+ bne 0, .LBB2_9
+ b .LBB2_10
+.LBB2_8:
+ or 3, 3, 18
+.LBB2_9:
+ clrldi 7, 3, 56
+ mr 3, 24
+ mr 4, 23
+ li 5, 64
+ mr 6, 29
+ bl zfs_blake3_compress_in_place_sse2
+ addi 23, 23, 64
+ addi 17, 17, -1
+ mr 3, 30
+ b .LBB2_6
+.LBB2_10:
+ lxvd2x 0, 24, 20
+ addi 26, 26, -1
+ add 29, 29, 22
+ addi 25, 25, 8
+ cmpldi 26, 0
+ stxvd2x 0, 19, 20
+ lxvd2x 0, 0, 24
+ stxvd2x 0, 0, 19
+ addi 19, 19, 32
+ bne 0, .LBB2_5
+.LBB2_11:
+ ld 30, 240(1)
+ ld 29, 232(1)
+ ld 28, 224(1)
+ ld 27, 216(1)
+ ld 26, 208(1)
+ ld 25, 200(1)
+ ld 24, 192(1)
+ ld 23, 184(1)
+ ld 22, 176(1)
+ ld 21, 168(1)
+ ld 20, 160(1)
+ ld 19, 152(1)
+ ld 18, 144(1)
+ ld 17, 136(1)
+ addi 1, 1, 256
+ ld 0, 16(1)
+ lwz 12, 8(1)
+ mtocrf 32, 12
+ mtlr 0
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse2, .Lfunc_end2-.Lfunc_begin2
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI3_0:
+ .quad 4294967296
+ .quad 12884901890
+.LCPI3_1:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI3_2:
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+.LCPI3_3:
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+.LCPI3_4:
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+.LCPI3_5:
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+ .text
+ .p2align 2
+ .type blake3_hash4_sse2,@function
+blake3_hash4_sse2:
+.Lfunc_begin3:
+ .cfi_startproc
+.Lfunc_gep3:
+ addis 2, 12, .TOC.-.Lfunc_gep3@ha
+ addi 2, 2, .TOC.-.Lfunc_gep3@l
+.Lfunc_lep3:
+ .localentry blake3_hash4_sse2, .Lfunc_lep3-.Lfunc_gep3
+ stdu 1, -400(1)
+ .cfi_def_cfa_offset 400
+ .cfi_offset r22, -152
+ .cfi_offset r23, -144
+ .cfi_offset r24, -136
+ .cfi_offset r25, -128
+ .cfi_offset r26, -120
+ .cfi_offset r27, -112
+ .cfi_offset r28, -104
+ .cfi_offset r29, -96
+ .cfi_offset r30, -88
+ .cfi_offset f23, -72
+ .cfi_offset f24, -64
+ .cfi_offset f25, -56
+ .cfi_offset f26, -48
+ .cfi_offset f27, -40
+ .cfi_offset f28, -32
+ .cfi_offset f29, -24
+ .cfi_offset f30, -16
+ .cfi_offset f31, -8
+ .cfi_offset v20, -352
+ .cfi_offset v21, -336
+ .cfi_offset v22, -320
+ .cfi_offset v23, -304
+ .cfi_offset v24, -288
+ .cfi_offset v25, -272
+ .cfi_offset v26, -256
+ .cfi_offset v27, -240
+ .cfi_offset v28, -224
+ .cfi_offset v29, -208
+ .cfi_offset v30, -192
+ .cfi_offset v31, -176
+ li 11, 48
+ li 0, 8
+ std 30, 312(1)
+ li 30, 12
+ li 12, 4
+ lfiwzx 0, 0, 5
+ stxvd2x 52, 1, 11
+ li 11, 64
+ lfiwzx 2, 5, 0
+ li 0, 20
+ lfiwzx 3, 5, 30
+ stxvd2x 53, 1, 11
+ li 11, 80
+ li 30, 24
+ lfiwzx 4, 5, 0
+ li 0, 28
+ stxvd2x 54, 1, 11
+ li 11, 96
+ lfiwzx 1, 5, 12
+ lfiwzx 6, 5, 30
+ xxspltw 45, 0, 1
+ cmpldi 4, 0
+ std 22, 248(1)
+ stxvd2x 55, 1, 11
+ li 11, 112
+ lfiwzx 7, 5, 0
+ xxspltw 40, 2, 1
+ std 23, 256(1)
+ xxspltw 38, 3, 1
+ xxspltw 50, 4, 1
+ std 24, 264(1)
+ std 25, 272(1)
+ std 26, 280(1)
+ xxspltw 54, 7, 1
+ std 27, 288(1)
+ std 28, 296(1)
+ std 29, 304(1)
+ stxvd2x 56, 1, 11
+ li 11, 128
+ stfd 23, 328(1)
+ stxvd2x 57, 1, 11
+ li 11, 144
+ stfd 24, 336(1)
+ stxvd2x 58, 1, 11
+ li 11, 160
+ stfd 25, 344(1)
+ stxvd2x 59, 1, 11
+ li 11, 176
+ xxspltw 59, 1, 1
+ stxvd2x 60, 1, 11
+ li 11, 192
+ stfd 26, 352(1)
+ stxvd2x 61, 1, 11
+ li 11, 208
+ stfd 27, 360(1)
+ stxvd2x 62, 1, 11
+ li 11, 224
+ xxspltw 62, 6, 1
+ stxvd2x 63, 1, 11
+ li 11, 16
+ stfd 28, 368(1)
+ lfiwzx 5, 5, 11
+ ld 5, 432(1)
+ stfd 29, 376(1)
+ stfd 30, 384(1)
+ stfd 31, 392(1)
+ xxspltw 61, 5, 1
+ beq 0, .LBB3_5
+ addis 30, 2, .LCPI3_0@toc@ha
+ neg 7, 7
+ xxleqv 34, 34, 34
+ addis 28, 2, .LCPI3_2@toc@ha
+ addis 27, 2, .LCPI3_3@toc@ha
+ addis 26, 2, .LCPI3_4@toc@ha
+ addis 25, 2, .LCPI3_5@toc@ha
+ ld 29, 24(3)
+ addi 0, 30, .LCPI3_0@toc@l
+ mtfprwz 1, 7
+ addis 7, 2, .LCPI3_1@toc@ha
+ ld 30, 16(3)
+ lxvd2x 0, 0, 0
+ mtfprwz 2, 6
+ rldicl 6, 6, 32, 32
+ addi 0, 7, .LCPI3_1@toc@l
+ ld 7, 8(3)
+ vslw 2, 2, 2
+ lvx 5, 0, 0
+ addi 0, 28, .LCPI3_2@toc@l
+ addi 28, 27, .LCPI3_3@toc@l
+ addi 27, 26, .LCPI3_4@toc@l
+ addi 26, 25, .LCPI3_5@toc@l
+ or 25, 9, 8
+ li 9, 0
+ xxspltw 36, 2, 1
+ xxswapd 35, 0
+ xxspltw 0, 1, 1
+ xxland 35, 0, 35
+ mtfprwz 0, 6
+ ld 6, 0(3)
+ addi 3, 3, -8
+ vadduwm 4, 3, 4
+ xxlor 35, 35, 34
+ xxlxor 34, 36, 34
+ xxlor 9, 36, 36
+ vspltisw 4, 4
+ vcmpgtsw 2, 3, 2
+ xxspltw 35, 0, 1
+ xxlor 10, 36, 36
+ vsubuwm 2, 3, 2
+ xxlor 11, 34, 34
+ lvx 2, 0, 0
+ li 0, 32
+ xxlor 12, 34, 34
+ lvx 2, 0, 28
+ li 28, 48
+ xxlor 13, 34, 34
+ lvx 2, 0, 27
+ li 27, 0
+ xxlor 31, 34, 34
+ lvx 2, 0, 26
+ xxlor 30, 34, 34
+.LBB3_2:
+ mr 26, 27
+ addi 27, 27, 1
+ xxlor 28, 40, 40
+ cmpld 27, 4
+ sldi 26, 26, 6
+ xxlor 24, 45, 45
+ iseleq 24, 10, 9
+ add 23, 6, 26
+ add 22, 30, 26
+ lxvd2x 0, 6, 26
+ lxvd2x 1, 7, 26
+ or 25, 24, 25
+ add 24, 7, 26
+ lxvd2x 2, 30, 26
+ lxvd2x 3, 29, 26
+ xxlor 29, 38, 38
+ lxvd2x 4, 23, 11
+ lxvd2x 6, 24, 11
+ clrlwi 25, 25, 24
+ lxvd2x 7, 22, 11
+ lxvd2x 8, 23, 0
+ mtfprd 5, 25
+ add 25, 29, 26
+ xxswapd 34, 0
+ lxvd2x 0, 25, 11
+ xxswapd 36, 1
+ xxswapd 33, 2
+ lxvd2x 1, 24, 0
+ lxvd2x 2, 22, 0
+ xxswapd 39, 3
+ xxswapd 32, 4
+ lxvd2x 3, 25, 0
+ lxvd2x 4, 23, 28
+ xxswapd 49, 6
+ xxswapd 51, 7
+ lxvd2x 6, 24, 28
+ xxswapd 58, 8
+ lxvd2x 7, 22, 28
+ lxvd2x 8, 25, 28
+ xxswapd 60, 0
+ mr 25, 3
+ xxswapd 57, 1
+ xxswapd 53, 2
+ xxswapd 52, 3
+ xxswapd 56, 4
+ xxswapd 55, 6
+ xxswapd 0, 5
+ xxswapd 40, 7
+ xxswapd 41, 8
+ mtctr 12
+.LBB3_3:
+ ldu 24, 8(25)
+ add 24, 24, 26
+ addi 24, 24, 256
+ dcbt 0, 24
+ bdnz .LBB3_3
+ vmrgew 3, 4, 2
+ vspltisw 31, 9
+ mr 25, 8
+ vmrglw 10, 4, 2
+ vspltisw 14, 10
+ vmrghw 6, 4, 2
+ xxspltw 0, 0, 3
+ vmrgew 4, 17, 0
+ vmrglw 11, 17, 0
+ vmrghw 16, 17, 0
+ vmrgew 0, 25, 26
+ vmrgew 13, 7, 1
+ vmrglw 2, 7, 1
+ vmrghw 7, 7, 1
+ xxlor 25, 36, 36
+ vmrgew 4, 28, 19
+ xxlor 26, 32, 32
+ vmrglw 0, 25, 26
+ vmrglw 1, 28, 19
+ xxmrgld 47, 34, 42
+ xxlor 44, 28, 28
+ vmrghw 25, 25, 26
+ xxlor 23, 36, 36
+ vmrghw 4, 28, 19
+ vspltisw 19, -16
+ xxlor 5, 32, 32
+ vmrgew 0, 20, 21
+ xxmrgld 34, 33, 43
+ vmrglw 28, 20, 21
+ vmrghw 21, 20, 21
+ vmrglw 20, 23, 24
+ vmrghw 26, 23, 24
+ vmrglw 17, 9, 8
+ xxlor 8, 32, 32
+ vmrgew 0, 23, 24
+ xxmrgld 56, 39, 38
+ vmrgew 23, 9, 8
+ xxlor 33, 24, 24
+ xxlor 2, 34, 34
+ vadduwm 11, 15, 1
+ xxmrgld 33, 36, 48
+ xxlor 6, 47, 47
+ xxlor 27, 32, 32
+ vmrghw 0, 9, 8
+ vspltisw 9, 12
+ vsubuwm 8, 31, 19
+ xxmrgld 51, 23, 25
+ vadduwm 31, 2, 12
+ xxlor 34, 10, 10
+ vadduwm 10, 14, 14
+ vslw 15, 2, 2
+ xxlor 34, 29, 29
+ vadduwm 14, 24, 27
+ xxlor 24, 48, 48
+ vadduwm 16, 1, 2
+ xxmrgld 34, 45, 35
+ vadduwm 31, 31, 30
+ xxmrghd 36, 36, 24
+ vadduwm 11, 11, 29
+ vadduwm 14, 14, 18
+ vadduwm 13, 16, 22
+ xxlxor 47, 63, 47
+ xxlor 1, 9, 9
+ xxlor 1, 11, 11
+ xxlxor 48, 43, 9
+ vadduwm 11, 11, 2
+ xxlor 7, 34, 34
+ xxmrghd 34, 39, 38
+ xxlxor 39, 46, 11
+ xxlor 1, 50, 50
+ xxlxor 50, 45, 0
+ vperm 15, 15, 15, 5
+ vperm 16, 16, 16, 5
+ vperm 7, 7, 7, 5
+ vperm 18, 18, 18, 5
+ xxlor 4, 33, 33
+ xxlor 33, 31, 31
+ vadduwm 14, 14, 2
+ xxlor 3, 34, 34
+ xxlor 34, 12, 12
+ xxlor 35, 13, 13
+ vadduwm 6, 15, 1
+ xxlor 33, 30, 30
+ vadduwm 2, 16, 2
+ vadduwm 3, 7, 3
+ vadduwm 12, 18, 1
+ xxlxor 59, 34, 61
+ xxlxor 61, 35, 1
+ xxlxor 33, 38, 62
+ xxlxor 62, 44, 54
+ vrlw 22, 27, 10
+ vrlw 29, 29, 10
+ vrlw 1, 1, 10
+ vrlw 30, 30, 10
+ vadduwm 31, 31, 19
+ vadduwm 13, 13, 4
+ vadduwm 11, 22, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 1, 31
+ vadduwm 13, 30, 13
+ vadduwm 9, 9, 9
+ xxlor 1, 36, 36
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 39
+ xxmrgld 39, 60, 5
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 28, 4, 9
+ xxmrgld 36, 53, 57
+ vrlw 15, 15, 9
+ xxmrghd 57, 53, 57
+ vrlw 18, 18, 9
+ vadduwm 14, 14, 4
+ xxlor 0, 36, 36
+ xxmrgld 36, 49, 52
+ vadduwm 2, 16, 2
+ xxmrgld 49, 8, 26
+ vadduwm 3, 28, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 54, 34, 54
+ xxlxor 61, 35, 61
+ xxlxor 33, 38, 33
+ xxlxor 62, 44, 62
+ vrlw 29, 29, 8
+ vrlw 20, 1, 8
+ xxmrgld 33, 55, 27
+ vrlw 30, 30, 8
+ vrlw 22, 22, 8
+ vadduwm 11, 11, 7
+ xxlor 5, 39, 39
+ xxmrgld 39, 32, 58
+ vadduwm 31, 31, 4
+ vadduwm 11, 29, 11
+ vadduwm 13, 13, 7
+ vadduwm 14, 20, 14
+ vadduwm 31, 30, 31
+ vadduwm 13, 22, 13
+ xxlor 28, 36, 36
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 60
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vadduwm 11, 11, 17
+ vmr 28, 17
+ xxmrghd 49, 32, 58
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 21, 4, 2
+ vadduwm 3, 15, 3
+ xxlxor 34, 38, 61
+ xxlxor 61, 44, 52
+ xxlxor 62, 53, 62
+ xxlxor 54, 35, 54
+ vrlw 20, 2, 10
+ vrlw 29, 29, 10
+ vrlw 0, 30, 10
+ vrlw 30, 22, 10
+ vadduwm 14, 14, 25
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 17
+ vadduwm 11, 20, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 11, 11, 24
+ xxlor 8, 56, 56
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 21
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 52
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ xxlor 25, 51, 51
+ vmr 26, 17
+ xxlor 49, 3, 3
+ xxlor 52, 1, 1
+ xxlor 51, 2, 2
+ vadduwm 14, 14, 17
+ vadduwm 31, 31, 20
+ vadduwm 13, 13, 19
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 29, 39, 39
+ xxlor 59, 4, 4
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 53, 0, 0
+ xxlor 39, 6, 6
+ vadduwm 11, 11, 27
+ vadduwm 14, 14, 21
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 1
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 34, 7, 7
+ vadduwm 31, 31, 28
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 11, 11, 2
+ xxlor 34, 28, 28
+ vadduwm 13, 13, 26
+ vadduwm 14, 14, 2
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 2, 58, 58
+ xxlor 39, 25, 25
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ xxlor 54, 29, 29
+ xxlor 58, 5, 5
+ vadduwm 11, 11, 25
+ vadduwm 14, 14, 7
+ vadduwm 31, 31, 22
+ vadduwm 13, 13, 26
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 21
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 20
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 0, 33, 33
+ xxlor 33, 8, 8
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vadduwm 11, 11, 19
+ vadduwm 14, 14, 2
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 22
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ vadduwm 11, 11, 27
+ vadduwm 14, 14, 28
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 31, 31, 25
+ vadduwm 13, 13, 26
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 3, 7, 7
+ vadduwm 11, 11, 7
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ xxlor 33, 6, 6
+ xxlor 58, 2, 2
+ xxlor 39, 3, 3
+ vadduwm 14, 14, 1
+ vadduwm 31, 31, 26
+ vadduwm 13, 13, 7
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ xxlor 52, 0, 0
+ vadduwm 11, 11, 21
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vadduwm 14, 14, 2
+ vadduwm 31, 31, 22
+ vadduwm 13, 13, 20
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 7, 49, 49
+ vmr 17, 2
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 54, 1, 1
+ xxlor 34, 7, 7
+ vadduwm 11, 11, 22
+ vadduwm 14, 14, 28
+ vadduwm 31, 31, 2
+ vadduwm 13, 13, 26
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 59, 25, 25
+ vadduwm 11, 11, 19
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 14, 14, 25
+ vadduwm 31, 31, 27
+ vadduwm 13, 13, 7
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vmr 2, 19
+ xxlor 0, 7, 7
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ xxlor 1, 51, 51
+ xxlor 7, 39, 39
+ xxlor 51, 8, 8
+ xxlor 39, 5, 5
+ xxlor 34, 4, 4
+ vadduwm 11, 11, 1
+ vadduwm 14, 14, 19
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 2
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ xxlor 2, 53, 53
+ vmr 21, 28
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ xxlor 53, 29, 29
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 28
+ vadduwm 31, 31, 26
+ vadduwm 13, 13, 21
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ vadduwm 11, 11, 20
+ xxlor 5, 52, 52
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 52, 2, 2
+ vadduwm 14, 14, 25
+ vadduwm 31, 31, 20
+ vadduwm 13, 13, 7
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ vadduwm 11, 11, 22
+ vadduwm 14, 14, 27
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 2
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 3, 29, 29
+ xxlor 4, 49, 49
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ vmr 17, 28
+ xxlor 2, 54, 54
+ xxlor 3, 34, 34
+ xxlor 34, 8, 8
+ xxlor 51, 0, 0
+ xxlor 60, 7, 7
+ xxlor 54, 1, 1
+ vadduwm 11, 11, 2
+ vadduwm 14, 14, 19
+ vadduwm 31, 31, 28
+ vadduwm 13, 13, 22
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 25
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 26
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 6, 39, 39
+ xxlor 39, 4, 4
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vadduwm 11, 11, 21
+ vadduwm 14, 14, 27
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 28
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 0, 49, 49
+ xxlor 49, 5, 5
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 1
+ vadduwm 31, 31, 2
+ vadduwm 13, 13, 22
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 34, 3, 3
+ xxlor 49, 2, 2
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ vadduwm 11, 11, 19
+ vadduwm 14, 14, 20
+ vadduwm 31, 31, 2
+ vadduwm 13, 13, 17
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 14, 14, 27
+ vadduwm 11, 11, 25
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 27, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 57, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 59, 32
+ xxlor 39, 7, 7
+ vrlw 30, 30, 8
+ vrlw 25, 25, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ xxlor 1, 58, 58
+ vmr 26, 19
+ vadduwm 19, 31, 7
+ xxlor 39, 6, 6
+ vadduwm 11, 30, 11
+ vadduwm 7, 13, 7
+ vadduwm 13, 25, 14
+ vadduwm 14, 29, 19
+ vadduwm 7, 0, 7
+ xxlxor 48, 43, 48
+ xxlxor 36, 45, 36
+ xxlxor 47, 46, 47
+ xxlxor 50, 39, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 51, 1, 1
+ vadduwm 13, 13, 1
+ vadduwm 11, 11, 19
+ vadduwm 19, 16, 27
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 63, 51, 62
+ xxlxor 62, 35, 57
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 31, 31, 10
+ vrlw 30, 30, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 33, 0, 0
+ vadduwm 7, 7, 2
+ vadduwm 14, 14, 1
+ vadduwm 11, 31, 11
+ vadduwm 13, 30, 13
+ vadduwm 14, 29, 14
+ vadduwm 7, 0, 7
+ xxlxor 48, 43, 48
+ xxlxor 36, 45, 36
+ xxlxor 47, 46, 47
+ xxlxor 50, 39, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 60, 8, 8
+ vadduwm 1, 11, 21
+ vadduwm 11, 13, 28
+ vadduwm 13, 16, 19
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 51, 45, 63
+ xxlxor 63, 35, 62
+ xxlxor 62, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 31, 31, 8
+ vrlw 30, 30, 8
+ vrlw 0, 0, 8
+ vrlw 19, 19, 8
+ vadduwm 14, 14, 26
+ vadduwm 7, 7, 17
+ vadduwm 1, 31, 1
+ vadduwm 11, 30, 11
+ vadduwm 14, 0, 14
+ vadduwm 7, 19, 7
+ xxlxor 50, 33, 50
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 39, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 34, 4, 4
+ vadduwm 14, 14, 22
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 13, 4, 13
+ vadduwm 3, 15, 3
+ xxlxor 49, 38, 63
+ xxlxor 63, 44, 62
+ xxlxor 32, 45, 32
+ xxlxor 51, 35, 51
+ vrlw 17, 17, 10
+ vrlw 31, 31, 10
+ vrlw 0, 0, 10
+ vrlw 10, 19, 10
+ vadduwm 11, 11, 2
+ xxlor 34, 5, 5
+ vadduwm 1, 1, 20
+ vadduwm 2, 7, 2
+ vadduwm 7, 31, 11
+ vadduwm 11, 0, 14
+ vadduwm 2, 10, 2
+ vadduwm 1, 17, 1
+ xxlxor 36, 43, 36
+ xxlxor 46, 34, 47
+ vrlw 4, 4, 9
+ vrlw 14, 14, 9
+ xxlxor 47, 33, 50
+ xxlxor 48, 39, 48
+ vrlw 15, 15, 9
+ vrlw 9, 16, 9
+ vadduwm 13, 4, 13
+ vadduwm 3, 14, 3
+ xxlxor 32, 45, 32
+ xxlxor 45, 45, 33
+ xxlxor 33, 35, 42
+ xxlxor 59, 35, 39
+ vadduwm 3, 15, 6
+ vadduwm 6, 9, 12
+ xxlxor 39, 35, 49
+ xxlxor 42, 38, 63
+ vrlw 1, 1, 8
+ vrlw 7, 7, 8
+ vrlw 10, 10, 8
+ vrlw 0, 0, 8
+ xxlxor 40, 35, 43
+ xxlxor 38, 38, 34
+ xxlxor 61, 33, 41
+ xxlxor 50, 39, 36
+ xxlxor 62, 42, 46
+ xxlxor 54, 32, 47
+ bne 0, .LBB3_2
+.LBB3_5:
+ vmrglw 2, 27, 13
+ li 3, 32
+ li 4, 48
+ vmrglw 4, 6, 8
+ vmrglw 0, 18, 29
+ vmrglw 1, 22, 30
+ vmrghw 3, 27, 13
+ vmrghw 5, 6, 8
+ vmrghw 6, 18, 29
+ vmrghw 7, 22, 30
+ xxmrgld 40, 36, 34
+ xxmrghd 34, 36, 34
+ xxmrgld 41, 33, 32
+ xxswapd 0, 40
+ xxmrgld 36, 37, 35
+ xxmrghd 35, 37, 35
+ xxmrghd 37, 33, 32
+ xxswapd 1, 41
+ xxmrgld 32, 39, 38
+ xxmrghd 33, 39, 38
+ xxswapd 2, 34
+ xxswapd 4, 36
+ xxswapd 3, 37
+ stxvd2x 0, 0, 5
+ xxswapd 5, 32
+ stxvd2x 1, 5, 11
+ xxswapd 0, 35
+ xxswapd 1, 33
+ stxvd2x 2, 5, 3
+ li 3, 64
+ stxvd2x 3, 5, 4
+ li 4, 80
+ stxvd2x 4, 5, 3
+ li 3, 96
+ stxvd2x 5, 5, 4
+ li 4, 112
+ stxvd2x 0, 5, 3
+ stxvd2x 1, 5, 4
+ li 3, 224
+ lxvd2x 63, 1, 3
+ li 3, 208
+ lfd 31, 392(1)
+ ld 30, 312(1)
+ ld 29, 304(1)
+ lxvd2x 62, 1, 3
+ li 3, 192
+ lfd 30, 384(1)
+ ld 28, 296(1)
+ ld 27, 288(1)
+ lxvd2x 61, 1, 3
+ li 3, 176
+ lfd 29, 376(1)
+ ld 26, 280(1)
+ ld 25, 272(1)
+ lxvd2x 60, 1, 3
+ li 3, 160
+ lfd 28, 368(1)
+ ld 24, 264(1)
+ ld 23, 256(1)
+ lxvd2x 59, 1, 3
+ li 3, 144
+ lfd 27, 360(1)
+ ld 22, 248(1)
+ lxvd2x 58, 1, 3
+ li 3, 128
+ lfd 26, 352(1)
+ lxvd2x 57, 1, 3
+ li 3, 112
+ lfd 25, 344(1)
+ lxvd2x 56, 1, 3
+ li 3, 96
+ lfd 24, 336(1)
+ lxvd2x 55, 1, 3
+ li 3, 80
+ lfd 23, 328(1)
+ lxvd2x 54, 1, 3
+ li 3, 64
+ lxvd2x 53, 1, 3
+ li 3, 48
+ lxvd2x 52, 1, 3
+ addi 1, 1, 400
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end3:
+ .size blake3_hash4_sse2, .Lfunc_end3-.Lfunc_begin3
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S b/sys/contrib/openzfs/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
new file mode 100644
index 000000000000..a8b2627f12b0
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
@@ -0,0 +1,3064 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ *
+ * This is converted assembly: SSE4.1 -> POWER8 PPC64 Little Endian
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+ .text
+ .abiversion 2
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI0_0:
+ .byte 31
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 30
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 29
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 28
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_1:
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI0_2:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI0_3:
+ .long 1779033703
+ .long 3144134277
+ .long 1013904242
+ .long 2773480762
+.LCPI0_4:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_5:
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI0_6:
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+.LCPI0_7:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_8:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_9:
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_10:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+.LCPI0_11:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_12:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_13:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI0_14:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .text
+ .globl zfs_blake3_compress_in_place_sse41
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse41,@function
+zfs_blake3_compress_in_place_sse41:
+.Lfunc_begin0:
+ .cfi_startproc
+.Lfunc_gep0:
+ addis 2, 12, .TOC.-.Lfunc_gep0@ha
+ addi 2, 2, .TOC.-.Lfunc_gep0@l
+.Lfunc_lep0:
+ .localentry zfs_blake3_compress_in_place_sse41, .Lfunc_lep0-.Lfunc_gep0
+ li 8, -64
+ mtvsrd 34, 5
+ li 5, 16
+ lfdx 0, 0, 4
+ vspltisw 13, -16
+ stxvd2x 60, 1, 8
+ li 8, -48
+ mtvsrd 35, 7
+ lfd 2, 16(4)
+ lfd 3, 24(4)
+ addis 7, 2, .LCPI0_0@toc@ha
+ stxvd2x 61, 1, 8
+ li 8, -32
+ mtvsrwz 36, 6
+ rldicl 6, 6, 32, 32
+ stxvd2x 62, 1, 8
+ li 8, -16
+ vmrghb 2, 3, 2
+ stxvd2x 63, 1, 8
+ mtvsrwz 35, 6
+ addi 6, 7, .LCPI0_0@toc@l
+ addis 7, 2, .LCPI0_2@toc@ha
+ lfd 1, 8(4)
+ xxmrghd 32, 3, 2
+ lvx 6, 0, 6
+ xxlxor 33, 33, 33
+ addis 6, 2, .LCPI0_1@toc@ha
+ addi 7, 7, .LCPI0_2@toc@l
+ vmrghw 3, 3, 4
+ addi 6, 6, .LCPI0_1@toc@l
+ vspltisw 14, 9
+ xxmrghd 37, 1, 0
+ lxvd2x 0, 0, 3
+ lxvd2x 1, 3, 5
+ vperm 2, 1, 2, 6
+ vpkudum 9, 0, 5
+ xxswapd 36, 0
+ xxswapd 38, 1
+ xxmrgld 34, 34, 35
+ lvx 3, 0, 7
+ addis 7, 2, .LCPI0_4@toc@ha
+ addi 7, 7, .LCPI0_4@toc@l
+ vadduwm 4, 9, 4
+ lvx 11, 0, 7
+ addis 7, 2, .LCPI0_6@toc@ha
+ addi 7, 7, .LCPI0_6@toc@l
+ vadduwm 7, 4, 6
+ lvx 4, 0, 6
+ addis 6, 2, .LCPI0_3@toc@ha
+ addi 6, 6, .LCPI0_3@toc@l
+ vperm 11, 0, 5, 11
+ lvx 0, 0, 7
+ li 7, 48
+ xxlxor 40, 39, 34
+ lvx 10, 0, 6
+ addis 6, 2, .LCPI0_5@toc@ha
+ lxvd2x 1, 4, 7
+ vcmpgtsb 2, 1, 4
+ addi 6, 6, .LCPI0_5@toc@l
+ vperm 4, 8, 8, 3
+ vspltisw 8, 10
+ xxlandc 44, 36, 34
+ vadduwm 4, 8, 8
+ vadduwm 8, 12, 10
+ xxlxor 37, 40, 38
+ vrlw 6, 5, 4
+ vadduwm 5, 7, 11
+ vadduwm 7, 6, 5
+ lvx 5, 0, 6
+ li 6, 32
+ lxvd2x 0, 4, 6
+ addis 4, 2, .LCPI0_7@toc@ha
+ addis 6, 2, .LCPI0_9@toc@ha
+ xxlxor 42, 39, 44
+ xxswapd 44, 1
+ addi 4, 4, .LCPI0_7@toc@l
+ addi 6, 6, .LCPI0_9@toc@l
+ vcmpgtsb 5, 1, 5
+ vperm 1, 10, 10, 0
+ xxswapd 42, 0
+ vpkudum 16, 12, 10
+ xxlandc 47, 33, 37
+ vsubuwm 1, 14, 13
+ lvx 14, 0, 4
+ addis 4, 2, .LCPI0_8@toc@ha
+ vadduwm 8, 15, 8
+ xxswapd 45, 47
+ addi 4, 4, .LCPI0_8@toc@l
+ vadduwm 7, 7, 16
+ xxsldwi 48, 48, 48, 1
+ xxlxor 38, 40, 38
+ xxsldwi 40, 40, 40, 3
+ xxsldwi 39, 39, 39, 1
+ vperm 14, 10, 12, 14
+ vrlw 6, 6, 1
+ vadduwm 7, 6, 7
+ xxlxor 45, 39, 45
+ vperm 13, 13, 13, 3
+ xxlandc 45, 45, 34
+ vadduwm 8, 13, 8
+ xxlxor 38, 40, 38
+ vrlw 10, 6, 4
+ vadduwm 6, 7, 14
+ vadduwm 7, 10, 6
+ xxlxor 38, 39, 45
+ vperm 12, 6, 6, 0
+ lvx 6, 0, 4
+ addis 4, 2, .LCPI0_10@toc@ha
+ addi 4, 4, .LCPI0_10@toc@l
+ vperm 13, 11, 9, 6
+ xxlandc 44, 44, 37
+ vadduwm 15, 12, 8
+ vadduwm 7, 7, 13
+ xxsldwi 45, 45, 45, 3
+ xxlxor 40, 47, 42
+ xxsldwi 47, 47, 47, 1
+ xxsldwi 39, 39, 39, 3
+ vrlw 10, 8, 1
+ xxswapd 40, 44
+ vadduwm 17, 10, 7
+ lvx 7, 0, 4
+ addis 4, 2, .LCPI0_11@toc@ha
+ addi 4, 4, .LCPI0_11@toc@l
+ xxlxor 44, 49, 40
+ lvx 8, 0, 6
+ vperm 18, 9, 9, 7
+ lvx 9, 0, 4
+ addis 4, 2, .LCPI0_12@toc@ha
+ vperm 12, 12, 12, 3
+ addi 4, 4, .LCPI0_12@toc@l
+ vperm 19, 14, 16, 8
+ xxlandc 63, 44, 34
+ vperm 12, 19, 18, 9
+ vadduwm 15, 31, 15
+ xxlxor 42, 47, 42
+ vrlw 18, 10, 4
+ vadduwm 10, 17, 12
+ vadduwm 17, 18, 10
+ xxlxor 42, 49, 63
+ xxmrgld 63, 43, 46
+ xxsldwi 49, 49, 49, 1
+ vmrghw 14, 14, 11
+ vperm 19, 10, 10, 0
+ lvx 10, 0, 4
+ addis 4, 2, .LCPI0_13@toc@ha
+ addi 4, 4, .LCPI0_13@toc@l
+ lvx 11, 0, 4
+ addis 4, 2, .LCPI0_14@toc@ha
+ vperm 31, 16, 31, 10
+ addi 4, 4, .LCPI0_14@toc@l
+ vperm 14, 14, 16, 11
+ xxlandc 51, 51, 37
+ vadduwm 15, 19, 15
+ xxswapd 51, 51
+ vadduwm 17, 17, 31
+ xxlxor 50, 47, 50
+ xxsldwi 47, 47, 47, 3
+ vperm 30, 14, 31, 8
+ vrlw 18, 18, 1
+ vadduwm 17, 18, 17
+ xxlxor 51, 49, 51
+ vadduwm 17, 17, 14
+ vperm 19, 19, 19, 3
+ xxlandc 51, 51, 34
+ vadduwm 15, 19, 15
+ xxlxor 48, 47, 50
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 51
+ vperm 19, 12, 13, 6
+ vperm 18, 18, 18, 0
+ vperm 13, 13, 13, 7
+ vadduwm 17, 17, 19
+ xxlandc 50, 50, 37
+ xxsldwi 49, 49, 49, 3
+ vperm 13, 30, 13, 9
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxmrgld 62, 44, 46
+ vmrghw 12, 14, 12
+ xxlxor 48, 47, 48
+ xxsldwi 47, 47, 47, 1
+ vrlw 16, 16, 1
+ vperm 30, 31, 30, 10
+ vperm 12, 12, 31, 11
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 13
+ vperm 18, 18, 18, 3
+ vperm 31, 12, 30, 8
+ xxlandc 50, 50, 34
+ vadduwm 15, 18, 15
+ xxlxor 48, 47, 48
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ xxsldwi 49, 49, 49, 1
+ vperm 18, 18, 18, 0
+ vadduwm 17, 17, 30
+ xxlandc 50, 50, 37
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxlxor 48, 47, 48
+ xxsldwi 46, 47, 47, 3
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 12
+ vperm 18, 18, 18, 3
+ xxlandc 47, 50, 34
+ xxsldwi 50, 51, 51, 3
+ vadduwm 14, 15, 14
+ vperm 19, 13, 18, 6
+ xxlxor 48, 46, 48
+ vperm 18, 18, 18, 7
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vadduwm 17, 17, 19
+ vperm 15, 15, 15, 0
+ xxsldwi 49, 49, 49, 3
+ xxlandc 47, 47, 37
+ vadduwm 14, 15, 14
+ xxswapd 47, 47
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 29, 15, 14
+ vperm 14, 31, 18, 9
+ xxmrgld 50, 45, 44
+ xxlxor 48, 61, 48
+ vmrghw 12, 12, 13
+ vrlw 16, 16, 4
+ vperm 18, 30, 18, 10
+ vadduwm 17, 17, 14
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ xxsldwi 49, 49, 49, 1
+ vperm 15, 15, 15, 0
+ vadduwm 17, 17, 18
+ xxlandc 47, 47, 37
+ vadduwm 31, 15, 29
+ xxswapd 47, 47
+ xxlxor 48, 63, 48
+ xxsldwi 45, 63, 63, 3
+ vperm 31, 12, 30, 11
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 13, 15, 13
+ xxlxor 44, 45, 48
+ vadduwm 16, 17, 31
+ xxsldwi 49, 51, 51, 3
+ vrlw 12, 12, 4
+ vperm 19, 14, 17, 6
+ vadduwm 16, 12, 16
+ xxlxor 47, 48, 47
+ vperm 15, 15, 15, 0
+ xxlandc 47, 47, 37
+ vadduwm 13, 15, 13
+ xxswapd 47, 47
+ xxlxor 44, 45, 44
+ xxsldwi 45, 45, 45, 1
+ vrlw 30, 12, 1
+ vadduwm 12, 16, 19
+ xxsldwi 44, 44, 44, 3
+ vadduwm 16, 30, 12
+ xxlxor 44, 48, 47
+ vperm 15, 17, 17, 7
+ vperm 12, 12, 12, 3
+ vperm 17, 31, 18, 8
+ xxlandc 61, 44, 34
+ vperm 12, 17, 15, 9
+ vadduwm 13, 29, 13
+ xxlxor 47, 45, 62
+ xxmrgld 62, 46, 63
+ vmrghw 14, 31, 14
+ vrlw 15, 15, 4
+ vadduwm 16, 16, 12
+ vperm 30, 18, 30, 10
+ vperm 14, 14, 18, 11
+ xxsldwi 50, 51, 51, 3
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 61
+ xxsldwi 48, 48, 48, 1
+ vperm 19, 12, 18, 6
+ vperm 17, 17, 17, 0
+ vadduwm 16, 16, 30
+ xxmrgld 60, 44, 46
+ vmrghw 12, 14, 12
+ vperm 28, 30, 28, 10
+ xxlandc 49, 49, 37
+ vadduwm 13, 17, 13
+ xxswapd 49, 49
+ vperm 12, 12, 30, 11
+ xxlxor 47, 45, 47
+ xxsldwi 45, 45, 45, 3
+ vrlw 15, 15, 1
+ vperm 8, 12, 28, 8
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vadduwm 16, 16, 14
+ vperm 17, 17, 17, 3
+ xxlandc 49, 49, 34
+ vadduwm 13, 17, 13
+ xxlxor 47, 45, 47
+ vrlw 15, 15, 4
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vperm 17, 17, 17, 0
+ xxlandc 49, 49, 37
+ vadduwm 31, 17, 13
+ xxlxor 45, 63, 47
+ vrlw 15, 13, 1
+ vadduwm 13, 16, 19
+ xxswapd 48, 49
+ xxsldwi 51, 51, 51, 3
+ xxsldwi 45, 45, 45, 3
+ vadduwm 17, 15, 13
+ xxlxor 45, 49, 48
+ lvx 16, 0, 4
+ vperm 29, 13, 13, 3
+ vperm 13, 18, 18, 7
+ xxsldwi 50, 63, 63, 1
+ vperm 16, 14, 30, 16
+ vperm 7, 19, 19, 7
+ xxlandc 63, 61, 34
+ vadduwm 18, 31, 18
+ vperm 29, 16, 13, 9
+ xxlxor 47, 50, 47
+ vperm 6, 16, 19, 6
+ vrlw 15, 15, 4
+ vperm 7, 8, 7, 9
+ vadduwm 17, 17, 29
+ xxmrgld 41, 61, 44
+ vadduwm 17, 15, 17
+ vperm 9, 28, 9, 10
+ xxlxor 63, 49, 63
+ xxsldwi 49, 49, 49, 1
+ vperm 31, 31, 31, 0
+ vadduwm 17, 17, 28
+ xxlandc 63, 63, 37
+ vadduwm 18, 31, 18
+ xxswapd 63, 63
+ xxlxor 47, 50, 47
+ xxsldwi 46, 50, 50, 3
+ vrlw 15, 15, 1
+ vadduwm 17, 15, 17
+ xxlxor 63, 49, 63
+ vadduwm 17, 17, 12
+ vperm 31, 31, 31, 3
+ xxlandc 50, 63, 34
+ vadduwm 14, 18, 14
+ xxlxor 47, 46, 47
+ vrlw 15, 15, 4
+ vadduwm 17, 15, 17
+ xxlxor 50, 49, 50
+ vadduwm 6, 17, 6
+ vperm 18, 18, 18, 0
+ xxsldwi 38, 38, 38, 3
+ xxlandc 50, 50, 37
+ vadduwm 14, 18, 14
+ xxswapd 48, 50
+ xxlxor 47, 46, 47
+ xxsldwi 46, 46, 46, 1
+ vrlw 15, 15, 1
+ vadduwm 6, 15, 6
+ xxlxor 48, 38, 48
+ vadduwm 6, 6, 7
+ vperm 16, 16, 16, 3
+ xxlandc 48, 48, 34
+ vadduwm 14, 16, 14
+ xxlxor 40, 46, 47
+ vrlw 8, 8, 4
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 48
+ xxsldwi 38, 38, 38, 1
+ vperm 7, 7, 7, 0
+ vadduwm 6, 6, 9
+ xxlandc 39, 39, 37
+ vadduwm 14, 7, 14
+ xxswapd 39, 39
+ xxlxor 40, 46, 40
+ xxsldwi 41, 46, 46, 3
+ vrlw 8, 8, 1
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 39
+ vperm 3, 7, 7, 3
+ vmrghw 7, 12, 13
+ xxlandc 34, 35, 34
+ vperm 7, 7, 28, 11
+ vadduwm 3, 2, 9
+ xxlxor 40, 35, 40
+ vrlw 4, 8, 4
+ vadduwm 6, 6, 7
+ vadduwm 6, 4, 6
+ xxlxor 34, 38, 34
+ xxsldwi 0, 38, 38, 3
+ vperm 2, 2, 2, 0
+ xxlandc 34, 34, 37
+ vadduwm 3, 2, 3
+ xxswapd 34, 34
+ xxlxor 36, 35, 36
+ xxsldwi 1, 35, 35, 1
+ vrlw 4, 4, 1
+ xxlxor 0, 1, 0
+ xxswapd 0, 0
+ xxlxor 1, 36, 34
+ stxvd2x 0, 0, 3
+ xxswapd 1, 1
+ stxvd2x 1, 3, 5
+ li 3, -16
+ lxvd2x 63, 1, 3
+ li 3, -32
+ lxvd2x 62, 1, 3
+ li 3, -48
+ lxvd2x 61, 1, 3
+ li 3, -64
+ lxvd2x 60, 1, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-.Lfunc_begin0
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI1_0:
+ .byte 31
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 30
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 29
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 28
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_1:
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI1_2:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI1_3:
+ .long 1779033703
+ .long 3144134277
+ .long 1013904242
+ .long 2773480762
+.LCPI1_4:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_5:
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI1_6:
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+.LCPI1_7:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_8:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_9:
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_10:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+.LCPI1_11:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_12:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_13:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI1_14:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .text
+ .globl zfs_blake3_compress_xof_sse41
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse41,@function
+zfs_blake3_compress_xof_sse41:
+.Lfunc_begin1:
+ .cfi_startproc
+.Lfunc_gep1:
+ addis 2, 12, .TOC.-.Lfunc_gep1@ha
+ addi 2, 2, .TOC.-.Lfunc_gep1@l
+.Lfunc_lep1:
+ .localentry zfs_blake3_compress_xof_sse41, .Lfunc_lep1-.Lfunc_gep1
+ li 9, -64
+ mtvsrd 34, 5
+ li 5, 16
+ lfdx 0, 0, 4
+ vspltisw 13, -16
+ addis 11, 2, .LCPI1_9@toc@ha
+ stxvd2x 60, 1, 9
+ li 9, -48
+ mtvsrd 35, 7
+ lfd 1, 8(4)
+ lfd 2, 16(4)
+ addis 7, 2, .LCPI1_0@toc@ha
+ stxvd2x 61, 1, 9
+ li 9, -32
+ mtvsrwz 36, 6
+ rldicl 6, 6, 32, 32
+ stxvd2x 62, 1, 9
+ li 9, -16
+ vmrghb 2, 3, 2
+ stxvd2x 63, 1, 9
+ mtvsrwz 35, 6
+ addi 6, 7, .LCPI1_0@toc@l
+ addis 7, 2, .LCPI1_2@toc@ha
+ lfd 3, 24(4)
+ xxmrghd 37, 1, 0
+ lvx 6, 0, 6
+ xxlxor 33, 33, 33
+ lxvd2x 0, 0, 3
+ addis 6, 2, .LCPI1_1@toc@ha
+ addi 7, 7, .LCPI1_2@toc@l
+ vmrghw 3, 3, 4
+ lxvd2x 1, 3, 5
+ addi 6, 6, .LCPI1_1@toc@l
+ vspltisw 14, 9
+ xxmrghd 32, 3, 2
+ xxswapd 36, 0
+ vperm 2, 1, 2, 6
+ xxswapd 38, 1
+ vpkudum 9, 0, 5
+ xxmrgld 34, 34, 35
+ lvx 3, 0, 7
+ addis 7, 2, .LCPI1_4@toc@ha
+ addi 7, 7, .LCPI1_4@toc@l
+ vadduwm 4, 9, 4
+ lvx 11, 0, 7
+ addis 7, 2, .LCPI1_6@toc@ha
+ addi 7, 7, .LCPI1_6@toc@l
+ vadduwm 7, 4, 6
+ lvx 4, 0, 6
+ addis 6, 2, .LCPI1_3@toc@ha
+ addi 6, 6, .LCPI1_3@toc@l
+ vperm 11, 0, 5, 11
+ lvx 0, 0, 7
+ li 7, 32
+ xxlxor 40, 39, 34
+ lvx 10, 0, 6
+ addis 6, 2, .LCPI1_5@toc@ha
+ lxvd2x 0, 4, 7
+ vcmpgtsb 2, 1, 4
+ addi 6, 6, .LCPI1_5@toc@l
+ vperm 4, 8, 8, 3
+ vspltisw 8, 10
+ xxlandc 44, 36, 34
+ vadduwm 4, 8, 8
+ vadduwm 8, 12, 10
+ xxlxor 37, 40, 38
+ vrlw 6, 5, 4
+ vadduwm 5, 7, 11
+ vadduwm 7, 6, 5
+ lvx 5, 0, 6
+ li 6, 48
+ lxvd2x 1, 4, 6
+ addis 4, 2, .LCPI1_7@toc@ha
+ xxlxor 42, 39, 44
+ addi 4, 4, .LCPI1_7@toc@l
+ vcmpgtsb 5, 1, 5
+ vperm 1, 10, 10, 0
+ xxswapd 42, 0
+ xxswapd 44, 1
+ vpkudum 16, 12, 10
+ xxlandc 47, 33, 37
+ vsubuwm 1, 14, 13
+ lvx 14, 0, 4
+ addis 4, 2, .LCPI1_8@toc@ha
+ vadduwm 8, 15, 8
+ xxswapd 45, 47
+ addi 4, 4, .LCPI1_8@toc@l
+ xxlxor 38, 40, 38
+ xxsldwi 40, 40, 40, 3
+ vadduwm 7, 7, 16
+ xxsldwi 48, 48, 48, 1
+ vrlw 6, 6, 1
+ xxsldwi 39, 39, 39, 1
+ vperm 14, 10, 12, 14
+ vadduwm 7, 6, 7
+ xxlxor 45, 39, 45
+ vperm 13, 13, 13, 3
+ xxlandc 45, 45, 34
+ vadduwm 8, 13, 8
+ xxlxor 38, 40, 38
+ vrlw 10, 6, 4
+ vadduwm 6, 7, 14
+ vadduwm 7, 10, 6
+ xxlxor 38, 39, 45
+ vperm 12, 6, 6, 0
+ lvx 6, 0, 4
+ addis 4, 2, .LCPI1_10@toc@ha
+ addi 4, 4, .LCPI1_10@toc@l
+ vperm 13, 11, 9, 6
+ xxlandc 44, 44, 37
+ vadduwm 15, 12, 8
+ vadduwm 7, 7, 13
+ xxsldwi 45, 45, 45, 3
+ xxlxor 40, 47, 42
+ xxsldwi 47, 47, 47, 1
+ xxsldwi 39, 39, 39, 3
+ vrlw 10, 8, 1
+ xxswapd 40, 44
+ vadduwm 17, 10, 7
+ lvx 7, 0, 4
+ addi 4, 11, .LCPI1_9@toc@l
+ xxlxor 44, 49, 40
+ lvx 8, 0, 4
+ addis 4, 2, .LCPI1_11@toc@ha
+ vperm 18, 9, 9, 7
+ addi 4, 4, .LCPI1_11@toc@l
+ vperm 12, 12, 12, 3
+ lvx 9, 0, 4
+ addis 4, 2, .LCPI1_12@toc@ha
+ vperm 19, 14, 16, 8
+ addi 4, 4, .LCPI1_12@toc@l
+ xxlandc 63, 44, 34
+ vperm 12, 19, 18, 9
+ vadduwm 15, 31, 15
+ xxlxor 42, 47, 42
+ vrlw 18, 10, 4
+ vadduwm 10, 17, 12
+ vadduwm 17, 18, 10
+ xxlxor 42, 49, 63
+ xxmrgld 63, 43, 46
+ xxsldwi 49, 49, 49, 1
+ vmrghw 14, 14, 11
+ vperm 19, 10, 10, 0
+ lvx 10, 0, 4
+ addis 4, 2, .LCPI1_13@toc@ha
+ addi 4, 4, .LCPI1_13@toc@l
+ lvx 11, 0, 4
+ addis 4, 2, .LCPI1_14@toc@ha
+ vperm 31, 16, 31, 10
+ addi 4, 4, .LCPI1_14@toc@l
+ vperm 14, 14, 16, 11
+ xxlandc 51, 51, 37
+ vadduwm 15, 19, 15
+ xxswapd 51, 51
+ vadduwm 17, 17, 31
+ xxlxor 50, 47, 50
+ xxsldwi 47, 47, 47, 3
+ vperm 30, 14, 31, 8
+ vrlw 18, 18, 1
+ vadduwm 17, 18, 17
+ xxlxor 51, 49, 51
+ vadduwm 17, 17, 14
+ vperm 19, 19, 19, 3
+ xxlandc 51, 51, 34
+ vadduwm 15, 19, 15
+ xxlxor 48, 47, 50
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 51
+ vperm 19, 12, 13, 6
+ vperm 18, 18, 18, 0
+ vperm 13, 13, 13, 7
+ vadduwm 17, 17, 19
+ xxlandc 50, 50, 37
+ xxsldwi 49, 49, 49, 3
+ vperm 13, 30, 13, 9
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxmrgld 62, 44, 46
+ vmrghw 12, 14, 12
+ xxlxor 48, 47, 48
+ xxsldwi 47, 47, 47, 1
+ vrlw 16, 16, 1
+ vperm 30, 31, 30, 10
+ vperm 12, 12, 31, 11
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 13
+ vperm 18, 18, 18, 3
+ vperm 31, 12, 30, 8
+ xxlandc 50, 50, 34
+ vadduwm 15, 18, 15
+ xxlxor 48, 47, 48
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ xxsldwi 49, 49, 49, 1
+ vperm 18, 18, 18, 0
+ vadduwm 17, 17, 30
+ xxlandc 50, 50, 37
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxlxor 48, 47, 48
+ xxsldwi 46, 47, 47, 3
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 12
+ vperm 18, 18, 18, 3
+ xxlandc 47, 50, 34
+ xxsldwi 50, 51, 51, 3
+ vadduwm 14, 15, 14
+ vperm 19, 13, 18, 6
+ xxlxor 48, 46, 48
+ vperm 18, 18, 18, 7
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vadduwm 17, 17, 19
+ vperm 15, 15, 15, 0
+ xxsldwi 49, 49, 49, 3
+ xxlandc 47, 47, 37
+ vadduwm 14, 15, 14
+ xxswapd 47, 47
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 29, 15, 14
+ vperm 14, 31, 18, 9
+ xxmrgld 50, 45, 44
+ xxlxor 48, 61, 48
+ vmrghw 12, 12, 13
+ vrlw 16, 16, 4
+ vperm 18, 30, 18, 10
+ vadduwm 17, 17, 14
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ xxsldwi 49, 49, 49, 1
+ vperm 15, 15, 15, 0
+ vadduwm 17, 17, 18
+ xxlandc 47, 47, 37
+ vadduwm 31, 15, 29
+ xxswapd 47, 47
+ xxlxor 48, 63, 48
+ xxsldwi 45, 63, 63, 3
+ vperm 31, 12, 30, 11
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 13, 15, 13
+ xxlxor 44, 45, 48
+ vadduwm 16, 17, 31
+ xxsldwi 49, 51, 51, 3
+ vrlw 12, 12, 4
+ vperm 19, 14, 17, 6
+ vadduwm 16, 12, 16
+ xxlxor 47, 48, 47
+ vperm 15, 15, 15, 0
+ xxlandc 47, 47, 37
+ vadduwm 13, 15, 13
+ xxswapd 47, 47
+ xxlxor 44, 45, 44
+ xxsldwi 45, 45, 45, 1
+ vrlw 30, 12, 1
+ vadduwm 12, 16, 19
+ xxsldwi 44, 44, 44, 3
+ vadduwm 16, 30, 12
+ xxlxor 44, 48, 47
+ vperm 15, 17, 17, 7
+ vperm 12, 12, 12, 3
+ vperm 17, 31, 18, 8
+ xxlandc 61, 44, 34
+ vperm 12, 17, 15, 9
+ vadduwm 13, 29, 13
+ xxlxor 47, 45, 62
+ xxmrgld 62, 46, 63
+ vmrghw 14, 31, 14
+ vrlw 15, 15, 4
+ vadduwm 16, 16, 12
+ vperm 30, 18, 30, 10
+ vperm 14, 14, 18, 11
+ xxsldwi 50, 51, 51, 3
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 61
+ xxsldwi 48, 48, 48, 1
+ vperm 19, 12, 18, 6
+ vperm 17, 17, 17, 0
+ vadduwm 16, 16, 30
+ xxmrgld 60, 44, 46
+ vmrghw 12, 14, 12
+ vperm 28, 30, 28, 10
+ xxlandc 49, 49, 37
+ vadduwm 13, 17, 13
+ xxswapd 49, 49
+ vperm 12, 12, 30, 11
+ xxlxor 47, 45, 47
+ xxsldwi 45, 45, 45, 3
+ vrlw 15, 15, 1
+ vperm 8, 12, 28, 8
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vadduwm 16, 16, 14
+ vperm 17, 17, 17, 3
+ xxlandc 49, 49, 34
+ vadduwm 13, 17, 13
+ xxlxor 47, 45, 47
+ vrlw 15, 15, 4
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vperm 17, 17, 17, 0
+ xxlandc 49, 49, 37
+ vadduwm 31, 17, 13
+ xxlxor 45, 63, 47
+ vrlw 15, 13, 1
+ vadduwm 13, 16, 19
+ xxswapd 48, 49
+ xxsldwi 51, 51, 51, 3
+ xxsldwi 45, 45, 45, 3
+ vadduwm 17, 15, 13
+ xxlxor 45, 49, 48
+ lvx 16, 0, 4
+ vperm 29, 13, 13, 3
+ vperm 13, 18, 18, 7
+ xxsldwi 50, 63, 63, 1
+ vperm 16, 14, 30, 16
+ vperm 7, 19, 19, 7
+ xxlandc 63, 61, 34
+ vadduwm 18, 31, 18
+ vperm 29, 16, 13, 9
+ xxlxor 47, 50, 47
+ vperm 6, 16, 19, 6
+ vrlw 15, 15, 4
+ vperm 7, 8, 7, 9
+ vadduwm 17, 17, 29
+ xxmrgld 41, 61, 44
+ vadduwm 17, 15, 17
+ vperm 9, 28, 9, 10
+ xxlxor 63, 49, 63
+ xxsldwi 49, 49, 49, 1
+ vperm 31, 31, 31, 0
+ vadduwm 17, 17, 28
+ xxlandc 63, 63, 37
+ vadduwm 18, 31, 18
+ xxswapd 63, 63
+ xxlxor 47, 50, 47
+ xxsldwi 46, 50, 50, 3
+ vrlw 15, 15, 1
+ vadduwm 17, 15, 17
+ xxlxor 63, 49, 63
+ vadduwm 17, 17, 12
+ vperm 31, 31, 31, 3
+ xxlandc 50, 63, 34
+ vadduwm 14, 18, 14
+ xxlxor 47, 46, 47
+ vrlw 15, 15, 4
+ vadduwm 17, 15, 17
+ xxlxor 50, 49, 50
+ vadduwm 6, 17, 6
+ vperm 18, 18, 18, 0
+ xxsldwi 38, 38, 38, 3
+ xxlandc 50, 50, 37
+ vadduwm 14, 18, 14
+ xxswapd 48, 50
+ xxlxor 47, 46, 47
+ xxsldwi 46, 46, 46, 1
+ vrlw 15, 15, 1
+ vadduwm 6, 15, 6
+ xxlxor 48, 38, 48
+ vadduwm 6, 6, 7
+ vperm 16, 16, 16, 3
+ xxlandc 48, 48, 34
+ vadduwm 14, 16, 14
+ xxlxor 40, 46, 47
+ vrlw 8, 8, 4
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 48
+ xxsldwi 38, 38, 38, 1
+ vperm 7, 7, 7, 0
+ vadduwm 6, 6, 9
+ xxlandc 39, 39, 37
+ vadduwm 14, 7, 14
+ xxswapd 39, 39
+ xxlxor 40, 46, 40
+ xxsldwi 41, 46, 46, 3
+ vrlw 8, 8, 1
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 39
+ vperm 3, 7, 7, 3
+ vmrghw 7, 12, 13
+ xxlandc 34, 35, 34
+ vperm 7, 7, 28, 11
+ vadduwm 3, 2, 9
+ xxlxor 40, 35, 40
+ vrlw 4, 8, 4
+ vadduwm 6, 6, 7
+ vadduwm 6, 4, 6
+ xxlxor 34, 38, 34
+ xxsldwi 0, 38, 38, 3
+ vperm 2, 2, 2, 0
+ xxlandc 34, 34, 37
+ vadduwm 3, 2, 3
+ xxswapd 34, 34
+ xxlxor 36, 35, 36
+ xxsldwi 1, 35, 35, 1
+ vrlw 4, 4, 1
+ xxlxor 0, 1, 0
+ xxswapd 0, 0
+ xxlxor 2, 36, 34
+ stxvd2x 0, 0, 8
+ xxswapd 2, 2
+ stxvd2x 2, 8, 5
+ lfdx 0, 0, 3
+ lfd 2, 8(3)
+ xxmrghd 35, 2, 0
+ xxlxor 0, 1, 35
+ xxswapd 0, 0
+ stxvd2x 0, 8, 7
+ lfd 0, 16(3)
+ lfd 1, 24(3)
+ li 3, -16
+ xxmrghd 35, 1, 0
+ xxlxor 0, 34, 35
+ xxswapd 0, 0
+ stxvd2x 0, 8, 6
+ lxvd2x 63, 1, 3
+ li 3, -32
+ lxvd2x 62, 1, 3
+ li 3, -48
+ lxvd2x 61, 1, 3
+ li 3, -64
+ lxvd2x 60, 1, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-.Lfunc_begin1
+ .cfi_endproc
+
+ .globl zfs_blake3_hash_many_sse41
+ .p2align 2
+ .type zfs_blake3_hash_many_sse41,@function
+zfs_blake3_hash_many_sse41:
+.Lfunc_begin2:
+ .cfi_startproc
+.Lfunc_gep2:
+ addis 2, 12, .TOC.-.Lfunc_gep2@ha
+ addi 2, 2, .TOC.-.Lfunc_gep2@l
+.Lfunc_lep2:
+ .localentry zfs_blake3_hash_many_sse41, .Lfunc_lep2-.Lfunc_gep2
+ mfocrf 12, 32
+ mflr 0
+ std 0, 16(1)
+ stw 12, 8(1)
+ stdu 1, -256(1)
+ .cfi_def_cfa_offset 256
+ .cfi_offset lr, 16
+ .cfi_offset r17, -120
+ .cfi_offset r18, -112
+ .cfi_offset r19, -104
+ .cfi_offset r20, -96
+ .cfi_offset r21, -88
+ .cfi_offset r22, -80
+ .cfi_offset r23, -72
+ .cfi_offset r24, -64
+ .cfi_offset r25, -56
+ .cfi_offset r26, -48
+ .cfi_offset r27, -40
+ .cfi_offset r28, -32
+ .cfi_offset r29, -24
+ .cfi_offset r30, -16
+ .cfi_offset cr2, 8
+ std 26, 208(1)
+ mr 26, 4
+ cmpldi 1, 4, 4
+ andi. 4, 8, 1
+ std 18, 144(1)
+ std 19, 152(1)
+ crmove 8, 1
+ ld 19, 360(1)
+ lwz 18, 352(1)
+ std 24, 192(1)
+ std 25, 200(1)
+ std 27, 216(1)
+ std 28, 224(1)
+ mr 24, 10
+ mr 28, 6
+ mr 27, 5
+ mr 25, 3
+ std 29, 232(1)
+ std 30, 240(1)
+ mr 30, 9
+ mr 29, 7
+ std 17, 136(1)
+ std 20, 160(1)
+ std 21, 168(1)
+ std 22, 176(1)
+ std 23, 184(1)
+ blt 1, .LBB2_3
+ li 3, 0
+ li 4, 1
+ clrldi 23, 30, 32
+ isel 22, 4, 3, 8
+ clrldi 21, 24, 32
+ clrldi 20, 18, 32
+.LBB2_2:
+ mr 3, 25
+ mr 4, 27
+ mr 5, 28
+ mr 6, 29
+ mr 7, 22
+ mr 8, 23
+ mr 9, 21
+ mr 10, 20
+ std 19, 32(1)
+ bl blake3_hash4_sse41
+ addi 26, 26, -4
+ addi 3, 29, 4
+ addi 25, 25, 32
+ addi 19, 19, 128
+ cmpldi 26, 3
+ isel 29, 3, 29, 8
+ bgt 0, .LBB2_2
+.LBB2_3:
+ cmpldi 26, 0
+ beq 0, .LBB2_11
+ li 3, 0
+ li 4, 1
+ or 21, 24, 30
+ li 20, 16
+ addi 24, 1, 96
+ isel 22, 4, 3, 8
+.LBB2_5:
+ lxvd2x 0, 28, 20
+ ld 23, 0(25)
+ mr 17, 27
+ mr 3, 21
+ stxvd2x 0, 24, 20
+ lxvd2x 0, 0, 28
+ stxvd2x 0, 0, 24
+.LBB2_6:
+ cmpldi 17, 1
+ beq 0, .LBB2_8
+ cmpldi 17, 0
+ bne 0, .LBB2_9
+ b .LBB2_10
+.LBB2_8:
+ or 3, 3, 18
+.LBB2_9:
+ clrldi 7, 3, 56
+ mr 3, 24
+ mr 4, 23
+ li 5, 64
+ mr 6, 29
+ bl zfs_blake3_compress_in_place_sse41
+ addi 23, 23, 64
+ addi 17, 17, -1
+ mr 3, 30
+ b .LBB2_6
+.LBB2_10:
+ lxvd2x 0, 24, 20
+ addi 26, 26, -1
+ add 29, 29, 22
+ addi 25, 25, 8
+ cmpldi 26, 0
+ stxvd2x 0, 19, 20
+ lxvd2x 0, 0, 24
+ stxvd2x 0, 0, 19
+ addi 19, 19, 32
+ bne 0, .LBB2_5
+.LBB2_11:
+ ld 30, 240(1)
+ ld 29, 232(1)
+ ld 28, 224(1)
+ ld 27, 216(1)
+ ld 26, 208(1)
+ ld 25, 200(1)
+ ld 24, 192(1)
+ ld 23, 184(1)
+ ld 22, 176(1)
+ ld 21, 168(1)
+ ld 20, 160(1)
+ ld 19, 152(1)
+ ld 18, 144(1)
+ ld 17, 136(1)
+ addi 1, 1, 256
+ ld 0, 16(1)
+ lwz 12, 8(1)
+ mtocrf 32, 12
+ mtlr 0
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse41, .Lfunc_end2-.Lfunc_begin2
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI3_0:
+ .quad 4294967296
+ .quad 12884901890
+.LCPI3_1:
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI3_2:
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI3_3:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI3_4:
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+.LCPI3_5:
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+.LCPI3_6:
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+.LCPI3_7:
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+.LCPI3_8:
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+ .text
+ .p2align 2
+ .type blake3_hash4_sse41,@function
+blake3_hash4_sse41:
+.Lfunc_begin3:
+ .cfi_startproc
+.Lfunc_gep3:
+ addis 2, 12, .TOC.-.Lfunc_gep3@ha
+ addi 2, 2, .TOC.-.Lfunc_gep3@l
+.Lfunc_lep3:
+ .localentry blake3_hash4_sse41, .Lfunc_lep3-.Lfunc_gep3
+ stdu 1, -416(1)
+ .cfi_def_cfa_offset 416
+ .cfi_offset r22, -176
+ .cfi_offset r23, -168
+ .cfi_offset r24, -160
+ .cfi_offset r25, -152
+ .cfi_offset r26, -144
+ .cfi_offset r27, -136
+ .cfi_offset r28, -128
+ .cfi_offset r29, -120
+ .cfi_offset r30, -112
+ .cfi_offset f20, -96
+ .cfi_offset f21, -88
+ .cfi_offset f22, -80
+ .cfi_offset f23, -72
+ .cfi_offset f24, -64
+ .cfi_offset f25, -56
+ .cfi_offset f26, -48
+ .cfi_offset f27, -40
+ .cfi_offset f28, -32
+ .cfi_offset f29, -24
+ .cfi_offset f30, -16
+ .cfi_offset f31, -8
+ .cfi_offset v20, -368
+ .cfi_offset v21, -352
+ .cfi_offset v22, -336
+ .cfi_offset v23, -320
+ .cfi_offset v24, -304
+ .cfi_offset v25, -288
+ .cfi_offset v26, -272
+ .cfi_offset v27, -256
+ .cfi_offset v28, -240
+ .cfi_offset v29, -224
+ .cfi_offset v30, -208
+ .cfi_offset v31, -192
+ li 11, 48
+ li 0, 8
+ std 30, 304(1)
+ li 30, 12
+ li 12, 4
+ lfiwzx 0, 0, 5
+ stxvd2x 52, 1, 11
+ li 11, 64
+ lfiwzx 2, 5, 0
+ li 0, 20
+ lfiwzx 3, 5, 30
+ stxvd2x 53, 1, 11
+ li 11, 80
+ li 30, 24
+ lfiwzx 4, 5, 0
+ li 0, 28
+ stxvd2x 54, 1, 11
+ li 11, 96
+ lfiwzx 1, 5, 12
+ lfiwzx 6, 5, 30
+ xxspltw 47, 0, 1
+ cmpldi 4, 0
+ std 22, 240(1)
+ stxvd2x 55, 1, 11
+ li 11, 112
+ lfiwzx 7, 5, 0
+ xxspltw 40, 2, 1
+ std 23, 248(1)
+ xxspltw 39, 3, 1
+ std 24, 256(1)
+ std 25, 264(1)
+ xxspltw 51, 1, 1
+ xxspltw 43, 6, 1
+ std 26, 272(1)
+ xxspltw 41, 7, 1
+ std 27, 280(1)
+ std 28, 288(1)
+ std 29, 296(1)
+ stxvd2x 56, 1, 11
+ li 11, 128
+ stfd 20, 320(1)
+ stxvd2x 57, 1, 11
+ li 11, 144
+ stfd 21, 328(1)
+ stxvd2x 58, 1, 11
+ li 11, 160
+ stfd 22, 336(1)
+ stxvd2x 59, 1, 11
+ li 11, 176
+ stfd 23, 344(1)
+ stxvd2x 60, 1, 11
+ li 11, 192
+ stfd 24, 352(1)
+ stxvd2x 61, 1, 11
+ li 11, 208
+ stfd 25, 360(1)
+ stxvd2x 62, 1, 11
+ li 11, 224
+ stfd 26, 368(1)
+ stxvd2x 63, 1, 11
+ li 11, 16
+ xxspltw 63, 4, 1
+ lfiwzx 5, 5, 11
+ ld 5, 448(1)
+ stfd 27, 376(1)
+ stfd 28, 384(1)
+ stfd 29, 392(1)
+ stfd 30, 400(1)
+ stfd 31, 408(1)
+ xxspltw 50, 5, 1
+ beq 0, .LBB3_5
+ addis 30, 2, .LCPI3_0@toc@ha
+ neg 7, 7
+ xxleqv 34, 34, 34
+ addis 28, 2, .LCPI3_5@toc@ha
+ addis 27, 2, .LCPI3_6@toc@ha
+ addis 26, 2, .LCPI3_7@toc@ha
+ addis 29, 2, .LCPI3_4@toc@ha
+ addis 25, 2, .LCPI3_8@toc@ha
+ addi 0, 30, .LCPI3_0@toc@l
+ mtfprwz 2, 7
+ addis 7, 2, .LCPI3_1@toc@ha
+ addis 30, 2, .LCPI3_3@toc@ha
+ addi 24, 29, .LCPI3_4@toc@l
+ ld 29, 24(3)
+ lxvd2x 1, 0, 0
+ mtfprwz 0, 6
+ rldicl 6, 6, 32, 32
+ addi 0, 30, .LCPI3_3@toc@l
+ ld 30, 16(3)
+ xxspltw 2, 2, 1
+ vslw 2, 2, 2
+ xxspltw 37, 0, 1
+ mtfprwz 0, 6
+ addi 6, 7, .LCPI3_1@toc@l
+ addis 7, 2, .LCPI3_2@toc@ha
+ xxswapd 35, 1
+ xxlxor 36, 36, 36
+ xxspltw 33, 0, 1
+ xxland 35, 2, 35
+ vadduwm 0, 3, 5
+ lvx 5, 0, 6
+ addi 6, 7, .LCPI3_2@toc@l
+ ld 7, 8(3)
+ xxlor 35, 35, 34
+ xxlxor 34, 32, 34
+ xxlor 9, 32, 32
+ lvx 0, 0, 6
+ ld 6, 0(3)
+ addi 3, 3, -8
+ vcmpgtsw 2, 3, 2
+ lvx 3, 0, 0
+ addi 0, 28, .LCPI3_5@toc@l
+ addi 28, 27, .LCPI3_6@toc@l
+ addi 27, 26, .LCPI3_7@toc@l
+ addi 26, 25, .LCPI3_8@toc@l
+ or 25, 9, 8
+ li 9, 0
+ vcmpgtsb 5, 4, 5
+ vcmpgtsb 0, 4, 0
+ xxlor 11, 35, 35
+ lvx 3, 0, 24
+ xxlor 12, 35, 35
+ vsubuwm 2, 1, 2
+ xxlnor 10, 37, 37
+ xxlor 13, 34, 34
+ lvx 2, 0, 0
+ li 0, 32
+ xxlnor 31, 32, 32
+ xxlor 30, 34, 34
+ lvx 2, 0, 28
+ li 28, 48
+ xxlor 29, 34, 34
+ lvx 2, 0, 27
+ li 27, 0
+ xxlor 28, 34, 34
+ lvx 2, 0, 26
+ xxlor 27, 34, 34
+.LBB3_2:
+ mr 26, 27
+ addi 27, 27, 1
+ xxlor 23, 39, 39
+ cmpld 27, 4
+ sldi 26, 26, 6
+ xxlor 24, 40, 40
+ iseleq 24, 10, 9
+ add 23, 6, 26
+ add 22, 30, 26
+ lxvd2x 0, 6, 26
+ lxvd2x 1, 7, 26
+ or 25, 24, 25
+ add 24, 7, 26
+ lxvd2x 2, 30, 26
+ lxvd2x 3, 29, 26
+ xxlor 26, 47, 47
+ lxvd2x 4, 23, 11
+ lxvd2x 6, 24, 11
+ clrlwi 25, 25, 24
+ xxlor 25, 51, 51
+ lxvd2x 7, 22, 11
+ lxvd2x 8, 23, 0
+ mtfprd 5, 25
+ add 25, 29, 26
+ xxswapd 34, 0
+ lxvd2x 0, 25, 11
+ xxswapd 38, 1
+ xxswapd 32, 2
+ lxvd2x 1, 24, 0
+ lxvd2x 2, 22, 0
+ xxswapd 40, 3
+ xxswapd 39, 4
+ lxvd2x 3, 25, 0
+ lxvd2x 4, 23, 28
+ xxswapd 60, 6
+ xxswapd 47, 7
+ lxvd2x 6, 24, 28
+ xxswapd 57, 8
+ lxvd2x 7, 22, 28
+ lxvd2x 8, 25, 28
+ xxswapd 58, 0
+ mr 25, 3
+ xxswapd 53, 1
+ xxswapd 56, 2
+ xxswapd 52, 3
+ xxswapd 55, 4
+ xxswapd 54, 6
+ xxswapd 0, 5
+ xxswapd 42, 7
+ xxswapd 48, 8
+ mtctr 12
+.LBB3_3:
+ ldu 24, 8(25)
+ add 24, 24, 26
+ addi 24, 24, 256
+ dcbt 0, 24
+ bdnz .LBB3_3
+ vmrgew 4, 28, 7
+ vspltisw 14, 9
+ mr 25, 8
+ vmrgew 27, 6, 2
+ vspltisw 17, 4
+ vmrglw 12, 6, 2
+ vspltisw 19, 10
+ vmrghw 30, 6, 2
+ xxspltw 0, 0, 3
+ vmrglw 2, 8, 0
+ vmrghw 13, 8, 0
+ xxlor 7, 36, 36
+ vmrgew 4, 21, 25
+ vmrglw 29, 28, 7
+ vmrghw 1, 28, 7
+ vmrglw 28, 26, 15
+ xxmrgld 37, 34, 44
+ vmrgew 7, 26, 15
+ vmrghw 15, 26, 15
+ xxlor 21, 36, 36
+ vmrglw 4, 21, 25
+ vmrghw 21, 21, 25
+ vmrglw 25, 20, 24
+ xxmrgld 34, 60, 61
+ vmrghw 26, 20, 24
+ xxlor 38, 26, 26
+ vmrgew 3, 8, 0
+ xxlor 5, 36, 36
+ vmrgew 4, 20, 24
+ vspltisw 24, -16
+ vmrglw 20, 22, 23
+ xxmrgld 57, 57, 5
+ vmrglw 8, 16, 10
+ vmrghw 0, 16, 10
+ vadduwm 12, 19, 19
+ xxlor 8, 37, 37
+ xxlor 20, 36, 36
+ vmrgew 4, 22, 23
+ vmrghw 23, 22, 23
+ xxmrgld 40, 40, 52
+ vmrgew 22, 16, 10
+ vsubuwm 10, 14, 24
+ vslw 14, 17, 17
+ vadduwm 17, 5, 6
+ xxmrgld 37, 47, 33
+ xxlor 22, 36, 36
+ xxmrgld 36, 45, 62
+ xxlor 38, 25, 25
+ xxlor 2, 34, 34
+ vadduwm 19, 4, 6
+ xxmrgld 38, 39, 7
+ xxlor 3, 36, 36
+ xxmrghd 39, 47, 33
+ xxlor 36, 24, 24
+ xxmrgld 33, 58, 53
+ vadduwm 17, 17, 18
+ vadduwm 29, 2, 4
+ xxmrgld 36, 35, 59
+ xxlor 34, 23, 23
+ xxmrghd 35, 45, 62
+ xxlor 1, 9, 9
+ vadduwm 28, 5, 2
+ xxlor 1, 13, 13
+ vadduwm 19, 19, 31
+ vadduwm 24, 29, 11
+ vadduwm 28, 28, 9
+ xxlxor 61, 49, 9
+ xxlor 1, 41, 41
+ xxlor 41, 11, 11
+ xxlxor 34, 51, 13
+ vperm 29, 29, 29, 9
+ xxlxor 46, 56, 46
+ vperm 2, 2, 2, 9
+ xxlxor 59, 60, 0
+ vperm 14, 14, 14, 9
+ vperm 30, 27, 27, 9
+ vadduwm 19, 19, 3
+ xxlor 4, 35, 35
+ xxland 61, 61, 10
+ xxlor 35, 12, 12
+ xxland 34, 34, 10
+ vadduwm 27, 29, 3
+ xxlor 35, 30, 30
+ vadduwm 17, 17, 4
+ xxlor 26, 36, 36
+ xxland 46, 46, 10
+ vadduwm 3, 2, 3
+ xxlor 36, 29, 29
+ xxland 62, 62, 10
+ xxlxor 45, 59, 50
+ xxlxor 50, 35, 63
+ vadduwm 31, 14, 4
+ xxlor 36, 28, 28
+ xxlor 6, 37, 37
+ vadduwm 16, 30, 4
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 1
+ vrlw 4, 13, 12
+ vrlw 18, 18, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 15, 24, 6
+ vadduwm 28, 28, 7
+ vadduwm 17, 4, 17
+ vadduwm 19, 18, 19
+ vadduwm 15, 11, 15
+ vadduwm 28, 5, 28
+ xxlor 25, 38, 38
+ xxlxor 61, 49, 61
+ xxlxor 34, 51, 34
+ xxlxor 46, 47, 46
+ xxlxor 62, 60, 62
+ xxlor 38, 27, 27
+ vadduwm 19, 19, 1
+ vperm 29, 29, 29, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 14, 14, 6
+ vperm 30, 30, 30, 6
+ xxlor 5, 33, 33
+ vadduwm 17, 17, 25
+ xxland 61, 61, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ xxland 62, 62, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 2, 3
+ vadduwm 31, 24, 31
+ vadduwm 16, 30, 16
+ xxlxor 36, 59, 36
+ xxlxor 50, 35, 50
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 18, 10
+ xxmrgld 50, 32, 55
+ vrlw 11, 11, 10
+ xxmrghd 55, 32, 55
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 15, 15, 8
+ vadduwm 28, 28, 18
+ vadduwm 17, 1, 17
+ vadduwm 19, 11, 19
+ vadduwm 15, 5, 15
+ vadduwm 28, 4, 28
+ xxlor 7, 57, 57
+ xxlxor 62, 49, 62
+ xxlxor 61, 51, 61
+ xxlxor 57, 47, 34
+ xxlxor 34, 60, 56
+ vperm 24, 30, 30, 9
+ xxmrgld 62, 20, 21
+ vperm 29, 29, 29, 9
+ vperm 25, 25, 25, 9
+ vperm 2, 2, 2, 9
+ vmr 14, 8
+ xxmrghd 40, 58, 53
+ xxmrgld 58, 54, 22
+ vadduwm 17, 17, 30
+ xxland 56, 56, 10
+ vadduwm 21, 19, 8
+ xxland 61, 61, 10
+ xxland 51, 57, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vadduwm 0, 15, 26
+ vadduwm 15, 28, 23
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 21
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vmr 13, 8
+ xxlor 53, 3, 3
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 52, 4, 4
+ xxlor 40, 2, 2
+ vadduwm 17, 17, 21
+ vadduwm 28, 28, 20
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 8
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ vmr 25, 26
+ xxlor 3, 39, 39
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ xxlor 54, 6, 6
+ xxlor 58, 5, 5
+ xxlor 39, 8, 8
+ vadduwm 17, 17, 22
+ vadduwm 28, 28, 26
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 25
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 39, 26, 26
+ vadduwm 28, 28, 14
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 17, 17, 7
+ vadduwm 0, 0, 30
+ vadduwm 15, 15, 23
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 24, 55, 55
+ vadduwm 17, 17, 13
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vmr 23, 13
+ xxlor 45, 25, 25
+ xxlor 39, 7, 7
+ vadduwm 28, 28, 13
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 7
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 2, 46, 46
+ xxlor 46, 3, 3
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vadduwm 17, 17, 20
+ vadduwm 28, 28, 26
+ vadduwm 0, 0, 25
+ vadduwm 15, 15, 14
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 52, 2, 2
+ vadduwm 17, 17, 8
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 28, 28, 20
+ vadduwm 0, 0, 21
+ vadduwm 15, 15, 18
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ vadduwm 17, 17, 22
+ vadduwm 28, 28, 30
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 0, 0, 23
+ vadduwm 15, 15, 7
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 5, 4, 4
+ xxlor 4, 58, 58
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ xxlor 39, 8, 8
+ xxlor 54, 24, 24
+ xxlor 58, 26, 26
+ vadduwm 17, 17, 13
+ vadduwm 28, 28, 7
+ vadduwm 0, 0, 22
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 3, 53, 53
+ xxlor 53, 4, 4
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vadduwm 17, 17, 21
+ vadduwm 28, 28, 20
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 25
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 2, 55, 55
+ vmr 23, 18
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ xxlor 50, 5, 5
+ vadduwm 17, 17, 14
+ vadduwm 28, 28, 30
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 22
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 25, 40, 40
+ vmr 8, 13
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ xxlor 45, 25, 25
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 17, 17, 13
+ xxlor 45, 2, 2
+ vadduwm 0, 0, 8
+ vadduwm 28, 28, 13
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 4, 57, 57
+ xxlor 26, 46, 46
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ xxlor 8, 62, 62
+ xxlor 57, 3, 3
+ xxlor 46, 7, 7
+ xxlor 62, 6, 6
+ vadduwm 17, 17, 7
+ vadduwm 28, 28, 25
+ vadduwm 0, 0, 14
+ vadduwm 15, 15, 30
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vadduwm 17, 17, 20
+ xxlor 3, 52, 52
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 52, 8, 8
+ vadduwm 0, 0, 22
+ vadduwm 28, 28, 20
+ vadduwm 15, 15, 23
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 6, 55, 55
+ xxlor 55, 4, 4
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 17, 17, 23
+ vadduwm 28, 28, 13
+ vadduwm 0, 0, 21
+ vadduwm 15, 15, 14
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 4, 53, 53
+ xxlor 53, 26, 26
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 17, 17, 21
+ vadduwm 28, 28, 8
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 30
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 5, 25, 25
+ xxlor 2, 58, 58
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vmr 22, 26
+ vadduwm 0, 0, 26
+ xxlor 58, 5, 5
+ vadduwm 17, 17, 25
+ vadduwm 28, 28, 18
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 7, 24, 24
+ xxlor 8, 57, 57
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 57, 7, 7
+ vadduwm 17, 17, 20
+ vadduwm 28, 28, 13
+ vadduwm 0, 0, 14
+ vadduwm 15, 15, 25
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 5, 52, 52
+ xxlor 23, 45, 45
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ xxlor 52, 6, 6
+ vadduwm 28, 28, 8
+ vmr 13, 8
+ xxlor 40, 3, 3
+ vadduwm 17, 17, 20
+ vadduwm 0, 0, 8
+ vadduwm 15, 15, 22
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 25, 39, 39
+ vmr 7, 30
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vmr 30, 18
+ xxlor 24, 46, 46
+ xxlor 46, 25, 25
+ xxlor 50, 8, 8
+ vadduwm 17, 17, 23
+ vadduwm 28, 28, 14
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 6, 58, 58
+ xxlor 58, 4, 4
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vadduwm 17, 17, 30
+ vadduwm 28, 28, 26
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 21
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 40, 23, 23
+ vadduwm 13, 28, 13
+ vadduwm 8, 17, 8
+ xxland 49, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 17, 31
+ vadduwm 16, 29, 16
+ vadduwm 28, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 60, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 2, 55, 55
+ vmr 23, 30
+ xxlor 62, 24, 24
+ vadduwm 0, 0, 22
+ vadduwm 15, 15, 30
+ vadduwm 8, 4, 8
+ vadduwm 13, 1, 13
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 40, 61
+ xxlxor 51, 45, 51
+ xxlxor 34, 32, 34
+ xxlxor 49, 47, 49
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 17, 17, 17, 9
+ vadduwm 13, 13, 14
+ xxlor 46, 5, 5
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 49, 49, 10
+ vadduwm 28, 29, 28
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 17, 16
+ xxlxor 36, 60, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 8, 8, 25
+ vadduwm 0, 0, 14
+ vadduwm 15, 15, 7
+ vadduwm 8, 4, 8
+ vadduwm 13, 1, 13
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 62, 40, 61
+ xxlxor 51, 45, 51
+ xxlxor 34, 32, 34
+ xxlxor 49, 47, 49
+ vperm 30, 30, 30, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 17, 17, 17, 6
+ vadduwm 29, 8, 20
+ vadduwm 8, 13, 18
+ xxland 45, 62, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 49, 49, 31
+ vadduwm 30, 13, 28
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 17, 16
+ xxlxor 36, 62, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 0, 0, 23
+ vadduwm 7, 15, 21
+ vadduwm 29, 1, 29
+ vadduwm 8, 11, 8
+ vadduwm 0, 5, 0
+ vadduwm 7, 4, 7
+ xxlxor 47, 61, 49
+ xxlxor 45, 40, 45
+ xxlxor 49, 32, 51
+ xxlxor 34, 39, 34
+ vperm 15, 15, 15, 9
+ vperm 13, 13, 13, 9
+ vperm 17, 17, 17, 9
+ vperm 2, 2, 2, 9
+ xxlor 46, 3, 3
+ vadduwm 9, 29, 26
+ vadduwm 8, 8, 14
+ xxland 46, 47, 10
+ xxland 45, 45, 10
+ xxland 47, 49, 10
+ xxland 34, 34, 10
+ vadduwm 17, 14, 31
+ vadduwm 16, 13, 16
+ vadduwm 18, 15, 30
+ vadduwm 3, 2, 3
+ xxlxor 33, 49, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 50, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ xxlor 44, 6, 6
+ xxlor 0, 10, 10
+ vadduwm 0, 0, 12
+ xxlor 44, 2, 2
+ vadduwm 9, 1, 9
+ vadduwm 7, 7, 12
+ vadduwm 8, 11, 8
+ vadduwm 7, 4, 7
+ vadduwm 0, 5, 0
+ xxlxor 34, 39, 34
+ xxlxor 44, 32, 47
+ vperm 2, 2, 2, 6
+ xxlxor 46, 41, 46
+ xxlxor 45, 40, 45
+ vperm 12, 12, 12, 6
+ vperm 14, 14, 14, 6
+ vperm 13, 13, 13, 6
+ xxland 34, 34, 31
+ xxlor 1, 31, 31
+ vadduwm 3, 2, 3
+ xxland 44, 44, 31
+ xxlxor 36, 35, 36
+ xxlxor 51, 35, 40
+ xxland 35, 46, 31
+ xxland 38, 45, 31
+ vadduwm 15, 12, 18
+ vadduwm 8, 3, 17
+ vadduwm 13, 6, 16
+ xxlxor 37, 47, 37
+ xxlxor 33, 40, 33
+ xxlxor 43, 45, 43
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlxor 47, 47, 41
+ xxlxor 40, 40, 32
+ xxlxor 39, 45, 39
+ xxlxor 50, 36, 38
+ xxlxor 63, 33, 44
+ xxlxor 43, 43, 34
+ xxlxor 41, 37, 35
+ bne 0, .LBB3_2
+.LBB3_5:
+ vmrglw 2, 19, 15
+ li 3, 32
+ li 4, 48
+ vmrglw 4, 7, 8
+ vmrglw 0, 31, 18
+ vmrglw 1, 9, 11
+ vmrghw 3, 19, 15
+ vmrghw 5, 7, 8
+ vmrghw 6, 31, 18
+ vmrghw 7, 9, 11
+ xxmrgld 40, 36, 34
+ xxmrghd 34, 36, 34
+ xxmrgld 41, 33, 32
+ xxswapd 0, 40
+ xxmrgld 36, 37, 35
+ xxmrghd 35, 37, 35
+ xxmrghd 37, 33, 32
+ xxswapd 1, 41
+ xxmrgld 32, 39, 38
+ xxmrghd 33, 39, 38
+ xxswapd 2, 34
+ xxswapd 4, 36
+ xxswapd 3, 37
+ stxvd2x 0, 0, 5
+ xxswapd 5, 32
+ stxvd2x 1, 5, 11
+ xxswapd 0, 35
+ xxswapd 1, 33
+ stxvd2x 2, 5, 3
+ li 3, 64
+ stxvd2x 3, 5, 4
+ li 4, 80
+ stxvd2x 4, 5, 3
+ li 3, 96
+ stxvd2x 5, 5, 4
+ li 4, 112
+ stxvd2x 0, 5, 3
+ stxvd2x 1, 5, 4
+ li 3, 224
+ lxvd2x 63, 1, 3
+ li 3, 208
+ lfd 31, 408(1)
+ ld 30, 304(1)
+ ld 29, 296(1)
+ lxvd2x 62, 1, 3
+ li 3, 192
+ lfd 30, 400(1)
+ ld 28, 288(1)
+ ld 27, 280(1)
+ lxvd2x 61, 1, 3
+ li 3, 176
+ lfd 29, 392(1)
+ ld 26, 272(1)
+ ld 25, 264(1)
+ lxvd2x 60, 1, 3
+ li 3, 160
+ lfd 28, 384(1)
+ ld 24, 256(1)
+ ld 23, 248(1)
+ lxvd2x 59, 1, 3
+ li 3, 144
+ lfd 27, 376(1)
+ ld 22, 240(1)
+ lxvd2x 58, 1, 3
+ li 3, 128
+ lfd 26, 368(1)
+ lxvd2x 57, 1, 3
+ li 3, 112
+ lfd 25, 360(1)
+ lxvd2x 56, 1, 3
+ li 3, 96
+ lfd 24, 352(1)
+ lxvd2x 55, 1, 3
+ li 3, 80
+ lfd 23, 344(1)
+ lxvd2x 54, 1, 3
+ li 3, 64
+ lfd 22, 336(1)
+ lxvd2x 53, 1, 3
+ li 3, 48
+ lfd 21, 328(1)
+ lxvd2x 52, 1, 3
+ lfd 20, 320(1)
+ addi 1, 1, 416
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end3:
+ .size blake3_hash4_sse41, .Lfunc_end3-.Lfunc_begin3
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx2.S b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx2.S
new file mode 100644
index 000000000000..b15d8fc7744e
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx2.S
@@ -0,0 +1,1845 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#if defined(HAVE_AVX2)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global zfs_blake3_hash_many_avx2
+.text
+
+.type zfs_blake3_hash_many_avx2,@function
+.p2align 6
+zfs_blake3_hash_many_avx2:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 680
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ vmovd xmm0, r9d
+ vpbroadcastd ymm0, xmm0
+ vmovdqa ymmword ptr [rsp+0x280], ymm0
+ vpand ymm1, ymm0, ymmword ptr [ADD0+rip]
+ vpand ymm2, ymm0, ymmword ptr [ADD1+rip]
+ vmovdqa ymmword ptr [rsp+0x220], ymm2
+ vmovd xmm2, r8d
+ vpbroadcastd ymm2, xmm2
+ vpaddd ymm2, ymm2, ymm1
+ vmovdqa ymmword ptr [rsp+0x240], ymm2
+ vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+ vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip]
+ vpcmpgtd ymm2, ymm1, ymm2
+ shr r8, 32
+ vmovd xmm3, r8d
+ vpbroadcastd ymm3, xmm3
+ vpsubd ymm3, ymm3, ymm2
+ vmovdqa ymmword ptr [rsp+0x260], ymm3
+ shl rdx, 6
+ mov qword ptr [rsp+0x2A0], rdx
+ cmp rsi, 8
+ jc 3f
+2:
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+0x4]
+ vpbroadcastd ymm2, dword ptr [rcx+0x8]
+ vpbroadcastd ymm3, dword ptr [rcx+0xC]
+ vpbroadcastd ymm4, dword ptr [rcx+0x10]
+ vpbroadcastd ymm5, dword ptr [rcx+0x14]
+ vpbroadcastd ymm6, dword ptr [rcx+0x18]
+ vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x20]
+ mov r13, qword ptr [rdi+0x28]
+ mov r14, qword ptr [rdi+0x30]
+ mov r15, qword ptr [rdi+0x38]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+.p2align 5
+9:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x2A0]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x200], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x20], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x40], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x60], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x80], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0xA0], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0xC0], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0xE0], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x100], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x120], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x140], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x160], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x180], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x1A0], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x1C0], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x1E0], ymm11
+ vpbroadcastd ymm15, dword ptr [rsp+0x200]
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm0, ymmword ptr [rsp+0x240]
+ vpxor ymm13, ymm1, ymmword ptr [rsp+0x260]
+ vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpxor ymm15, ymm3, ymm15
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip]
+ vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip]
+ vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip]
+ vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip]
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 9b
+ mov rbx, qword ptr [rbp+0x50]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0xCC
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0xCC
+ vblendps ymm3, ymm12, ymm9, 0xCC
+ vperm2f128 ymm12, ymm1, ymm2, 0x20
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0xCC
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 0x20
+ vmovups ymmword ptr [rbx+0x20], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0xCC
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0xCC
+ vblendps ymm14, ymm14, ymm13, 0xCC
+ vperm2f128 ymm8, ymm10, ymm14, 0x20
+ vmovups ymmword ptr [rbx+0x40], ymm8
+ vblendps ymm15, ymm13, ymm15, 0xCC
+ vperm2f128 ymm13, ymm6, ymm15, 0x20
+ vmovups ymmword ptr [rbx+0x60], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 0x31
+ vperm2f128 ymm11, ymm3, ymm4, 0x31
+ vmovups ymmword ptr [rbx+0x80], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 0x31
+ vperm2f128 ymm15, ymm6, ymm15, 0x31
+ vmovups ymmword ptr [rbx+0xA0], ymm11
+ vmovups ymmword ptr [rbx+0xC0], ymm14
+ vmovups ymmword ptr [rbx+0xE0], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp+0x220]
+ vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240]
+ vmovdqa ymmword ptr [rsp+0x240], ymm1
+ vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip]
+ vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+ vpcmpgtd ymm2, ymm0, ymm2
+ vmovdqa ymm0, ymmword ptr [rsp+0x260]
+ vpsubd ymm2, ymm0, ymm2
+ vmovdqa ymmword ptr [rsp+0x260], ymm2
+ add rdi, 64
+ add rbx, 256
+ mov qword ptr [rbp+0x50], rbx
+ sub rsi, 8
+ cmp rsi, 8
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ vzeroupper
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, qword ptr [rsp+0x2A0]
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ test rsi, 0x4
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovdqa ymm8, ymm0
+ vmovdqa ymm9, ymm1
+ vbroadcasti128 ymm12, xmmword ptr [rsp+0x240]
+ vbroadcasti128 ymm13, xmmword ptr [rsp+0x260]
+ vpunpckldq ymm14, ymm12, ymm13
+ vpunpckhdq ymm15, ymm12, ymm13
+ vpermq ymm14, ymm14, 0x50
+ vpermq ymm15, ymm15, 0x50
+ vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpblendd ymm14, ymm14, ymm12, 0x44
+ vpblendd ymm15, ymm15, ymm12, 0x44
+ vmovdqa ymmword ptr [rsp], ymm14
+ vmovdqa ymmword ptr [rsp+0x20], ymm15
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x200], eax
+ vmovups ymm2, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm3, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm2, ymm3, 136
+ vshufps ymm5, ymm2, ymm3, 221
+ vmovups ymm2, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm3, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm2, ymm3, 136
+ vshufps ymm7, ymm2, ymm3, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ vmovups ymm10, ymmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01
+ vmovups ymm11, ymmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01
+ vshufps ymm12, ymm10, ymm11, 136
+ vshufps ymm13, ymm10, ymm11, 221
+ vmovups ymm10, ymmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01
+ vmovups ymm11, ymmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01
+ vshufps ymm14, ymm10, ymm11, 136
+ vshufps ymm15, ymm10, ymm11, 221
+ vpshufd ymm14, ymm14, 0x93
+ vpshufd ymm15, ymm15, 0x93
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ vpbroadcastd ymm2, dword ptr [rsp+0x200]
+ vmovdqa ymm3, ymmword ptr [rsp]
+ vmovdqa ymm11, ymmword ptr [rsp+0x20]
+ vpblendd ymm3, ymm3, ymm2, 0x88
+ vpblendd ymm11, ymm11, ymm2, 0x88
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovdqa ymm10, ymm2
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm8, ymm8, ymm12
+ vmovdqa ymmword ptr [rsp+0x40], ymm4
+ nop
+ vmovdqa ymmword ptr [rsp+0x60], ymm12
+ nop
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vmovdqa ymmword ptr [rsp+0x80], ymm5
+ vmovdqa ymmword ptr [rsp+0xA0], ymm13
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm8, ymm8, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm11, ymm11, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpshufd ymm10, ymm10, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm8, ymm8, ymm15
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm8, ymm8, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm11, ymm11, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ vpshufd ymm10, ymm10, 0x93
+ dec al
+ je 9f
+ vmovdqa ymm4, ymmword ptr [rsp+0x40]
+ vmovdqa ymm5, ymmword ptr [rsp+0x80]
+ vshufps ymm12, ymm4, ymm5, 214
+ vpshufd ymm13, ymm4, 0x0F
+ vpshufd ymm4, ymm12, 0x39
+ vshufps ymm12, ymm6, ymm7, 250
+ vpblendd ymm13, ymm13, ymm12, 0xAA
+ vpunpcklqdq ymm12, ymm7, ymm5
+ vpblendd ymm12, ymm12, ymm6, 0x88
+ vpshufd ymm12, ymm12, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymmword ptr [rsp+0x40], ymm13
+ vmovdqa ymmword ptr [rsp+0x80], ymm12
+ vmovdqa ymm12, ymmword ptr [rsp+0x60]
+ vmovdqa ymm13, ymmword ptr [rsp+0xA0]
+ vshufps ymm5, ymm12, ymm13, 214
+ vpshufd ymm6, ymm12, 0x0F
+ vpshufd ymm12, ymm5, 0x39
+ vshufps ymm5, ymm14, ymm15, 250
+ vpblendd ymm6, ymm6, ymm5, 0xAA
+ vpunpcklqdq ymm5, ymm15, ymm13
+ vpblendd ymm5, ymm5, ymm14, 0x88
+ vpshufd ymm5, ymm5, 0x78
+ vpunpckhdq ymm13, ymm13, ymm15
+ vpunpckldq ymm14, ymm14, ymm13
+ vpshufd ymm15, ymm14, 0x1E
+ vmovdqa ymm13, ymm6
+ vmovdqa ymm14, ymm5
+ vmovdqa ymm5, ymmword ptr [rsp+0x40]
+ vmovdqa ymm6, ymmword ptr [rsp+0x80]
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ vpxor ymm8, ymm8, ymm10
+ vpxor ymm9, ymm9, ymm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovdqu xmmword ptr [rbx+0x40], xmm8
+ vmovdqu xmmword ptr [rbx+0x50], xmm9
+ vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01
+ vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01
+ vmovaps xmm8, xmmword ptr [rsp+0x280]
+ vmovaps xmm0, xmmword ptr [rsp+0x240]
+ vmovaps xmm1, xmmword ptr [rsp+0x250]
+ vmovaps xmm2, xmmword ptr [rsp+0x260]
+ vmovaps xmm3, xmmword ptr [rsp+0x270]
+ vblendvps xmm0, xmm0, xmm1, xmm8
+ vblendvps xmm2, xmm2, xmm3, xmm8
+ vmovaps xmmword ptr [rsp+0x240], xmm0
+ vmovaps xmmword ptr [rsp+0x260], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+3:
+ test rsi, 0x2
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovd xmm13, dword ptr [rsp+0x240]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovd xmm14, dword ptr [rsp+0x244]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vinserti128 ymm13, ymm13, xmm14, 0x01
+ vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
+ vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x200], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vpbroadcastd ymm8, dword ptr [rsp+0x200]
+ vpblendd ymm3, ymm13, ymm8, 0x88
+ vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ dec al
+ jz 9f
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0x0F
+ vpshufd ymm4, ymm8, 0x39
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0xAA
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 0x88
+ vpshufd ymm8, ymm8, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovaps ymm8, ymmword ptr [rsp+0x280]
+ vmovaps ymm0, ymmword ptr [rsp+0x240]
+ vmovups ymm1, ymmword ptr [rsp+0x248]
+ vmovaps ymm2, ymmword ptr [rsp+0x260]
+ vmovups ymm3, ymmword ptr [rsp+0x268]
+ vblendvps ymm0, ymm0, ymm1, ymm8
+ vblendvps ymm2, ymm2, ymm3, ymm8
+ vmovaps ymmword ptr [rsp+0x240], ymm0
+ vmovaps ymmword ptr [rsp+0x260], ymm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+3:
+ test rsi, 0x1
+ je 4b
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+0x10]
+ vmovd xmm3, dword ptr [rsp+0x240]
+ vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1
+ vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovdqa xmm14, xmmword ptr [ROT16+rip]
+ vmovdqa xmm15, xmmword ptr [ROT8+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovdqa xmm3, xmm13
+ vpinsrd xmm3, xmm3, eax, 3
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+.size zfs_blake3_hash_many_avx2, . - zfs_blake3_hash_many_avx2
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+
+.p2align 6
+ADD0:
+ .long 0, 1, 2, 3, 4, 5, 6, 7
+ADD1:
+ .long 8, 8, 8, 8, 8, 8, 8, 8
+BLAKE3_IV_0:
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:
+ .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ROT16:
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8:
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+CMP_MSB_MASK:
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+BLAKE3_IV:
+ .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+#endif /* HAVE_AVX2 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx512.S b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx512.S
new file mode 100644
index 000000000000..d02c5e7ec92f
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx512.S
@@ -0,0 +1,2618 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#if defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global zfs_blake3_hash_many_avx512
+.global zfs_blake3_compress_in_place_avx512
+.global zfs_blake3_compress_xof_avx512
+.text
+
+.type zfs_blake3_hash_many_avx512,@function
+.type zfs_blake3_compress_xof_avx512,@function
+.type zfs_blake3_compress_in_place_avx512,@function
+
+.p2align 6
+zfs_blake3_hash_many_avx512:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 144
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9
+ kmovw k1, r9d
+ vmovd xmm0, r8d
+ vpbroadcastd ymm0, xmm0
+ shr r8, 32
+ vmovd xmm1, r8d
+ vpbroadcastd ymm1, xmm1
+ vmovdqa ymm4, ymm1
+ vmovdqa ymm5, ymm1
+ vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip]
+ vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip]
+ vpcmpltud k2, ymm2, ymm0
+ vpcmpltud k3, ymm3, ymm0
+ vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
+ vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
+ knotw k2, k1
+ vmovdqa32 ymm2 {k2}, ymm0
+ vmovdqa32 ymm3 {k2}, ymm0
+ vmovdqa32 ymm4 {k2}, ymm1
+ vmovdqa32 ymm5 {k2}, ymm1
+ vmovdqa ymmword ptr [rsp], ymm2
+ vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3
+ vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4
+ vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5
+ shl rdx, 6
+ mov qword ptr [rsp+0x80], rdx
+ cmp rsi, 16
+ jc 3f
+2:
+ vpbroadcastd zmm0, dword ptr [rcx]
+ vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
+ vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
+ vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4]
+ vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4]
+ vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4]
+ vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4]
+ vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+.p2align 5
+9:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x80]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x88], eax
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x40]
+ mov r13, qword ptr [rdi+0x48]
+ mov r14, qword ptr [rdi+0x50]
+ mov r15, qword ptr [rdi+0x58]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+ vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+ vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vpunpcklqdq zmm8, zmm16, zmm17
+ vpunpckhqdq zmm9, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+ vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+ vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vpunpcklqdq zmm10, zmm18, zmm19
+ vpunpckhqdq zmm11, zmm18, zmm19
+ mov r8, qword ptr [rdi+0x20]
+ mov r9, qword ptr [rdi+0x28]
+ mov r10, qword ptr [rdi+0x30]
+ mov r11, qword ptr [rdi+0x38]
+ mov r12, qword ptr [rdi+0x60]
+ mov r13, qword ptr [rdi+0x68]
+ mov r14, qword ptr [rdi+0x70]
+ mov r15, qword ptr [rdi+0x78]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+ vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+ vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vpunpcklqdq zmm12, zmm16, zmm17
+ vpunpckhqdq zmm13, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+ vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+ vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vpunpcklqdq zmm14, zmm18, zmm19
+ vpunpckhqdq zmm15, zmm18, zmm19
+ vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
+ vmovdqa32 zmm31, zmmword ptr [INDEX1+rip]
+ vshufps zmm16, zmm8, zmm10, 136
+ vshufps zmm17, zmm12, zmm14, 136
+ vmovdqa32 zmm20, zmm16
+ vpermt2d zmm16, zmm27, zmm17
+ vpermt2d zmm20, zmm31, zmm17
+ vshufps zmm17, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm21, zmm17
+ vpermt2d zmm17, zmm27, zmm30
+ vpermt2d zmm21, zmm31, zmm30
+ vshufps zmm18, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm22, zmm18
+ vpermt2d zmm18, zmm27, zmm8
+ vpermt2d zmm22, zmm31, zmm8
+ vshufps zmm19, zmm9, zmm11, 221
+ vshufps zmm8, zmm13, zmm15, 221
+ vmovdqa32 zmm23, zmm19
+ vpermt2d zmm19, zmm27, zmm8
+ vpermt2d zmm23, zmm31, zmm8
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x40]
+ mov r13, qword ptr [rdi+0x48]
+ mov r14, qword ptr [rdi+0x50]
+ mov r15, qword ptr [rdi+0x58]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm8, zmm24, zmm25
+ vpunpckhqdq zmm9, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm10, zmm24, zmm25
+ vpunpckhqdq zmm11, zmm24, zmm25
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ mov r8, qword ptr [rdi+0x20]
+ mov r9, qword ptr [rdi+0x28]
+ mov r10, qword ptr [rdi+0x30]
+ mov r11, qword ptr [rdi+0x38]
+ mov r12, qword ptr [rdi+0x60]
+ mov r13, qword ptr [rdi+0x68]
+ mov r14, qword ptr [rdi+0x70]
+ mov r15, qword ptr [rdi+0x78]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm12, zmm24, zmm25
+ vpunpckhqdq zmm13, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm14, zmm24, zmm25
+ vpunpckhqdq zmm15, zmm24, zmm25
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ vshufps zmm24, zmm8, zmm10, 136
+ vshufps zmm30, zmm12, zmm14, 136
+ vmovdqa32 zmm28, zmm24
+ vpermt2d zmm24, zmm27, zmm30
+ vpermt2d zmm28, zmm31, zmm30
+ vshufps zmm25, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm29, zmm25
+ vpermt2d zmm25, zmm27, zmm30
+ vpermt2d zmm29, zmm31, zmm30
+ vshufps zmm26, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm30, zmm26
+ vpermt2d zmm26, zmm27, zmm8
+ vpermt2d zmm30, zmm31, zmm8
+ vshufps zmm8, zmm9, zmm11, 221
+ vshufps zmm10, zmm13, zmm15, 221
+ vpermi2d zmm27, zmm8, zmm10
+ vpermi2d zmm31, zmm8, zmm10
+ vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip]
+ vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip]
+ vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip]
+ vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip]
+ vmovdqa32 zmm12, zmmword ptr [rsp]
+ vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40]
+ vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4]
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm24
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm23
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm27
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm21
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm28
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm26
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm22
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm31
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpxord zmm0, zmm0, zmm8
+ vpxord zmm1, zmm1, zmm9
+ vpxord zmm2, zmm2, zmm10
+ vpxord zmm3, zmm3, zmm11
+ vpxord zmm4, zmm4, zmm12
+ vpxord zmm5, zmm5, zmm13
+ vpxord zmm6, zmm6, zmm14
+ vpxord zmm7, zmm7, zmm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 9b
+ mov rbx, qword ptr [rbp+0x50]
+ vpunpckldq zmm16, zmm0, zmm1
+ vpunpckhdq zmm17, zmm0, zmm1
+ vpunpckldq zmm18, zmm2, zmm3
+ vpunpckhdq zmm19, zmm2, zmm3
+ vpunpckldq zmm20, zmm4, zmm5
+ vpunpckhdq zmm21, zmm4, zmm5
+ vpunpckldq zmm22, zmm6, zmm7
+ vpunpckhdq zmm23, zmm6, zmm7
+ vpunpcklqdq zmm0, zmm16, zmm18
+ vpunpckhqdq zmm1, zmm16, zmm18
+ vpunpcklqdq zmm2, zmm17, zmm19
+ vpunpckhqdq zmm3, zmm17, zmm19
+ vpunpcklqdq zmm4, zmm20, zmm22
+ vpunpckhqdq zmm5, zmm20, zmm22
+ vpunpcklqdq zmm6, zmm21, zmm23
+ vpunpckhqdq zmm7, zmm21, zmm23
+ vshufi32x4 zmm16, zmm0, zmm4, 0x88
+ vshufi32x4 zmm17, zmm1, zmm5, 0x88
+ vshufi32x4 zmm18, zmm2, zmm6, 0x88
+ vshufi32x4 zmm19, zmm3, zmm7, 0x88
+ vshufi32x4 zmm20, zmm0, zmm4, 0xDD
+ vshufi32x4 zmm21, zmm1, zmm5, 0xDD
+ vshufi32x4 zmm22, zmm2, zmm6, 0xDD
+ vshufi32x4 zmm23, zmm3, zmm7, 0xDD
+ vshufi32x4 zmm0, zmm16, zmm17, 0x88
+ vshufi32x4 zmm1, zmm18, zmm19, 0x88
+ vshufi32x4 zmm2, zmm20, zmm21, 0x88
+ vshufi32x4 zmm3, zmm22, zmm23, 0x88
+ vshufi32x4 zmm4, zmm16, zmm17, 0xDD
+ vshufi32x4 zmm5, zmm18, zmm19, 0xDD
+ vshufi32x4 zmm6, zmm20, zmm21, 0xDD
+ vshufi32x4 zmm7, zmm22, zmm23, 0xDD
+ vmovdqu32 zmmword ptr [rbx], zmm0
+ vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1
+ vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2
+ vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3
+ vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4
+ vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5
+ vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6
+ vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7
+ vmovdqa32 zmm0, zmmword ptr [rsp]
+ vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40]
+ vmovdqa32 zmm2, zmm0
+ vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16}
+ vpcmpltud k2, zmm2, zmm0
+ vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
+ vmovdqa32 zmmword ptr [rsp], zmm2
+ vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
+ add rdi, 128
+ add rbx, 512
+ mov qword ptr [rbp+0x50], rbx
+ sub rsi, 16
+ cmp rsi, 16
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ vzeroupper
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 6
+3:
+ test esi, 0x8
+ je 3f
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+0x4]
+ vpbroadcastd ymm2, dword ptr [rcx+0x8]
+ vpbroadcastd ymm3, dword ptr [rcx+0xC]
+ vpbroadcastd ymm4, dword ptr [rcx+0x10]
+ vpbroadcastd ymm5, dword ptr [rcx+0x14]
+ vpbroadcastd ymm6, dword ptr [rcx+0x18]
+ vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x20]
+ mov r13, qword ptr [rdi+0x28]
+ mov r14, qword ptr [rdi+0x30]
+ mov r15, qword ptr [rdi+0x38]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+2:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x80]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x88], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm16, ymm12, ymm14, 136
+ vshufps ymm17, ymm12, ymm14, 221
+ vshufps ymm18, ymm13, ymm15, 136
+ vshufps ymm19, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm20, ymm12, ymm14, 136
+ vshufps ymm21, ymm12, ymm14, 221
+ vshufps ymm22, ymm13, ymm15, 136
+ vshufps ymm23, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm24, ymm12, ymm14, 136
+ vshufps ymm25, ymm12, ymm14, 221
+ vshufps ymm26, ymm13, ymm15, 136
+ vshufps ymm27, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm28, ymm12, ymm14, 136
+ vshufps ymm29, ymm12, ymm14, 221
+ vshufps ymm30, ymm13, ymm15, 136
+ vshufps ymm31, ymm13, ymm15, 221
+ vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip]
+ vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip]
+ vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip]
+ vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip]
+ vmovdqa ymm12, ymmword ptr [rsp]
+ vmovdqa ymm13, ymmword ptr [rsp+0x40]
+ vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpbroadcastd ymm15, dword ptr [rsp+0x88]
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm24
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm23
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm27
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm21
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm28
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm26
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm22
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm31
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 2b
+ mov rbx, qword ptr [rbp+0x50]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0xCC
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0xCC
+ vblendps ymm3, ymm12, ymm9, 0xCC
+ vperm2f128 ymm12, ymm1, ymm2, 0x20
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0xCC
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 0x20
+ vmovups ymmword ptr [rbx+0x20], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0xCC
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0xCC
+ vblendps ymm14, ymm14, ymm13, 0xCC
+ vperm2f128 ymm8, ymm10, ymm14, 0x20
+ vmovups ymmword ptr [rbx+0x40], ymm8
+ vblendps ymm15, ymm13, ymm15, 0xCC
+ vperm2f128 ymm13, ymm6, ymm15, 0x20
+ vmovups ymmword ptr [rbx+0x60], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 0x31
+ vperm2f128 ymm11, ymm3, ymm4, 0x31
+ vmovups ymmword ptr [rbx+0x80], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 0x31
+ vperm2f128 ymm15, ymm6, ymm15, 0x31
+ vmovups ymmword ptr [rbx+0xA0], ymm11
+ vmovups ymmword ptr [rbx+0xC0], ymm14
+ vmovups ymmword ptr [rbx+0xE0], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp]
+ vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20]
+ vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20]
+ vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20]
+ vmovdqa ymmword ptr [rsp], ymm0
+ vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
+ add rbx, 256
+ mov qword ptr [rbp+0x50], rbx
+ add rdi, 64
+ sub rsi, 8
+3:
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, qword ptr [rsp+0x80]
+ movzx r13, byte ptr [rbp+0x38]
+ movzx r12, byte ptr [rbp+0x48]
+ test esi, 0x4
+ je 3f
+ vbroadcasti32x4 zmm0, xmmword ptr [rcx]
+ vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10]
+ vmovdqa xmm12, xmmword ptr [rsp]
+ vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10]
+ vpunpckldq xmm14, xmm12, xmm13
+ vpunpckhdq xmm15, xmm12, xmm13
+ vpermq ymm14, ymm14, 0xDC
+ vpermq ymm15, ymm15, 0xDC
+ vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vinserti32x8 zmm13, zmm14, ymm15, 0x01
+ mov eax, 17476
+ kmovw k2, eax
+ vpblendmd zmm13 {k2}, zmm13, zmm12
+ vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov eax, 43690
+ kmovw k3, eax
+ mov eax, 34952
+ kmovw k4, eax
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x88], eax
+ vmovdqa32 zmm2, zmm15
+ vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4]
+ vpblendmd zmm3 {k4}, zmm13, zmm8
+ vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03
+ vmovups zmm9, zmmword ptr [r8+rdx-0x30]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03
+ vshufps zmm4, zmm8, zmm9, 136
+ vshufps zmm5, zmm8, zmm9, 221
+ vmovups zmm8, zmmword ptr [r8+rdx-0x20]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03
+ vmovups zmm9, zmmword ptr [r8+rdx-0x10]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03
+ vshufps zmm6, zmm8, zmm9, 136
+ vshufps zmm7, zmm8, zmm9, 221
+ vpshufd zmm6, zmm6, 0x93
+ vpshufd zmm7, zmm7, 0x93
+ mov al, 7
+9:
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 0x93
+ vpshufd zmm3, zmm3, 0x4E
+ vpshufd zmm2, zmm2, 0x39
+ vpaddd zmm0, zmm0, zmm6
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm7
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 0x39
+ vpshufd zmm3, zmm3, 0x4E
+ vpshufd zmm2, zmm2, 0x93
+ dec al
+ jz 9f
+ vshufps zmm8, zmm4, zmm5, 214
+ vpshufd zmm9, zmm4, 0x0F
+ vpshufd zmm4, zmm8, 0x39
+ vshufps zmm8, zmm6, zmm7, 250
+ vpblendmd zmm9 {k3}, zmm9, zmm8
+ vpunpcklqdq zmm8, zmm7, zmm5
+ vpblendmd zmm8 {k4}, zmm8, zmm6
+ vpshufd zmm8, zmm8, 0x78
+ vpunpckhdq zmm5, zmm5, zmm7
+ vpunpckldq zmm6, zmm6, zmm5
+ vpshufd zmm7, zmm6, 0x1E
+ vmovdqa32 zmm5, zmm9
+ vmovdqa32 zmm6, zmm8
+ jmp 9b
+9:
+ vpxord zmm0, zmm0, zmm2
+ vpxord zmm1, zmm1, zmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02
+ vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02
+ vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03
+ vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+0x40]
+ vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10]
+ vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+0x40], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+3:
+ test esi, 0x2
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovd xmm13, dword ptr [rsp]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovd xmm14, dword ptr [rsp+0x4]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vinserti128 ymm13, ymm13, xmm14, 0x01
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x88], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vpbroadcastd ymm8, dword ptr [rsp+0x88]
+ vpblendd ymm3, ymm13, ymm8, 0x88
+ vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ dec al
+ jz 9f
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0x0F
+ vpshufd ymm4, ymm8, 0x39
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0xAA
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 0x88
+ vpshufd ymm8, ymm8, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10]
+ vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8]
+ vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+0x10]
+ vmovd xmm14, dword ptr [rsp]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vpinsrd xmm3, xmm14, eax, 3
+ vmovdqa xmm2, xmm15
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+.p2align 6
+zfs_blake3_compress_in_place_avx512:
+ _CET_ENDBR
+ vmovdqu xmm0, xmmword ptr [rdi]
+ vmovdqu xmm1, xmmword ptr [rdi+0x10]
+ movzx eax, r8b
+ movzx edx, dl
+ shl rax, 32
+ add rdx, rax
+ vmovq xmm3, rcx
+ vmovq xmm4, rdx
+ vpunpcklqdq xmm3, xmm3, xmm4
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovups xmm8, xmmword ptr [rsi]
+ vmovups xmm9, xmmword ptr [rsi+0x10]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [rsi+0x20]
+ vmovups xmm9, xmmword ptr [rsi+0x30]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vmovdqu xmmword ptr [rdi], xmm0
+ vmovdqu xmmword ptr [rdi+0x10], xmm1
+ ret
+
+.p2align 6	# align function entry to a 64-byte boundary
+zfs_blake3_compress_xof_avx512:	# compress one 64-byte block and write the full 64-byte XOF output to [r9]; cv at [rdi] is read-only
+ _CET_ENDBR	# CET landing pad (expands to nothing when CET is unavailable)
+ vmovdqu xmm0, xmmword ptr [rdi]	# state rows 0-1 <- chaining value cv[0..7] (SysV args: rdi=cv, rsi=block, dl=block_len, rcx=counter, r8b=flags, r9=out)
+ vmovdqu xmm1, xmmword ptr [rdi+0x10]
+ movzx eax, r8b	# flags byte
+ movzx edx, dl	# block_len byte
+ shl rax, 32
+ add rdx, rax	# rdx = block_len | (flags << 32)
+ vmovq xmm3, rcx	# 64-bit block counter into row-3 low half
+ vmovq xmm4, rdx
+ vpunpcklqdq xmm3, xmm3, xmm4	# row 3 = {counter_lo, counter_hi, block_len, flags}
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]	# row 2 = first four IV words
+ vmovups xmm8, xmmword ptr [rsi]	# load the 64-byte message block m[0..15]
+ vmovups xmm9, xmmword ptr [rsi+0x10]
+ vshufps xmm4, xmm8, xmm9, 136	# 136=0b10001000: even words m0 m2 m4 m6
+ vshufps xmm5, xmm8, xmm9, 221	# 221=0b11011101: odd words m1 m3 m5 m7
+ vmovups xmm8, xmmword ptr [rsi+0x20]
+ vmovups xmm9, xmmword ptr [rsi+0x30]
+ vshufps xmm6, xmm8, xmm9, 136	# even words m8 m10 m12 m14
+ vshufps xmm7, xmm8, xmm9, 221	# odd words m9 m11 m13 m15
+ vpshufd xmm6, xmm6, 0x93	# pre-rotate lanes to match the round schedule
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7	# 7 compression rounds, counter kept in al
+9:	# ---- one round: column step, diagonalize, diagonal step, undiagonalize ----
+ vpaddd xmm0, xmm0, xmm4	# a += m(even)
+ vpaddd xmm0, xmm0, xmm1	# a += b
+ vpxord xmm3, xmm3, xmm0	# d ^= a
+ vprord xmm3, xmm3, 16	# d >>>= 16
+ vpaddd xmm2, xmm2, xmm3	# c += d
+ vpxord xmm1, xmm1, xmm2	# b ^= c
+ vprord xmm1, xmm1, 12	# b >>>= 12
+ vpaddd xmm0, xmm0, xmm5	# a += m(odd)
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8	# d >>>= 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7	# b >>>= 7
+ vpshufd xmm0, xmm0, 0x93	# rotate rows so the diagonals line up as columns
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6	# diagonal step: same G pattern with the next message words
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39	# rotate rows back to column orientation
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f	# fall out after the 7th round
+ vshufps xmm8, xmm4, xmm5, 214	# ---- permute message words xmm4-7 for the next round ----
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2	# output words 0..7: rows 0-1 ^ rows 2-3 (same as in-place feed-forward)
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm2, xmm2, [rdi]	# output words 8..15: rows 2-3 ^ original input cv (XOF extension)
+ vpxor xmm3, xmm3, [rdi+0x10]
+ vmovdqu xmmword ptr [r9], xmm0	# write all 64 output bytes to [r9]; [rdi] is not modified
+ vmovdqu xmmword ptr [r9+0x10], xmm1
+ vmovdqu xmmword ptr [r9+0x20], xmm2
+ vmovdqu xmmword ptr [r9+0x30], xmm3
+ ret
+
+.size zfs_blake3_hash_many_avx512, . - zfs_blake3_hash_many_avx512	# record symbol sizes for the three entry points above
+.size zfs_blake3_compress_in_place_avx512, . - zfs_blake3_compress_in_place_avx512
+.size zfs_blake3_compress_xof_avx512, . - zfs_blake3_compress_xof_avx512
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata	# read-only constant tables referenced by the code above
+#endif
+
+.p2align 6
+INDEX0:	# dword lane indices -- presumably vperm* transpose tables for hash_many (uses are earlier in the file, outside this view; confirm there)
+ .long 0, 1, 2, 3, 16, 17, 18, 19
+ .long 8, 9, 10, 11, 24, 25, 26, 27
+INDEX1:
+ .long 4, 5, 6, 7, 20, 21, 22, 23
+ .long 12, 13, 14, 15, 28, 29, 30, 31
+ADD0:	# per-lane offsets 0..15 -- presumably added to the base block counter in hash_many (uses outside this view)
+ .long 0, 1, 2, 3, 4, 5, 6, 7
+ .long 8, 9, 10, 11, 12, 13, 14, 15
+ADD1: .long 1	# scalar increment constants (broadcast at use sites)
+
+ADD16: .long 16
+BLAKE3_BLOCK_LEN:
+ .long 64	# BLAKE3 block size in bytes
+.p2align 6
+BLAKE3_IV:	# first four BLAKE3 IV words (identical to the SHA-256 H0..H3 constants)
+BLAKE3_IV_0:
+ .long 0x6A09E667
+BLAKE3_IV_1:
+ .long 0xBB67AE85
+BLAKE3_IV_2:
+ .long 0x3C6EF372
+BLAKE3_IV_3:
+ .long 0xA54FF53A
+
+#endif /* HAVE_AVX512 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits	# mark the stack non-executable on ELF targets
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S
new file mode 100644
index 000000000000..39d23ee233df
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S
@@ -0,0 +1,2323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#if defined(HAVE_SSE2) /* entire file compiles only when SSE2 support is configured */
+
+#define _ASM /* NOTE(review): presumably selects assembly-safe definitions in asm_linkage.h -- confirm against that header */
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>) /* Intel CET builds: pull in the _CET_ENDBR marker macro when available */
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR /* expands to nothing when CET headers are absent */
+#endif
+
+.intel_syntax noprefix /* Intel operand order, no % register prefixes */
+.global zfs_blake3_hash_many_sse2
+.global zfs_blake3_compress_in_place_sse2
+.global zfs_blake3_compress_xof_sse2
+
+.text
+.type zfs_blake3_hash_many_sse2,@function /* ELF symbol types for the three exported entry points */
+.type zfs_blake3_compress_in_place_sse2,@function
+.type zfs_blake3_compress_xof_sse2,@function
+
+ .p2align 6
+zfs_blake3_hash_many_sse2:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 360
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 0x00
+ movdqa xmmword ptr [rsp+0x130], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0+rip]
+ pand xmm0, xmmword ptr [ADD1+rip]
+ movdqa xmmword ptr [rsp+0x150], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 0x00
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+0x110], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 0x00
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ cmp rsi, 4
+ jc 3f
+2:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 0x00
+ pshufd xmm1, xmm3, 0x55
+ pshufd xmm2, xmm3, 0xAA
+ pshufd xmm3, xmm3, 0xFF
+ movdqu xmm7, xmmword ptr [rcx+0x10]
+ pshufd xmm4, xmm7, 0x00
+ pshufd xmm5, xmm7, 0x55
+ pshufd xmm6, xmm7, 0xAA
+ pshufd xmm7, xmm7, 0xFF
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+9:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+0x10], xmm9
+ movdqa xmmword ptr [rsp+0x20], xmm12
+ movdqa xmmword ptr [rsp+0x30], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x40], xmm8
+ movdqa xmmword ptr [rsp+0x50], xmm9
+ movdqa xmmword ptr [rsp+0x60], xmm12
+ movdqa xmmword ptr [rsp+0x70], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x80], xmm8
+ movdqa xmmword ptr [rsp+0x90], xmm9
+ movdqa xmmword ptr [rsp+0xA0], xmm12
+ movdqa xmmword ptr [rsp+0xB0], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0xC0], xmm8
+ movdqa xmmword ptr [rsp+0xD0], xmm9
+ movdqa xmmword ptr [rsp+0xE0], xmm12
+ movdqa xmmword ptr [rsp+0xF0], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+ movdqa xmm12, xmmword ptr [rsp+0x110]
+ movdqa xmm13, xmmword ptr [rsp+0x120]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 0x00
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x80]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x70]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xB0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x50]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xC0]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xA0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0x60]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xF0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne 9b
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+0x20], xmm1
+ movdqu xmmword ptr [rbx+0x40], xmm9
+ movdqu xmmword ptr [rbx+0x60], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+0x10], xmm4
+ movdqu xmmword ptr [rbx+0x30], xmm5
+ movdqu xmmword ptr [rbx+0x50], xmm9
+ movdqu xmmword ptr [rbx+0x70], xmm7
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+0x150]
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+0x120]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+0x120], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ test esi, 0x2
+ je 3f
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+0x110]
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+0x114]
+ movd xmm13, dword ptr [rsp+0x124]
+ punpckldq xmm14, xmm13
+ movaps xmmword ptr [rsp+0x10], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 0x93
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 0x93
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 0x93
+ shl rax, 0x20
+ or rax, 0x40
+ movq xmm3, rax
+ movdqa xmmword ptr [rsp+0x20], xmm3
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+0x10]
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+0x20], xmm4
+ movaps xmmword ptr [rsp+0x30], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+0x40], xmm5
+ movaps xmmword ptr [rsp+0x50], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm8, xmm8, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ pshufd xmm10, xmm10, 0x39
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm8, xmm8, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pshufd xmm10, xmm10, 0x93
+ dec al
+ je 9f
+ movdqa xmm12, xmmword ptr [rsp+0x20]
+ movdqa xmm5, xmmword ptr [rsp+0x40]
+ pshufd xmm13, xmm12, 0x0F
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 0x39
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm13, xmm12
+ movdqa xmmword ptr [rsp+0x20], xmm13
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ movdqa xmm13, xmm6
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm12, xmm13
+ pshufd xmm12, xmm12, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmmword ptr [rsp+0x40], xmm12
+ movdqa xmm5, xmmword ptr [rsp+0x30]
+ movdqa xmm13, xmmword ptr [rsp+0x50]
+ pshufd xmm6, xmm5, 0x0F
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 0x39
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm6, xmm5
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ movdqa xmmword ptr [rsp+0x30], xmm2
+ movdqa xmm2, xmm14
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm5, xmm2
+ movdqa xmm2, xmmword ptr [rsp+0x30]
+ pshufd xmm5, xmm5, 0x78
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 0x1E
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+0x20]
+ movdqa xmm6, xmmword ptr [rsp+0x40]
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ movups xmmword ptr [rbx+0x20], xmm8
+ movups xmmword ptr [rbx+0x30], xmm9
+ mov eax, dword ptr [rsp+0x130]
+ neg eax
+ mov r10d, dword ptr [rsp+0x110+8*rax]
+ mov r11d, dword ptr [rsp+0x120+8*rax]
+ mov dword ptr [rsp+0x110], r10d
+ mov dword ptr [rsp+0x120], r11d
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movd xmm13, dword ptr [rsp+0x110]
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ shl rax, 32
+ or rax, 64
+ movq xmm12, rax
+ movdqa xmm3, xmm13
+ punpcklqdq xmm3, xmm12
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+.p2align 6
+zfs_blake3_compress_in_place_sse2:
+ _CET_ENDBR
+ movups xmm0, xmmword ptr [rdi]
+ movups xmm1, xmmword ptr [rdi+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ shl r8, 32
+ add rdx, r8
+ movq xmm3, rcx
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ movups xmmword ptr [rdi], xmm0
+ movups xmmword ptr [rdi+0x10], xmm1
+ ret
+
+.p2align 6
+zfs_blake3_compress_xof_sse2:
+ _CET_ENDBR
+ movups xmm0, xmmword ptr [rdi]
+ movups xmm1, xmmword ptr [rdi+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movzx eax, r8b
+ movzx edx, dl
+ shl rax, 32
+ add rdx, rax
+ movq xmm3, rcx
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ movdqu xmm4, xmmword ptr [rdi]
+ movdqu xmm5, xmmword ptr [rdi+0x10]
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+ movups xmmword ptr [r9], xmm0
+ movups xmmword ptr [r9+0x10], xmm1
+ movups xmmword ptr [r9+0x20], xmm2
+ movups xmmword ptr [r9+0x30], xmm3
+ ret
+
+.size zfs_blake3_hash_many_sse2, . - zfs_blake3_hash_many_sse2
+.size zfs_blake3_compress_in_place_sse2, . - zfs_blake3_compress_in_place_sse2
+.size zfs_blake3_compress_xof_sse2, . - zfs_blake3_compress_xof_sse2
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align 6
+BLAKE3_IV:
+ .long 0x6A09E667, 0xBB67AE85
+ .long 0x3C6EF372, 0xA54FF53A
+ADD0:
+ .long 0, 1, 2, 3
+ADD1:
+ .long 4, 4, 4, 4
+BLAKE3_IV_0:
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:
+ .long 64, 64, 64, 64
+CMP_MSB_MASK:
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+PBLENDW_0x33_MASK:
+ .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xCC_MASK:
+ .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0x3F_MASK:
+ .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xC0_MASK:
+ .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
+
+#endif /* HAVE_SSE2 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S
new file mode 100644
index 000000000000..1c40236f0628
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S
@@ -0,0 +1,2058 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#if defined(HAVE_SSE4_1)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global zfs_blake3_compress_in_place_sse41
+.global zfs_blake3_compress_xof_sse41
+.global zfs_blake3_hash_many_sse41
+
+.text
+.type zfs_blake3_hash_many_sse41,@function
+.type zfs_blake3_compress_in_place_sse41,@function
+.type zfs_blake3_compress_xof_sse41,@function
+
+.p2align 6
+zfs_blake3_hash_many_sse41:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 360
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 0x00
+ movdqa xmmword ptr [rsp+0x130], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0+rip]
+ pand xmm0, xmmword ptr [ADD1+rip]
+ movdqa xmmword ptr [rsp+0x150], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 0x00
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+0x110], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 0x00
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ cmp rsi, 4
+ jc 3f
+2:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 0x00
+ pshufd xmm1, xmm3, 0x55
+ pshufd xmm2, xmm3, 0xAA
+ pshufd xmm3, xmm3, 0xFF
+ movdqu xmm7, xmmword ptr [rcx+0x10]
+ pshufd xmm4, xmm7, 0x00
+ pshufd xmm5, xmm7, 0x55
+ pshufd xmm6, xmm7, 0xAA
+ pshufd xmm7, xmm7, 0xFF
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+9:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+0x10], xmm9
+ movdqa xmmword ptr [rsp+0x20], xmm12
+ movdqa xmmword ptr [rsp+0x30], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x40], xmm8
+ movdqa xmmword ptr [rsp+0x50], xmm9
+ movdqa xmmword ptr [rsp+0x60], xmm12
+ movdqa xmmword ptr [rsp+0x70], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x80], xmm8
+ movdqa xmmword ptr [rsp+0x90], xmm9
+ movdqa xmmword ptr [rsp+0xA0], xmm12
+ movdqa xmmword ptr [rsp+0xB0], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0xC0], xmm8
+ movdqa xmmword ptr [rsp+0xD0], xmm9
+ movdqa xmmword ptr [rsp+0xE0], xmm12
+ movdqa xmmword ptr [rsp+0xF0], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+ movdqa xmm12, xmmword ptr [rsp+0x110]
+ movdqa xmm13, xmmword ptr [rsp+0x120]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 0x00
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x80]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x70]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xB0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x50]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xC0]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xA0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0x60]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xF0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne 9b
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+0x20], xmm1
+ movdqu xmmword ptr [rbx+0x40], xmm9
+ movdqu xmmword ptr [rbx+0x60], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+0x10], xmm4
+ movdqu xmmword ptr [rbx+0x30], xmm5
+ movdqu xmmword ptr [rbx+0x50], xmm9
+ movdqu xmmword ptr [rbx+0x70], xmm7
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+0x150]
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+0x120]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+0x120], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ test esi, 0x2
+ je 3f
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+0x110]
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+0x114]
+ pinsrd xmm14, dword ptr [rsp+0x124], 1
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmmword ptr [rsp+0x10], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 0x93
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 0x93
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 0x93
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+0x10]
+ pinsrd xmm3, eax, 3
+ pinsrd xmm11, eax, 3
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+0x20], xmm4
+ movaps xmmword ptr [rsp+0x30], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm12, xmmword ptr [ROT16+rip]
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+0x40], xmm5
+ movaps xmmword ptr [rsp+0x50], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm13, xmmword ptr [ROT8+rip]
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm8, xmm8, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ pshufd xmm10, xmm10, 0x39
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm8, xmm8, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pshufd xmm10, xmm10, 0x93
+ dec al
+ je 9f
+ movdqa xmm12, xmmword ptr [rsp+0x20]
+ movdqa xmm5, xmmword ptr [rsp+0x40]
+ pshufd xmm13, xmm12, 0x0F
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 0x39
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pblendw xmm13, xmm12, 0xCC
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ pblendw xmm12, xmm6, 0xC0
+ pshufd xmm12, xmm12, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmmword ptr [rsp+0x20], xmm13
+ movdqa xmmword ptr [rsp+0x40], xmm12
+ movdqa xmm5, xmmword ptr [rsp+0x30]
+ movdqa xmm13, xmmword ptr [rsp+0x50]
+ pshufd xmm6, xmm5, 0x0F
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 0x39
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pblendw xmm6, xmm5, 0xCC
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ pblendw xmm5, xmm14, 0xC0
+ pshufd xmm5, xmm5, 0x78
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 0x1E
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+0x20]
+ movdqa xmm6, xmmword ptr [rsp+0x40]
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ movups xmmword ptr [rbx+0x20], xmm8
+ movups xmmword ptr [rbx+0x30], xmm9
+ movdqa xmm0, xmmword ptr [rsp+0x130]
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm2, xmmword ptr [rsp+0x120]
+ movdqu xmm3, xmmword ptr [rsp+0x118]
+ movdqu xmm4, xmmword ptr [rsp+0x128]
+ blendvps xmm1, xmm3, xmm0
+ blendvps xmm2, xmm4, xmm0
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movd xmm13, dword ptr [rsp+0x110]
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm3, xmm13
+ pinsrd xmm3, eax, 3
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+# zfs_blake3_compress_in_place_sse41: run the BLAKE3 compression function on a
+# single 64-byte message block using SSE4.1, updating the 32-byte chaining
+# value in place (the first 32 bytes of state are XOR-folded and stored back
+# over the input CV; the second half of the state is discarded).
+# NOTE(review): per the SysV AMD64 ABI the arguments appear to be
+#   rdi = 32-byte CV (read and overwritten), rsi = 64-byte block,
+#   rdx = block length, rcx = 64-bit counter, r8 = flags
+# — confirm against the C prototype declared elsewhere in the module.
+.p2align 6
+zfs_blake3_compress_in_place_sse41:
+ _CET_ENDBR
+# Load state rows: xmm0/xmm1 = CV words 0-7, xmm2 = first four IV constants.
+ movups xmm0, xmmword ptr [rdi]
+ movups xmm1, xmmword ptr [rdi+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+# Pack row 3 of the state: low qword = 64-bit counter (rcx),
+# high qword = block length | flags<<32 (rdx after merging r8).
+ shl r8, 32
+ add rdx, r8
+ movq xmm3, rcx
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4
+# Load the 64-byte message and de-interleave it with shufps/pshufd into the
+# column order the vectorized G-function rounds below expect.
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+# Byte-shuffle masks implementing rotate-right by 8 and by 16 via pshufb.
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+# al counts the 7 BLAKE3 rounds.
+ mov al, 7
+9:
+# One full round: two column/diagonal G passes over the 4x4 state.
+# Rotations by 16 and 8 use pshufb; 12 and 7 use the pslld/psrld/por pair.
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+# Diagonalize the state rows for the diagonal G pass.
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+# Un-diagonalize back to row form.
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+# Between rounds: apply the BLAKE3 message-word permutation to
+# xmm4-xmm7 in registers (shufps/pshufd/pblendw network) instead of
+# re-reading a permuted schedule from memory.
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+# Feed-forward: fold the second half of the state into the first and store
+# the new chaining value back over the caller's CV buffer.
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ movups xmmword ptr [rdi], xmm0
+ movups xmmword ptr [rdi+0x10], xmm1
+ ret
+.p2align 6
+zfs_blake3_compress_xof_sse41:
+ _CET_ENDBR
+ movups xmm0, xmmword ptr [rdi]
+ movups xmm1, xmmword ptr [rdi+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movzx eax, r8b
+ movzx edx, dl
+ shl rax, 32
+ add rdx, rax
+ movq xmm3, rcx
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ movdqu xmm4, xmmword ptr [rdi]
+ movdqu xmm5, xmmword ptr [rdi+0x10]
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+ movups xmmword ptr [r9], xmm0
+ movups xmmword ptr [r9+0x10], xmm1
+ movups xmmword ptr [r9+0x20], xmm2
+ movups xmmword ptr [r9+0x30], xmm3
+ ret
+
+.size zfs_blake3_hash_many_sse41, . - zfs_blake3_hash_many_sse41
+.size zfs_blake3_compress_in_place_sse41, . - zfs_blake3_compress_in_place_sse41
+.size zfs_blake3_compress_xof_sse41, . - zfs_blake3_compress_xof_sse41
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align 6
+BLAKE3_IV:
+ .long 0x6A09E667, 0xBB67AE85
+ .long 0x3C6EF372, 0xA54FF53A
+ROT16:
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8:
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+ADD0:
+ .long 0, 1, 2, 3
+ADD1:
+ .long 4, 4, 4, 4
+BLAKE3_IV_0:
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:
+ .long 64, 64, 64, 64
+CMP_MSB_MASK:
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+
+#endif /* HAVE_SSE4_1 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
index c4d5f8761f5a..1f139ea5b807 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
@@ -149,6 +149,13 @@ freebsd_zfs_crypt_done(struct cryptop *crp)
return (0);
}
+static int
+freebsd_zfs_crypt_done_sync(struct cryptop *crp)
+{
+
+ return (0);
+}
+
void
freebsd_crypt_freesession(freebsd_crypt_session_t *sess)
{
@@ -158,26 +165,36 @@ freebsd_crypt_freesession(freebsd_crypt_session_t *sess)
}
static int
-zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp)
+zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp)
{
int error;
crp->crp_opaque = session;
- crp->crp_callback = freebsd_zfs_crypt_done;
for (;;) {
+#if __FreeBSD_version < 1400004
+ boolean_t async = ((crypto_ses2caps(crp->crp_session) &
+ CRYPTOCAP_F_SYNC) == 0);
+#else
+ boolean_t async = !CRYPTO_SESS_SYNC(crp->crp_session);
+#endif
+ crp->crp_callback = async ? freebsd_zfs_crypt_done :
+ freebsd_zfs_crypt_done_sync;
error = crypto_dispatch(crp);
- if (error)
- break;
- mtx_lock(&session->fs_lock);
- while (session->fs_done == false)
- msleep(crp, &session->fs_lock, 0,
- "zfs_crypto", 0);
- mtx_unlock(&session->fs_lock);
+ if (error == 0) {
+ if (async) {
+ mtx_lock(&session->fs_lock);
+ while (session->fs_done == false) {
+ msleep(crp, &session->fs_lock, 0,
+ "zfs_crypto", 0);
+ }
+ mtx_unlock(&session->fs_lock);
+ }
+ error = crp->crp_etype;
+ }
- if (crp->crp_etype == ENOMEM) {
+ if (error == ENOMEM) {
pause("zcrnomem", 1);
- } else if (crp->crp_etype != EAGAIN) {
- error = crp->crp_etype;
+ } else if (error != EAGAIN) {
break;
}
crp->crp_etype = 0;
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
index f99a2f966660..5179100d1665 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
@@ -780,8 +780,13 @@ spl_init(void)
if ((rc = spl_zlib_init()))
goto out7;
+ if ((rc = spl_zone_init()))
+ goto out8;
+
return (rc);
+out8:
+ spl_zlib_fini();
out7:
spl_kstat_fini();
out6:
@@ -801,6 +806,7 @@ out1:
static void __exit
spl_fini(void)
{
+ spl_zone_fini();
spl_zlib_fini();
spl_kstat_fini();
spl_proc_fini();
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
new file mode 100644
index 000000000000..b8a8b7cd8cd8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2021 Klara Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/mutex.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <linux/file.h>
+#include <linux/magic.h>
+#include <sys/zone.h>
+
+#if defined(CONFIG_USER_NS)
+#include <linux/statfs.h>
+#include <linux/proc_ns.h>
+#endif
+
+static kmutex_t zone_datasets_lock;
+static struct list_head zone_datasets;
+
+typedef struct zone_datasets {
+ struct list_head zds_list; /* zone_datasets linkage */
+ struct user_namespace *zds_userns; /* namespace reference */
+ struct list_head zds_datasets; /* datasets for the namespace */
+} zone_datasets_t;
+
+typedef struct zone_dataset {
+ struct list_head zd_list; /* zone_dataset linkage */
+ size_t zd_dsnamelen; /* length of name */
+ char zd_dsname[0]; /* name of the member dataset */
+} zone_dataset_t;
+
+#if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
+/*
+ * Returns:
+ * - 0 on success
+ * - EBADF if it cannot open the provided file descriptor
+ * - ENOTTY if the file itself is not a user namespace file. We want to
+ * intercept this error in the ZFS layer. We cannot just return one of the
+ * ZFS_ERR_* errors here as we want to preserve the separation of the ZFS
+ * and the SPL layers.
+ */
+static int
+user_ns_get(int fd, struct user_namespace **userns)
+{
+ struct kstatfs st;
+ struct file *nsfile;
+ struct ns_common *ns;
+ int error;
+
+ if ((nsfile = fget(fd)) == NULL)
+ return (EBADF);
+ if (vfs_statfs(&nsfile->f_path, &st) != 0) {
+ error = ENOTTY;
+ goto done;
+ }
+ if (st.f_type != NSFS_MAGIC) {
+ error = ENOTTY;
+ goto done;
+ }
+ ns = get_proc_ns(file_inode(nsfile));
+ if (ns->ops->type != CLONE_NEWUSER) {
+ error = ENOTTY;
+ goto done;
+ }
+ *userns = container_of(ns, struct user_namespace, ns);
+
+ error = 0;
+done:
+ fput(nsfile);
+
+ return (error);
+}
+#endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
+
+static unsigned int
+user_ns_zoneid(struct user_namespace *user_ns)
+{
+ unsigned int r;
+
+#if defined(HAVE_USER_NS_COMMON_INUM)
+ r = user_ns->ns.inum;
+#else
+ r = user_ns->proc_inum;
+#endif
+
+ return (r);
+}
+
+static struct zone_datasets *
+zone_datasets_lookup(unsigned int nsinum)
+{
+ zone_datasets_t *zds;
+
+ list_for_each_entry(zds, &zone_datasets, zds_list) {
+ if (user_ns_zoneid(zds->zds_userns) == nsinum)
+ return (zds);
+ }
+ return (NULL);
+}
+
+#if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
+static struct zone_dataset *
+zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen)
+{
+ zone_dataset_t *zd;
+
+ list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
+ if (zd->zd_dsnamelen != dsnamelen)
+ continue;
+ if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0)
+ return (zd);
+ }
+
+ return (NULL);
+}
+
+static int
+zone_dataset_cred_check(cred_t *cred)
+{
+
+ if (!uid_eq(cred->uid, GLOBAL_ROOT_UID))
+ return (EPERM);
+
+ return (0);
+}
+#endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
+
+static int
+zone_dataset_name_check(const char *dataset, size_t *dsnamelen)
+{
+
+ if (dataset[0] == '\0' || dataset[0] == '/')
+ return (ENOENT);
+
+ *dsnamelen = strlen(dataset);
+ /* Ignore trailing slash, if supplied. */
+ if (dataset[*dsnamelen - 1] == '/')
+ (*dsnamelen)--;
+
+ return (0);
+}
+
+int
+zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)
+{
+#if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
+ struct user_namespace *userns;
+ zone_datasets_t *zds;
+ zone_dataset_t *zd;
+ int error;
+ size_t dsnamelen;
+
+ if ((error = zone_dataset_cred_check(cred)) != 0)
+ return (error);
+ if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
+ return (error);
+ if ((error = user_ns_get(userns_fd, &userns)) != 0)
+ return (error);
+
+ mutex_enter(&zone_datasets_lock);
+ zds = zone_datasets_lookup(user_ns_zoneid(userns));
+ if (zds == NULL) {
+ zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP);
+ INIT_LIST_HEAD(&zds->zds_list);
+ INIT_LIST_HEAD(&zds->zds_datasets);
+ zds->zds_userns = userns;
+ /*
+ * Lock the namespace by increasing its refcount to prevent
+ * the namespace ID from being reused.
+ */
+ get_user_ns(userns);
+ list_add_tail(&zds->zds_list, &zone_datasets);
+ } else {
+ zd = zone_dataset_lookup(zds, dataset, dsnamelen);
+ if (zd != NULL) {
+ mutex_exit(&zone_datasets_lock);
+ return (EEXIST);
+ }
+ }
+
+ zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);
+ zd->zd_dsnamelen = dsnamelen;
+ strncpy(zd->zd_dsname, dataset, dsnamelen);
+ zd->zd_dsname[dsnamelen] = '\0';
+ INIT_LIST_HEAD(&zd->zd_list);
+ list_add_tail(&zd->zd_list, &zds->zds_datasets);
+
+ mutex_exit(&zone_datasets_lock);
+ return (0);
+#else
+ return (ENXIO);
+#endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
+}
+EXPORT_SYMBOL(zone_dataset_attach);
+
+int
+zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)
+{
+#if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
+ struct user_namespace *userns;
+ zone_datasets_t *zds;
+ zone_dataset_t *zd;
+ int error;
+ size_t dsnamelen;
+
+ if ((error = zone_dataset_cred_check(cred)) != 0)
+ return (error);
+ if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
+ return (error);
+ if ((error = user_ns_get(userns_fd, &userns)) != 0)
+ return (error);
+
+ mutex_enter(&zone_datasets_lock);
+ zds = zone_datasets_lookup(user_ns_zoneid(userns));
+ if (zds != NULL)
+ zd = zone_dataset_lookup(zds, dataset, dsnamelen);
+ if (zds == NULL || zd == NULL) {
+ mutex_exit(&zone_datasets_lock);
+ return (ENOENT);
+ }
+
+ list_del(&zd->zd_list);
+ kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
+
+ /* Prune the namespace entry if it has no more delegations. */
+ if (list_empty(&zds->zds_datasets)) {
+ /*
+ * Decrease the refcount now that the namespace is no longer
+ * used. It is no longer necessary to prevent the namespace ID
+ * from being reused.
+ */
+ put_user_ns(userns);
+ list_del(&zds->zds_list);
+ kmem_free(zds, sizeof (*zds));
+ }
+
+ mutex_exit(&zone_datasets_lock);
+ return (0);
+#else
+ return (ENXIO);
+#endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
+}
+EXPORT_SYMBOL(zone_dataset_detach);
+
+/*
+ * A dataset is visible if:
+ * - It is a parent of a namespace entry.
+ * - It is one of the namespace entries.
+ * - It is a child of a namespace entry.
+ *
+ * A dataset is writable if:
+ * - It is one of the namespace entries.
+ * - It is a child of a namespace entry.
+ *
+ * The parent datasets of namespace entries are visible and
+ * read-only to provide a path back to the root of the pool.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+ zone_datasets_t *zds;
+ zone_dataset_t *zd;
+ size_t dsnamelen, zd_len;
+ int visible;
+
+ /* Default to read-only, in case visible is returned. */
+ if (write != NULL)
+ *write = 0;
+ if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
+ return (0);
+ if (INGLOBALZONE(curproc)) {
+ if (write != NULL)
+ *write = 1;
+ return (1);
+ }
+
+ mutex_enter(&zone_datasets_lock);
+ zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
+ if (zds == NULL) {
+ mutex_exit(&zone_datasets_lock);
+ return (0);
+ }
+
+ visible = 0;
+ list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
+ zd_len = strlen(zd->zd_dsname);
+ if (zd_len > dsnamelen) {
+ /*
+ * The name of the namespace entry is longer than that
+ * of the dataset, so it could be that the dataset is a
+ * parent of the namespace entry.
+ */
+ visible = memcmp(zd->zd_dsname, dataset,
+ dsnamelen) == 0 &&
+ zd->zd_dsname[dsnamelen] == '/';
+ if (visible)
+ break;
+ } else if (zd_len == dsnamelen) {
+ /*
+ * The name of the namespace entry is as long as that
+ * of the dataset, so perhaps the dataset itself is the
+ * namespace entry.
+ */
+ visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0;
+ if (visible) {
+ if (write != NULL)
+ *write = 1;
+ break;
+ }
+ } else {
+ /*
+ * The name of the namespace entry is shorter than that
+ * of the dataset, so perhaps the dataset is a child of
+ * the namespace entry.
+ */
+ visible = memcmp(zd->zd_dsname, dataset,
+ zd_len) == 0 && dataset[zd_len] == '/';
+ if (visible) {
+ if (write != NULL)
+ *write = 1;
+ break;
+ }
+ }
+ }
+
+ mutex_exit(&zone_datasets_lock);
+ return (visible);
+}
+EXPORT_SYMBOL(zone_dataset_visible);
+
+unsigned int
+global_zoneid(void)
+{
+ unsigned int z = 0;
+
+#if defined(CONFIG_USER_NS)
+ z = user_ns_zoneid(&init_user_ns);
+#endif
+
+ return (z);
+}
+EXPORT_SYMBOL(global_zoneid);
+
+unsigned int
+crgetzoneid(const cred_t *cr)
+{
+ unsigned int r = 0;
+
+#if defined(CONFIG_USER_NS)
+ r = user_ns_zoneid(cr->user_ns);
+#endif
+
+ return (r);
+}
+EXPORT_SYMBOL(crgetzoneid);
+
+boolean_t
+inglobalzone(proc_t *proc)
+{
+#if defined(CONFIG_USER_NS)
+ return (proc->cred->user_ns == &init_user_ns);
+#else
+ return (B_TRUE);
+#endif
+}
+EXPORT_SYMBOL(inglobalzone);
+
+int
+spl_zone_init(void)
+{
+ mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);
+ INIT_LIST_HEAD(&zone_datasets);
+ return (0);
+}
+
+void
+spl_zone_fini(void)
+{
+ zone_datasets_t *zds;
+ zone_dataset_t *zd;
+
+ /*
+ * It would be better to assert an empty zone_datasets, but since
+ * there's no automatic mechanism for cleaning them up if the user
+ * namespace is destroyed, just do it here, since spl is about to go
+ * out of context.
+ */
+ while (!list_empty(&zone_datasets)) {
+ zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list);
+ while (!list_empty(&zds->zds_datasets)) {
+ zd = list_entry(zds->zds_datasets.next,
+ zone_dataset_t, zd_list);
+ list_del(&zd->zd_list);
+ kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
+ put_user_ns(zds->zds_userns);
+ }
+ list_del(&zds->zds_list);
+ kmem_free(zds, sizeof (*zds));
+ }
+ mutex_destroy(&zone_datasets_lock);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
index 5a52092bb90a..ab00d2ae14d2 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
@@ -61,7 +61,7 @@ priv_policy_ns(const cred_t *cr, int capability, int err,
static int
priv_policy(const cred_t *cr, int capability, int err)
{
- return (priv_policy_ns(cr, capability, err, NULL));
+ return (priv_policy_ns(cr, capability, err, cr->user_ns));
}
static int
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
index c65702e1a053..67b864aa77a9 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
@@ -37,6 +37,7 @@
* Copyright 2017 RackTop Systems.
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
* Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2021 Klara, Inc.
*/
#include <sys/types.h>
@@ -150,6 +151,48 @@ out:
}
+static int
+zfs_ioc_userns_attach(zfs_cmd_t *zc)
+{
+ int error;
+
+ if (zc == NULL)
+ return (SET_ERROR(EINVAL));
+
+ error = zone_dataset_attach(CRED(), zc->zc_name, zc->zc_cleanup_fd);
+
+ /*
+ * Translate ENOTTY to ZFS_ERR_NOT_USER_NAMESPACE as we just arrived
+ * back from the SPL layer, which does not know about ZFS_ERR_* errors.
+ * See the comment at the user_ns_get() function in spl-zone.c for
+ * details.
+ */
+ if (error == ENOTTY)
+ error = ZFS_ERR_NOT_USER_NAMESPACE;
+
+ return (error);
+}
+
+static int
+zfs_ioc_userns_detach(zfs_cmd_t *zc)
+{
+ int error;
+
+ if (zc == NULL)
+ return (SET_ERROR(EINVAL));
+
+ error = zone_dataset_detach(CRED(), zc->zc_name, zc->zc_cleanup_fd);
+
+ /*
+ * See the comment in zfs_ioc_userns_attach() for details on what is
+ * going on here.
+ */
+ if (error == ENOTTY)
+ error = ZFS_ERR_NOT_USER_NAMESPACE;
+
+ return (error);
+}
+
uint64_t
zfs_max_nvlist_src_size_os(void)
{
@@ -168,6 +211,10 @@ zfs_ioctl_update_mount_cache(const char *dsname)
void
zfs_ioctl_init_os(void)
{
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERNS_ATTACH,
+ zfs_ioc_userns_attach, zfs_secpolicy_config, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERNS_DETACH,
+ zfs_ioc_userns_detach, zfs_secpolicy_config, POOL_CHECK_NONE);
}
#ifdef CONFIG_COMPAT
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
index 4f31bcb5959d..abb6dbe67cdf 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
@@ -126,7 +126,7 @@ zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
}
static int
-zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
const struct bio_vec *bv = uio->uio_bvec;
size_t skip = uio->uio_skip;
@@ -137,10 +137,13 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
cnt = MIN(bv->bv_len - skip, n);
paddr = zfs_kmap_atomic(bv->bv_page);
- if (rw == UIO_READ)
+ if (rw == UIO_READ) {
+ /* Copy from buffer 'p' to the bvec data */
memcpy(paddr + bv->bv_offset + skip, p, cnt);
- else
+ } else {
+ /* Copy from bvec data to buffer 'p' */
memcpy(p, paddr + bv->bv_offset + skip, cnt);
+ }
zfs_kunmap_atomic(paddr);
skip += cnt;
@@ -158,6 +161,141 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
return (0);
}
+#ifdef HAVE_BLK_MQ
+static void
+zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
+ struct bio_vec *bv)
+{
+ void *paddr;
+
+ paddr = zfs_kmap_atomic(bv->bv_page);
+ if (rw == UIO_READ) {
+ /* Copy from buffer 'p' to the bvec data */
+ memcpy(paddr + bv->bv_offset + skip, p, cnt);
+ } else {
+ /* Copy from bvec data to buffer 'p' */
+ memcpy(p, paddr + bv->bv_offset + skip, cnt);
+ }
+ zfs_kunmap_atomic(paddr);
+}
+
+/*
+ * Copy 'n' bytes of data between the buffer p[] and the data represented
+ * by the request in the uio.
+ */
+static int
+zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ struct request *rq = uio->rq;
+ struct bio_vec bv;
+ struct req_iterator iter;
+ size_t this_seg_start; /* logical offset */
+ size_t this_seg_end; /* logical offset */
+ size_t skip_in_seg;
+ size_t copy_from_seg;
+ size_t orig_loffset;
+ int copied = 0;
+
+ /*
+ * Get the original logical offset of this entire request (because
+ * uio->uio_loffset will be modified over time).
+ */
+ orig_loffset = io_offset(NULL, rq);
+ this_seg_start = orig_loffset;
+
+ rq_for_each_segment(bv, rq, iter) {
+ if (uio->iter.bio) {
+ /*
+ * If uio->iter.bio is present, then we know we've saved
+ * uio->iter from a previous call to this function, and
+ * we can skip ahead in this rq_for_each_segment() loop
+ * to where we last left off. That way, we don't need
+ * to iterate over tons of segments we've already
+ * processed - we can just restore the "saved state".
+ */
+ iter = uio->iter;
+ bv = uio->bv;
+ this_seg_start = uio->uio_loffset;
+ memset(&uio->iter, 0, sizeof (uio->iter));
+ continue;
+ }
+
+ /*
+ * Lookup what the logical offset of the last byte of this
+ * segment is.
+ */
+ this_seg_end = this_seg_start + bv.bv_len - 1;
+
+ /*
+ * We only need to operate on segments that have data we're
+ * copying.
+ */
+ if (uio->uio_loffset >= this_seg_start &&
+ uio->uio_loffset <= this_seg_end) {
+ /*
+ * Some, or all, of the data in this segment needs to be
+ * copied.
+ */
+
+ /*
+ * We may not be copying from the first byte in the
+ * segment. Figure out how many bytes to skip copying
+ * from the beginning of this segment.
+ */
+ skip_in_seg = uio->uio_loffset - this_seg_start;
+
+ /*
+ * Calculate the total number of bytes from this
+ * segment that we will be copying.
+ */
+ copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);
+
+ /* Copy the bytes */
+ zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
+ p = ((char *)p) + copy_from_seg;
+
+ n -= copy_from_seg;
+ uio->uio_resid -= copy_from_seg;
+ uio->uio_loffset += copy_from_seg;
+ copied = 1; /* We copied some data */
+ }
+
+ if (n == 0) {
+ /*
+ * All done copying. Save our 'iter' value to the uio.
+ * This allows us to "save our state" and skip ahead in
+ * the rq_for_each_segment() loop the next time we call
+ * zfs_uiomove_bvec_rq() on this uio (which we
+ * will be doing for any remaining data in the uio).
+ */
+ uio->iter = iter; /* make a copy of the struct data */
+ uio->bv = bv;
+ return (0);
+ }
+
+ this_seg_start = this_seg_end + 1;
+ }
+
+ if (!copied) {
+ /* Didn't copy anything */
+ uio->uio_resid = 0;
+ }
+ return (0);
+}
+#endif
+
+static int
+zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+#ifdef HAVE_BLK_MQ
+ if (uio->rq != NULL)
+ return (zfs_uiomove_bvec_rq(p, n, rw, uio));
+#else
+ ASSERT3P(uio->rq, ==, NULL);
+#endif
+ return (zfs_uiomove_bvec_impl(p, n, rw, uio));
+}
+
#if defined(HAVE_VFS_IOV_ITER)
static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
@@ -300,8 +438,14 @@ zfs_uioskip(zfs_uio_t *uio, size_t n)
{
if (n > uio->uio_resid)
return;
-
- if (uio->uio_segflg == UIO_BVEC) {
+ /*
+ * When using a uio with a struct request, we simply
+ * use uio_loffset as a pointer to the next logical byte to
+ * copy in the request. We don't have to do any fancy
+ * accounting with uio_bvec/uio_iovcnt since we don't use
+ * them.
+ */
+ if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
uio->uio_skip += n;
while (uio->uio_iovcnt &&
uio->uio_skip >= uio->uio_bvec->bv_len) {
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index 81a059651e8a..a67ba821d06f 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -1453,14 +1453,34 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
int error = 0;
zfsvfs_t *zfsvfs = NULL;
vfs_t *vfs = NULL;
+ int canwrite;
+ int dataset_visible_zone;
ASSERT(zm);
ASSERT(osname);
+ dataset_visible_zone = zone_dataset_visible(osname, &canwrite);
+
+ /*
+ * Refuse to mount a filesystem if we are in a namespace and the
+ * dataset is not visible or writable in that namespace.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ (!dataset_visible_zone || !canwrite)) {
+ return (SET_ERROR(EPERM));
+ }
+
error = zfsvfs_parse_options(zm->mnt_data, &vfs);
if (error)
return (error);
+ /*
+ * If a non-writable filesystem is being mounted without the
+ * read-only flag, pretend it was set, as done for snapshots.
+ */
+ if (!canwrite)
+ vfs->vfs_readonly = true;
+
error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
if (error) {
zfsvfs_vfs_free(vfs);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
index f4e1ab7aef08..d5c222120a9d 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -32,6 +32,9 @@
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dataset.h>
+#include <sys/zap.h>
/*
* Common open routine. Disallow any write access.
@@ -411,6 +414,20 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
#endif
stat->nlink = stat->size = 2;
+
+ dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+ uint64_t snap_count;
+ int err = zap_count(
+ dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+ if (err != 0) {
+ ZPL_EXIT(zfsvfs);
+ return (-err);
+ }
+ stat->nlink += snap_count;
+ }
+
stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
stat->atime = current_time(ip);
ZPL_EXIT(zfsvfs);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index c2fd3fee1401..b18efde9b18a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -360,6 +360,7 @@ const struct super_operations zpl_super_operations = {
struct file_system_type zpl_fs_type = {
.owner = THIS_MODULE,
.name = ZFS_DRIVER,
+ .fs_flags = FS_USERNS_MOUNT,
.mount = zpl_mount,
.kill_sb = zpl_kill_sb,
};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
index c53bf3c2ab25..c30ce1c98439 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -83,6 +83,7 @@
#include <sys/zap.h>
#include <sys/vfs.h>
#include <sys/zpl.h>
+#include <linux/vfs_compat.h>
enum xattr_permission {
XAPERM_DENY,
@@ -1495,7 +1496,9 @@ zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len)
return (perm);
}
-#if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY)
+#if defined(CONFIG_FS_POSIX_ACL) && \
+ (!defined(HAVE_POSIX_ACL_RELEASE) || \
+ defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY))
struct acl_rel_struct {
struct acl_rel_struct *next;
struct posix_acl *acl;
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index 39441700ae8c..acbab55d03ef 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -41,20 +41,77 @@
#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
+#ifdef HAVE_BLK_MQ
+#include <linux/blk-mq.h>
+#endif
+
+static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
+ struct request *rq, boolean_t force_sync);
+
static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;
-static unsigned int zvol_threads = 32;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static const unsigned int zvol_open_timeout_ms = 1000;
#endif
+static unsigned int zvol_threads = 0;
+#ifdef HAVE_BLK_MQ
+static unsigned int zvol_blk_mq_threads = 0;
+static unsigned int zvol_blk_mq_actual_threads;
+static boolean_t zvol_use_blk_mq = B_FALSE;
+
+/*
+ * The maximum number of volblocksize blocks to process per thread. Typically,
+ * write heavy workloads preform better with higher values here, and read
+ * heavy workloads preform better with lower values, but that's not a hard
+ * and fast rule. It's basically a knob to tune between "less overhead with
+ * less parallelism" and "more overhead, but more parallelism".
+ *
+ * '8' was chosen as a reasonable, balanced, default based off of sequential
+ * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
+ */
+static unsigned int zvol_blk_mq_blocks_per_thread = 8;
+#endif
+
+#ifndef BLKDEV_DEFAULT_RQ
+/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
+#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
+#endif
+
+/*
+ * Finalize our BIO or request.
+ */
+#ifdef HAVE_BLK_MQ
+#define END_IO(zv, bio, rq, error) do { \
+ if (bio) { \
+ BIO_END_IO(bio, error); \
+ } else { \
+ blk_mq_end_request(rq, errno_to_bi_status(error)); \
+ } \
+} while (0)
+#else
+#define END_IO(zv, bio, rq, error) BIO_END_IO(bio, error)
+#endif
+
+#ifdef HAVE_BLK_MQ
+static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
+static unsigned int zvol_actual_blk_mq_queue_depth;
+#endif
+
struct zvol_state_os {
struct gendisk *zvo_disk; /* generic disk */
struct request_queue *zvo_queue; /* request queue */
dev_t zvo_dev; /* device id */
+
+#ifdef HAVE_BLK_MQ
+ struct blk_mq_tag_set tag_set;
+#endif
+
+ /* Set from the global 'zvol_use_blk_mq' at zvol load */
+ boolean_t use_blk_mq;
};
taskq_t *zvol_taskq;
@@ -63,8 +120,14 @@ static struct ida zvol_ida;
typedef struct zv_request_stack {
zvol_state_t *zv;
struct bio *bio;
+ struct request *rq;
} zv_request_t;
+typedef struct zv_work {
+ struct request *rq;
+ struct work_struct work;
+} zv_work_t;
+
typedef struct zv_request_task {
zv_request_t zvr;
taskq_ent_t ent;
@@ -86,6 +149,62 @@ zv_request_task_free(zv_request_task_t *task)
kmem_free(task, sizeof (*task));
}
+#ifdef HAVE_BLK_MQ
+
+/*
+ * This is called when a new block multiqueue request comes in. A request
+ * contains one or more BIOs.
+ */
+static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct request *rq = bd->rq;
+ zvol_state_t *zv = rq->q->queuedata;
+
+ /* Tell the kernel that we are starting to process this request */
+ blk_mq_start_request(rq);
+
+ if (blk_rq_is_passthrough(rq)) {
+ /* Skip non filesystem request */
+ blk_mq_end_request(rq, BLK_STS_IOERR);
+ return (BLK_STS_IOERR);
+ }
+
+ zvol_request_impl(zv, NULL, rq, 0);
+
+ /* Acknowledge to the kernel that we got this request */
+ return (BLK_STS_OK);
+}
+
+static struct blk_mq_ops zvol_blk_mq_queue_ops = {
+ .queue_rq = zvol_mq_queue_rq,
+};
+
+/* Initialize our blk-mq struct */
+static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
+{
+ struct zvol_state_os *zso = zv->zv_zso;
+
+ memset(&zso->tag_set, 0, sizeof (zso->tag_set));
+
+ /* Initialize tag set. */
+ zso->tag_set.ops = &zvol_blk_mq_queue_ops;
+ zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
+ zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
+ zso->tag_set.numa_node = NUMA_NO_NODE;
+ zso->tag_set.cmd_size = 0;
+
+ /*
+ * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
+ * zvol_request_impl()
+ */
+ zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+ zso->tag_set.driver_data = zv;
+
+ return (blk_mq_alloc_tag_set(&zso->tag_set));
+}
+#endif /* HAVE_BLK_MQ */
+
/*
* Given a path, return TRUE if path is a ZVOL.
*/
@@ -107,38 +226,51 @@ static void
zvol_write(zv_request_t *zvr)
{
struct bio *bio = zvr->bio;
+ struct request *rq = zvr->rq;
int error = 0;
zfs_uio_t uio;
-
- zfs_uio_bvec_init(&uio, bio);
-
zvol_state_t *zv = zvr->zv;
+ struct request_queue *q;
+ struct gendisk *disk;
+ unsigned long start_time = 0;
+ boolean_t acct = B_FALSE;
+
ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
ASSERT3P(zv->zv_zilog, !=, NULL);
+ q = zv->zv_zso->zvo_queue;
+ disk = zv->zv_zso->zvo_disk;
+
/* bio marked as FLUSH need to flush before write */
- if (bio_is_flush(bio))
+ if (io_is_flush(bio, rq))
zil_commit(zv->zv_zilog, ZVOL_OBJ);
/* Some requests are just for flush and nothing else. */
- if (uio.uio_resid == 0) {
+ if (io_size(bio, rq) == 0) {
rw_exit(&zv->zv_suspend_lock);
- BIO_END_IO(bio, 0);
+ END_IO(zv, bio, rq, 0);
return;
}
- struct request_queue *q = zv->zv_zso->zvo_queue;
- struct gendisk *disk = zv->zv_zso->zvo_disk;
+ zfs_uio_bvec_init(&uio, bio, rq);
+
ssize_t start_resid = uio.uio_resid;
- unsigned long start_time;
- boolean_t acct = blk_queue_io_stat(q);
- if (acct)
- start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+ /*
+ * With use_blk_mq, accounting is done by blk_mq_start_request()
+ * and blk_mq_end_request(), so we can skip it here.
+ */
+ if (bio) {
+ acct = blk_queue_io_stat(q);
+ if (acct) {
+ start_time = blk_generic_start_io_acct(q, disk, WRITE,
+ bio);
+ }
+ }
boolean_t sync =
- bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
uio.uio_loffset, uio.uio_resid, RL_WRITER);
@@ -180,10 +312,11 @@ zvol_write(zv_request_t *zvr)
rw_exit(&zv->zv_suspend_lock);
- if (acct)
+ if (bio && acct) {
blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+ }
- BIO_END_IO(bio, -error);
+ END_IO(zv, bio, rq, -error);
}
static void
@@ -198,27 +331,33 @@ static void
zvol_discard(zv_request_t *zvr)
{
struct bio *bio = zvr->bio;
+ struct request *rq = zvr->rq;
zvol_state_t *zv = zvr->zv;
- uint64_t start = BIO_BI_SECTOR(bio) << 9;
- uint64_t size = BIO_BI_SIZE(bio);
+ uint64_t start = io_offset(bio, rq);
+ uint64_t size = io_size(bio, rq);
uint64_t end = start + size;
boolean_t sync;
int error = 0;
dmu_tx_t *tx;
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+ unsigned long start_time = 0;
+
+ boolean_t acct = blk_queue_io_stat(q);
ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
ASSERT3P(zv->zv_zilog, !=, NULL);
- struct request_queue *q = zv->zv_zso->zvo_queue;
- struct gendisk *disk = zv->zv_zso->zvo_disk;
- unsigned long start_time;
-
- boolean_t acct = blk_queue_io_stat(q);
- if (acct)
- start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+ if (bio) {
+ acct = blk_queue_io_stat(q);
+ if (acct) {
+ start_time = blk_generic_start_io_acct(q, disk, WRITE,
+ bio);
+ }
+ }
- sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
if (end > zv->zv_volsize) {
error = SET_ERROR(EIO);
@@ -231,7 +370,7 @@ zvol_discard(zv_request_t *zvr)
* the unaligned parts which is slow (read-modify-write) and useless
* since we are not freeing any space by doing so.
*/
- if (!bio_is_secure_erase(bio)) {
+ if (!io_is_secure_erase(bio, rq)) {
start = P2ROUNDUP(start, zv->zv_volblocksize);
end = P2ALIGN(end, zv->zv_volblocksize);
size = end - start;
@@ -262,10 +401,12 @@ zvol_discard(zv_request_t *zvr)
unlock:
rw_exit(&zv->zv_suspend_lock);
- if (acct)
- blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+ if (bio && acct) {
+ blk_generic_end_io_acct(q, disk, WRITE, bio,
+ start_time);
+ }
- BIO_END_IO(bio, -error);
+ END_IO(zv, bio, rq, -error);
}
static void
@@ -280,28 +421,41 @@ static void
zvol_read(zv_request_t *zvr)
{
struct bio *bio = zvr->bio;
+ struct request *rq = zvr->rq;
int error = 0;
zfs_uio_t uio;
-
- zfs_uio_bvec_init(&uio, bio);
-
+ boolean_t acct = B_FALSE;
zvol_state_t *zv = zvr->zv;
+ struct request_queue *q;
+ struct gendisk *disk;
+ unsigned long start_time = 0;
+
ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
- struct request_queue *q = zv->zv_zso->zvo_queue;
- struct gendisk *disk = zv->zv_zso->zvo_disk;
+ zfs_uio_bvec_init(&uio, bio, rq);
+
+ q = zv->zv_zso->zvo_queue;
+ disk = zv->zv_zso->zvo_disk;
+
ssize_t start_resid = uio.uio_resid;
- unsigned long start_time;
- boolean_t acct = blk_queue_io_stat(q);
- if (acct)
- start_time = blk_generic_start_io_acct(q, disk, READ, bio);
+ /*
+ * When blk-mq is being used, accounting is done by
+ * blk_mq_start_request() and blk_mq_end_request().
+ */
+ if (bio) {
+ acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, READ,
+ bio);
+ }
zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
uio.uio_loffset, uio.uio_resid, RL_READER);
uint64_t volsize = zv->zv_volsize;
+
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
@@ -325,10 +479,11 @@ zvol_read(zv_request_t *zvr)
rw_exit(&zv->zv_suspend_lock);
- if (acct)
+ if (bio && acct) {
blk_generic_end_io_acct(q, disk, READ, bio, start_time);
+ }
- BIO_END_IO(bio, -error);
+ END_IO(zv, bio, rq, -error);
}
static void
@@ -339,52 +494,49 @@ zvol_read_task(void *arg)
zv_request_task_free(task);
}
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
+
+/*
+ * Process a BIO or request
+ *
+ * Either 'bio' or 'rq' should be set depending on if we are processing a
+ * bio or a request (both should not be set).
+ *
+ * force_sync: Set to 0 to defer processing to a background taskq
+ * Set to 1 to process data synchronously
+ */
static void
-zvol_submit_bio(struct bio *bio)
-#else
-static blk_qc_t
-zvol_submit_bio(struct bio *bio)
-#endif
-#else
-static MAKE_REQUEST_FN_RET
-zvol_request(struct request_queue *q, struct bio *bio)
-#endif
+zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
+ boolean_t force_sync)
{
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#if defined(HAVE_BIO_BDEV_DISK)
- struct request_queue *q = bio->bi_bdev->bd_disk->queue;
-#else
- struct request_queue *q = bio->bi_disk->queue;
-#endif
-#endif
- zvol_state_t *zv = q->queuedata;
fstrans_cookie_t cookie = spl_fstrans_mark();
- uint64_t offset = BIO_BI_SECTOR(bio) << 9;
- uint64_t size = BIO_BI_SIZE(bio);
- int rw = bio_data_dir(bio);
+ uint64_t offset = io_offset(bio, rq);
+ uint64_t size = io_size(bio, rq);
+ int rw = io_data_dir(bio, rq);
- if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
- printk(KERN_INFO
- "%s: bad access: offset=%llu, size=%lu\n",
+ if (zvol_request_sync)
+ force_sync = 1;
+
+ zv_request_t zvr = {
+ .zv = zv,
+ .bio = bio,
+ .rq = rq,
+ };
+
+ if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
+ printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
zv->zv_zso->zvo_disk->disk_name,
(long long unsigned)offset,
(long unsigned)size);
- BIO_END_IO(bio, -SET_ERROR(EIO));
+ END_IO(zv, bio, rq, -SET_ERROR(EIO));
goto out;
}
- zv_request_t zvr = {
- .zv = zv,
- .bio = bio,
- };
zv_request_task_t *task;
if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
- BIO_END_IO(bio, -SET_ERROR(EROFS));
+ END_IO(zv, bio, rq, -SET_ERROR(EROFS));
goto out;
}
@@ -421,7 +573,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
* i/o may be a ZIL write (via zil_commit()), or a read of an
* indirect block, or a read of a data block (if this is a
* partial-block write). We will indicate that the i/o is
- * complete by calling BIO_END_IO() from the taskq callback.
+ * complete by calling END_IO() from the taskq callback.
*
* This design allows the calling thread to continue and
* initiate more concurrent operations by calling
@@ -441,12 +593,12 @@ zvol_request(struct request_queue *q, struct bio *bio)
* of one i/o at a time per zvol. However, an even better
* design would be for zvol_request() to initiate the zio
* directly, and then be notified by the zio_done callback,
- * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL
+ * which would call END_IO(). Unfortunately, the DMU/ZIL
* interfaces lack this functionality (they block waiting for
* the i/o to complete).
*/
- if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
- if (zvol_request_sync) {
+ if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
+ if (force_sync) {
zvol_discard(&zvr);
} else {
task = zv_request_task_create(zvr);
@@ -454,7 +606,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
zvol_discard_task, task, 0, &task->ent);
}
} else {
- if (zvol_request_sync) {
+ if (force_sync) {
zvol_write(&zvr);
} else {
task = zv_request_task_create(zvr);
@@ -469,14 +621,14 @@ zvol_request(struct request_queue *q, struct bio *bio)
* data and require no additional handling.
*/
if (size == 0) {
- BIO_END_IO(bio, 0);
+ END_IO(zv, bio, rq, 0);
goto out;
}
rw_enter(&zv->zv_suspend_lock, RW_READER);
/* See comment in WRITE case above. */
- if (zvol_request_sync) {
+ if (force_sync) {
zvol_read(&zvr);
} else {
task = zv_request_task_create(zvr);
@@ -487,8 +639,33 @@ zvol_request(struct request_queue *q, struct bio *bio)
out:
spl_fstrans_unmark(cookie);
-#if (defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
- defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)) && \
+}
+
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
+static void
+zvol_submit_bio(struct bio *bio)
+#else
+static blk_qc_t
+zvol_submit_bio(struct bio *bio)
+#endif
+#else
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
+#endif
+{
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+#if defined(HAVE_BIO_BDEV_DISK)
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+#else
+ struct request_queue *q = bio->bi_disk->queue;
+#endif
+#endif
+ zvol_state_t *zv = q->queuedata;
+
+ zvol_request_impl(zv, bio, NULL, 0);
+#if (defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
+	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)) && \
!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
return (BLK_QC_T_NONE);
#endif
@@ -805,6 +982,27 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return (0);
}
+/*
+ * Why have two separate block_device_operations structs?
+ *
+ * Normally we'd just have one, and assign 'submit_bio' as needed. However,
+ * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
+ * can't just change submit_bio dynamically at runtime. So just create two
+ * separate structs to get around this.
+ */
+static const struct block_device_operations zvol_ops_blk_mq = {
+ .open = zvol_open,
+ .release = zvol_release,
+ .ioctl = zvol_ioctl,
+ .compat_ioctl = zvol_compat_ioctl,
+ .check_events = zvol_check_events,
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
+ .revalidate_disk = zvol_revalidate_disk,
+#endif
+ .getgeo = zvol_getgeo,
+ .owner = THIS_MODULE,
+};
+
static const struct block_device_operations zvol_ops = {
.open = zvol_open,
.release = zvol_release,
@@ -821,6 +1019,87 @@ static const struct block_device_operations zvol_ops = {
#endif
};
+static int
+zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
+{
+#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
+#if defined(HAVE_BLK_ALLOC_DISK)
+ zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
+ if (zso->zvo_disk == NULL)
+ return (1);
+
+ zso->zvo_disk->minors = ZVOL_MINORS;
+ zso->zvo_queue = zso->zvo_disk->queue;
+#else
+ zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
+ if (zso->zvo_queue == NULL)
+ return (1);
+
+ zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+ if (zso->zvo_disk == NULL) {
+ blk_cleanup_queue(zso->zvo_queue);
+ return (1);
+ }
+
+ zso->zvo_disk->queue = zso->zvo_queue;
+#endif /* HAVE_BLK_ALLOC_DISK */
+#else
+ zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
+ if (zso->zvo_queue == NULL)
+ return (1);
+
+ zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+ if (zso->zvo_disk == NULL) {
+ blk_cleanup_queue(zso->zvo_queue);
+ return (1);
+ }
+
+ zso->zvo_disk->queue = zso->zvo_queue;
+#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
+ return (0);
+
+}
+
+static int
+zvol_alloc_blk_mq(zvol_state_t *zv)
+{
+#ifdef HAVE_BLK_MQ
+ struct zvol_state_os *zso = zv->zv_zso;
+
+ /* Allocate our blk-mq tag_set */
+ if (zvol_blk_mq_alloc_tag_set(zv) != 0)
+ return (1);
+
+#if defined(HAVE_BLK_ALLOC_DISK)
+ zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
+ if (zso->zvo_disk == NULL) {
+ blk_mq_free_tag_set(&zso->tag_set);
+ return (1);
+ }
+ zso->zvo_queue = zso->zvo_disk->queue;
+ zso->zvo_disk->minors = ZVOL_MINORS;
+#else
+ zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+ if (zso->zvo_disk == NULL) {
+ blk_cleanup_queue(zso->zvo_queue);
+ blk_mq_free_tag_set(&zso->tag_set);
+ return (1);
+ }
+ /* Allocate queue */
+ zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
+ if (IS_ERR(zso->zvo_queue)) {
+ blk_mq_free_tag_set(&zso->tag_set);
+ return (1);
+ }
+
+ /* Our queue is now created, assign it to our disk */
+ zso->zvo_disk->queue = zso->zvo_queue;
+
+#endif
+#endif
+ return (0);
+}
+
/*
* Allocate memory for a new zvol_state_t and setup the required
* request queue and generic disk structures for the block device.
@@ -831,6 +1110,7 @@ zvol_alloc(dev_t dev, const char *name)
zvol_state_t *zv;
struct zvol_state_os *zso;
uint64_t volmode;
+ int ret;
if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
return (NULL);
@@ -849,48 +1129,44 @@ zvol_alloc(dev_t dev, const char *name)
list_link_init(&zv->zv_next);
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#ifdef HAVE_BLK_ALLOC_DISK
- zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
- if (zso->zvo_disk == NULL)
- goto out_kmem;
-
- zso->zvo_disk->minors = ZVOL_MINORS;
- zso->zvo_queue = zso->zvo_disk->queue;
-#else
- zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
- if (zso->zvo_queue == NULL)
- goto out_kmem;
+#ifdef HAVE_BLK_MQ
+ zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
+#endif
- zso->zvo_disk = alloc_disk(ZVOL_MINORS);
- if (zso->zvo_disk == NULL) {
- blk_cleanup_queue(zso->zvo_queue);
- goto out_kmem;
+ /*
+ * The block layer has 3 interfaces for getting BIOs:
+ *
+ * 1. blk-mq request queues (new)
+ * 2. submit_bio() (oldest)
+ * 3. regular request queues (old).
+ *
+ * Each of those interfaces has two permutations:
+ *
+ * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
+ * both the disk and its queue (5.14 kernel or newer)
+ *
+ * b) We don't have blk_*alloc_disk(), and have to allocate the
+ * disk and the queue separately. (5.13 kernel or older)
+ */
+ if (zv->zv_zso->use_blk_mq) {
+ ret = zvol_alloc_blk_mq(zv);
+ zso->zvo_disk->fops = &zvol_ops_blk_mq;
+ } else {
+ ret = zvol_alloc_non_blk_mq(zso);
+ zso->zvo_disk->fops = &zvol_ops;
}
-
- zso->zvo_disk->queue = zso->zvo_queue;
-#endif /* HAVE_BLK_ALLOC_DISK */
-#else
- zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
- if (zso->zvo_queue == NULL)
+ if (ret != 0)
goto out_kmem;
- zso->zvo_disk = alloc_disk(ZVOL_MINORS);
- if (zso->zvo_disk == NULL) {
- blk_cleanup_queue(zso->zvo_queue);
- goto out_kmem;
- }
-
- zso->zvo_disk->queue = zso->zvo_queue;
-#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
-
blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
/* Limit read-ahead to a single page to prevent over-prefetching. */
blk_queue_set_read_ahead(zso->zvo_queue, 1);
- /* Disable write merging in favor of the ZIO pipeline. */
- blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
+ if (!zv->zv_zso->use_blk_mq) {
+ /* Disable write merging in favor of the ZIO pipeline. */
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
+ }
/* Enable /proc/diskstats */
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
@@ -918,7 +1194,6 @@ zvol_alloc(dev_t dev, const char *name)
}
zso->zvo_disk->first_minor = (dev & MINORMASK);
- zso->zvo_disk->fops = &zvol_ops;
zso->zvo_disk->private_data = zv;
snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
ZVOL_DEV_NAME, (dev & MINORMASK));
@@ -963,6 +1238,11 @@ zvol_os_free(zvol_state_t *zv)
put_disk(zv->zv_zso->zvo_disk);
#endif
+#ifdef HAVE_BLK_MQ
+ if (zv->zv_zso->use_blk_mq)
+ blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+#endif
+
ida_simple_remove(&zvol_ida,
MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
@@ -1044,8 +1324,69 @@ zvol_os_create_minor(const char *name)
blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
(DMU_MAX_ACCESS / 4) >> 9);
- blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
- blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+
+ if (zv->zv_zso->use_blk_mq) {
+ /*
+ * IO requests can be really big (1MB). When an IO request
+ * comes in, it is passed off to zvol_read() or zvol_write()
+ * in a new thread, where it is chunked up into 'volblocksize'
+ * sized pieces and processed. So for example, if the request
+ * is a 1MB write and your volblocksize is 128k, one zvol_write
+ * thread will take that request and sequentially do ten 128k
+ * IOs. This is due to the fact that the thread needs to lock
+ * each volblocksize sized block. So you might be wondering:
+ * "instead of passing the whole 1MB request to one thread,
+ * why not pass ten individual 128k chunks to ten threads and
+ * process the whole write in parallel?" The short answer is
+ * that there's a sweet spot number of chunks that balances
+ * the greater parallelism with the added overhead of more
+ * threads. The sweet spot can be different depending on if you
+ * have a read or write heavy workload. Writes typically want
+ * high chunk counts while reads typically want lower ones. On
+ * a test pool with 6 NVMe drives in a 3x 2-disk mirror
+ * configuration, with volblocksize=8k, the sweet spot for good
+ * sequential reads and writes was at 8 chunks.
+ */
+
+ /*
+ * Below we tell the kernel how big we want our requests
+ * to be. You would think that blk_queue_io_opt() would be
+ * used to do this since it is used to "set optimal request
+ * size for the queue", but that doesn't seem to do
+ * anything - the kernel still gives you huge requests
+ * with tons of little PAGE_SIZE segments contained within it.
+ *
+ * Knowing that the kernel will just give you PAGE_SIZE segments
+ * no matter what, you can say "ok, I want PAGE_SIZE byte
+ * segments, and I want 'N' of them per request", where N is
+ * the correct number of segments for the volblocksize and
+ * number of chunks you want.
+ */
+#ifdef HAVE_BLK_MQ
+ if (zvol_blk_mq_blocks_per_thread != 0) {
+ unsigned int chunks;
+ chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
+
+ blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
+ PAGE_SIZE);
+ blk_queue_max_segments(zv->zv_zso->zvo_queue,
+ (zv->zv_volblocksize * chunks) / PAGE_SIZE);
+ } else {
+ /*
+ * Special case: zvol_blk_mq_blocks_per_thread = 0
+ * Max everything out.
+ */
+ blk_queue_max_segments(zv->zv_zso->zvo_queue,
+ UINT16_MAX);
+ blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
+ UINT_MAX);
+ }
+#endif
+ } else {
+ blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
+ blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+ }
+
blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
zv->zv_volblocksize);
blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
@@ -1167,19 +1508,54 @@ int
zvol_init(void)
{
int error;
- int threads = MIN(MAX(zvol_threads, 1), 1024);
+
+ /*
+ * zvol_threads is the module param the user passes in.
+ *
+ * zvol_actual_threads is what we use internally, since the user can
+	 * pass zvol_threads = 0 to mean "use all the CPUs" (the default).
+ */
+ static unsigned int zvol_actual_threads;
+
+ if (zvol_threads == 0) {
+ /*
+ * See dde9380a1 for why 32 was chosen here. This should
+ * probably be refined to be some multiple of the number
+ * of CPUs.
+ */
+ zvol_actual_threads = MAX(num_online_cpus(), 32);
+ } else {
+ zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
+ }
error = register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) {
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
return (error);
}
- zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
- threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+#ifdef HAVE_BLK_MQ
+ if (zvol_blk_mq_queue_depth == 0) {
+ zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
+ } else {
+ zvol_actual_blk_mq_queue_depth =
+ MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
+ }
+
+ if (zvol_blk_mq_threads == 0) {
+ zvol_blk_mq_actual_threads = num_online_cpus();
+ } else {
+ zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
+ 1024);
+ }
+#endif
+ zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri,
+ zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
if (zvol_taskq == NULL) {
unregister_blkdev(zvol_major, ZVOL_DRIVER);
return (-ENOMEM);
}
+
zvol_init_impl();
ida_init(&zvol_ida);
return (0);
@@ -1202,7 +1578,8 @@ module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
+MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
+	" to 0 to use all active CPUs");
module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
@@ -1215,4 +1592,17 @@ MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
+
+#ifdef HAVE_BLK_MQ
+module_param(zvol_blk_mq_queue_depth, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
+
+module_param(zvol_use_blk_mq, uint, 0644);
+MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
+
+module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
+ "Process volblocksize blocks per thread");
+#endif
+
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zcommon/zfeature_common.c b/sys/contrib/openzfs/module/zcommon/zfeature_common.c
index f09389e6d02e..4df09884aa91 100644
--- a/sys/contrib/openzfs/module/zcommon/zfeature_common.c
+++ b/sys/contrib/openzfs/module/zcommon/zfeature_common.c
@@ -696,16 +696,15 @@ zpool_feature_init(void)
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
{
-
- static const spa_feature_t zilsaxattr_deps[] = {
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_NONE
- };
- zfeature_register(SPA_FEATURE_ZILSAXATTR,
- "org.openzfs:zilsaxattr", "zilsaxattr",
- "Support for xattr=sa extended attribute logging in ZIL.",
- ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT,
- ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures);
+ static const spa_feature_t zilsaxattr_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_ZILSAXATTR,
+ "org.openzfs:zilsaxattr", "zilsaxattr",
+ "Support for xattr=sa extended attribute logging in ZIL.",
+ ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT,
+ ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures);
}
zfeature_register(SPA_FEATURE_HEAD_ERRLOG,
@@ -714,6 +713,18 @@ zpool_feature_init(void)
ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL,
sfeatures);
+ {
+ static const spa_feature_t blake3_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_BLAKE3,
+ "org.openzfs:blake3", "blake3",
+ "BLAKE3 hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ blake3_deps, sfeatures);
+ }
+
zfs_mod_list_supported_free(sfeatures);
}
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
index 500d80a33b6b..32475611e11f 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
@@ -84,6 +84,7 @@ zfs_prop_init(void)
{ "sha512", ZIO_CHECKSUM_SHA512 },
{ "skein", ZIO_CHECKSUM_SKEIN },
{ "edonr", ZIO_CHECKSUM_EDONR },
+ { "blake3", ZIO_CHECKSUM_BLAKE3 },
{ NULL }
};
@@ -102,6 +103,9 @@ zfs_prop_init(void)
ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
{ "edonr,verify",
ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
+ { "blake3", ZIO_CHECKSUM_BLAKE3 },
+ { "blake3,verify",
+ ZIO_CHECKSUM_BLAKE3 | ZIO_CHECKSUM_VERIFY },
{ NULL }
};
@@ -394,12 +398,12 @@ zfs_prop_init(void)
ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
ZFS_TYPE_VOLUME,
"on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein"
- " | edonr",
+ " | edonr | blake3",
"CHECKSUM", checksum_table, sfeatures);
zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"on | off | verify | sha256[,verify] | sha512[,verify] | "
- "skein[,verify] | edonr,verify",
+ "skein[,verify] | edonr,verify | blake3[,verify]",
"DEDUP", dedup_table, sfeatures);
zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
diff --git a/sys/contrib/openzfs/module/zfs/blake3_zfs.c b/sys/contrib/openzfs/module/zfs/blake3_zfs.c
new file mode 100644
index 000000000000..7560f30fd4e4
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/blake3_zfs.c
@@ -0,0 +1,117 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_checksum.h>
+#include <sys/blake3.h>
+#include <sys/abd.h>
+
+static int
+blake3_incremental(void *buf, size_t size, void *arg)
+{
+ BLAKE3_CTX *ctx = arg;
+
+ Blake3_Update(ctx, buf, size);
+
+ return (0);
+}
+
+/*
+ * Computes a native 256-bit BLAKE3 MAC checksum. Please note that this
+ * function requires the presence of a ctx_template that should be allocated
+ * using abd_checksum_blake3_tmpl_init.
+ */
+void
+abd_checksum_blake3_native(abd_t *abd, uint64_t size, const void *ctx_template,
+ zio_cksum_t *zcp)
+{
+ ASSERT(ctx_template != 0);
+
+#if defined(_KERNEL)
+ BLAKE3_CTX *ctx = blake3_per_cpu_ctx[CPU_SEQID_UNSTABLE];
+#else
+ BLAKE3_CTX *ctx = kmem_alloc(sizeof (*ctx), KM_SLEEP);
+#endif
+
+ memcpy(ctx, ctx_template, sizeof (*ctx));
+ (void) abd_iterate_func(abd, 0, size, blake3_incremental, ctx);
+ Blake3_Final(ctx, (uint8_t *)zcp);
+
+#if !defined(_KERNEL)
+ memset(ctx, 0, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+#endif
+}
+
+/*
+ * Byteswapped version of abd_checksum_blake3_native. This just invokes
+ * the native checksum function and byteswaps the resulting checksum (since
+ * BLAKE3 is internally endian-insensitive).
+ */
+void
+abd_checksum_blake3_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ ASSERT(ctx_template != 0);
+
+ abd_checksum_blake3_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+/*
+ * Allocates a BLAKE3 MAC template suitable for using in BLAKE3 MAC checksum
+ * computations and returns a pointer to it.
+ */
+void *
+abd_checksum_blake3_tmpl_init(const zio_cksum_salt_t *salt)
+{
+ BLAKE3_CTX *ctx;
+
+ ASSERT(sizeof (salt->zcs_bytes) == 32);
+
+ /* init reference object */
+ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ Blake3_InitKeyed(ctx, salt->zcs_bytes);
+
+ return (ctx);
+}
+
+/*
+ * Frees a BLAKE3 context template previously allocated using
+ * zio_checksum_blake3_tmpl_init.
+ */
+void
+abd_checksum_blake3_tmpl_free(void *ctx_template)
+{
+ BLAKE3_CTX *ctx = ctx_template;
+
+ memset(ctx, 0, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+}
diff --git a/sys/contrib/openzfs/module/zfs/dsl_prop.c b/sys/contrib/openzfs/module/zfs/dsl_prop.c
index cfdafd2f4436..51d689355ef0 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_prop.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_prop.c
@@ -88,7 +88,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
setpoint[0] = '\0';
prop = zfs_name_to_prop(propname);
- inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop));
inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
@@ -168,7 +168,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
uint64_t zapobj;
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
- inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop));
zapobj = dsl_dataset_phys(ds)->ds_props_obj;
if (zapobj != 0) {
@@ -1055,12 +1055,12 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
prop = zfs_name_to_prop(propname);
/* Skip non-inheritable properties. */
- if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL &&
- !zfs_prop_inheritable(prop))
+ if ((flags & DSL_PROP_GET_INHERITING) &&
+ prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop))
continue;
/* Skip properties not valid for this type. */
- if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL &&
+ if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_USERPROP &&
!zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT, B_FALSE))
continue;
diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c
index 89448f0ecea2..d8152939d8e1 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_scan.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c
@@ -280,6 +280,7 @@ typedef struct scan_io {
struct dsl_scan_io_queue {
dsl_scan_t *q_scn; /* associated dsl_scan_t */
vdev_t *q_vd; /* top-level vdev that this queue represents */
+ zio_t *q_zio; /* scn_zio_root child for waiting on IO */
/* trees used for sorting I/Os and extents of I/Os */
range_tree_t *q_exts_by_addr;
@@ -1276,9 +1277,12 @@ dsl_scan_should_clear(dsl_scan_t *scn)
mutex_enter(&tvd->vdev_scan_io_queue_lock);
queue = tvd->vdev_scan_io_queue;
if (queue != NULL) {
- /* # extents in exts_by_size = # in exts_by_addr */
+ /*
+ * # of extents in exts_by_size = # in exts_by_addr.
+ * B-tree efficiency is ~75%, but can be as low as 50%.
+ */
mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
- sizeof (range_seg_gap_t) + queue->q_sio_memused;
+ 3 * sizeof (range_seg_gap_t) + queue->q_sio_memused;
}
mutex_exit(&tvd->vdev_scan_io_queue_lock);
}
@@ -3033,15 +3037,19 @@ scan_io_queues_run_one(void *arg)
dsl_scan_io_queue_t *queue = arg;
kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
boolean_t suspended = B_FALSE;
- range_seg_t *rs = NULL;
- scan_io_t *sio = NULL;
+ range_seg_t *rs;
+ scan_io_t *sio;
+ zio_t *zio;
list_t sio_list;
ASSERT(queue->q_scn->scn_is_sorted);
list_create(&sio_list, sizeof (scan_io_t),
offsetof(scan_io_t, sio_nodes.sio_list_node));
+ zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa,
+ NULL, NULL, NULL, ZIO_FLAG_CANFAIL);
mutex_enter(q_lock);
+ queue->q_zio = zio;
/* Calculate maximum in-flight bytes for this vdev. */
queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit *
@@ -3108,7 +3116,9 @@ scan_io_queues_run_one(void *arg)
scan_io_queue_insert_impl(queue, sio);
}
+ queue->q_zio = NULL;
mutex_exit(q_lock);
+ zio_nowait(zio);
list_destroy(&sio_list);
}
@@ -4073,6 +4083,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
dsl_scan_t *scn = dp->dp_scan;
size_t size = BP_GET_PSIZE(bp);
abd_t *data = abd_alloc_for_io(size, B_FALSE);
+ zio_t *pio;
if (queue == NULL) {
ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
@@ -4081,6 +4092,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
mutex_exit(&spa->spa_scrub_lock);
+ pio = scn->scn_zio_root;
} else {
kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
@@ -4089,12 +4101,14 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
cv_wait(&queue->q_zio_cv, q_lock);
queue->q_inflight_bytes += BP_GET_PSIZE(bp);
+ pio = queue->q_zio;
mutex_exit(q_lock);
}
+ ASSERT(pio != NULL);
count_block(scn, dp->dp_blkstats, bp);
- zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size,
- dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+ zio_nowait(zio_read(pio, spa, bp, data, size, dsl_scan_scrub_done,
+ queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
}
/*
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index 12aec4a568eb..c57c69bd70e1 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -30,6 +30,7 @@
*/
#include <sys/zfs_context.h>
+#include <sys/zfs_chksum.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
@@ -2417,6 +2418,7 @@ spa_init(spa_mode_t mode)
vdev_raidz_math_init();
vdev_file_init();
zfs_prop_init();
+ chksum_init();
zpool_prop_init();
zpool_feature_init();
spa_config_load();
@@ -2438,6 +2440,7 @@ spa_fini(void)
vdev_cache_stat_fini();
vdev_mirror_stat_fini();
vdev_raidz_math_fini();
+ chksum_fini();
zil_fini();
dmu_fini();
zio_fini();
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index ce7f020a0d86..2dcb97b7feb4 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -5496,7 +5496,7 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx)
}
switch (prop = vdev_name_to_prop(propname)) {
- case VDEV_PROP_USER:
+ case VDEV_PROP_USERPROP:
if (vdev_prop_user(propname)) {
strval = fnvpair_value_string(elem);
if (strlen(strval) == 0) {
@@ -5580,7 +5580,7 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
uint64_t intval = 0;
char *strval = NULL;
- if (prop == VDEV_PROP_USER && !vdev_prop_user(propname)) {
+ if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
error = EINVAL;
goto end;
}
@@ -5937,7 +5937,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
case VDEV_PROP_COMMENT:
/* Exists in the ZAP below */
/* FALLTHRU */
- case VDEV_PROP_USER:
+ case VDEV_PROP_USERPROP:
/* User Properites */
src = ZPROP_SRC_LOCAL;
diff --git a/sys/contrib/openzfs/module/zfs/zcp_synctask.c b/sys/contrib/openzfs/module/zfs/zcp_synctask.c
index 403856ae3571..24210117eca0 100644
--- a/sys/contrib/openzfs/module/zfs/zcp_synctask.c
+++ b/sys/contrib/openzfs/module/zfs/zcp_synctask.c
@@ -325,7 +325,7 @@ zcp_synctask_inherit_prop_check(void *arg, dmu_tx_t *tx)
zcp_inherit_prop_arg_t *args = arg;
zfs_prop_t prop = zfs_name_to_prop(args->zipa_prop);
- if (prop == ZPROP_INVAL) {
+ if (prop == ZPROP_USERPROP) {
if (zfs_prop_user(args->zipa_prop))
return (0);
diff --git a/sys/contrib/openzfs/module/zfs/zfs_chksum.c b/sys/contrib/openzfs/module/zfs/zfs_chksum.c
new file mode 100644
index 000000000000..639784287d72
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_chksum.c
@@ -0,0 +1,323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2021 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/types.h>
+#include <sys/spa.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_chksum.h>
+
+#include <sys/blake3.h>
+
+static kstat_t *chksum_kstat = NULL;
+
+typedef struct {
+ const char *name;
+ const char *impl;
+ uint64_t bs1k;
+ uint64_t bs4k;
+ uint64_t bs16k;
+ uint64_t bs64k;
+ uint64_t bs256k;
+ uint64_t bs1m;
+ uint64_t bs4m;
+ zio_cksum_salt_t salt;
+ zio_checksum_t *(func);
+ zio_checksum_tmpl_init_t *(init);
+ zio_checksum_tmpl_free_t *(free);
+} chksum_stat_t;
+
+static int chksum_stat_cnt = 0;
+static chksum_stat_t *chksum_stat_data = 0;
+
+/*
+ * i3-1005G1 test output:
+ *
+ * implementation 1k 4k 16k 64k 256k 1m 4m
+ * fletcher-4 5421 15001 26468 32555 34720 32801 18847
+ * edonr-generic 1196 1602 1761 1749 1762 1759 1751
+ * skein-generic 546 591 608 615 619 612 616
+ * sha256-generic 246 270 274 274 277 275 276
+ * sha256-avx 262 296 304 307 307 307 306
+ * sha256-sha-ni 769 1072 1172 1220 1219 1232 1228
+ * sha256-openssl 240 300 316 314 304 285 276
+ * sha512-generic 333 374 385 392 391 393 392
+ * sha512-openssl 353 441 467 476 472 467 426
+ * sha512-avx 362 444 473 475 479 476 478
+ * sha512-avx2 394 500 530 538 543 545 542
+ * blake3-generic 308 313 313 313 312 313 312
+ * blake3-sse2 402 1289 1423 1446 1432 1458 1413
+ * blake3-sse41 427 1470 1625 1704 1679 1607 1629
+ * blake3-avx2 428 1920 3095 3343 3356 3318 3204
+ * blake3-avx512 473 2687 4905 5836 5844 5643 5374
+ */
+static int
+chksum_stat_kstat_headers(char *buf, size_t size)
+{
+ ssize_t off = 0;
+
+ off += snprintf(buf + off, size, "%-23s", "implementation");
+ off += snprintf(buf + off, size - off, "%8s", "1k");
+ off += snprintf(buf + off, size - off, "%8s", "4k");
+ off += snprintf(buf + off, size - off, "%8s", "16k");
+ off += snprintf(buf + off, size - off, "%8s", "64k");
+ off += snprintf(buf + off, size - off, "%8s", "256k");
+ off += snprintf(buf + off, size - off, "%8s", "1m");
+ (void) snprintf(buf + off, size - off, "%8s\n", "4m");
+
+ return (0);
+}
+
+static int
+chksum_stat_kstat_data(char *buf, size_t size, void *data)
+{
+ chksum_stat_t *cs;
+ ssize_t off = 0;
+ char b[24];
+
+ cs = (chksum_stat_t *)data;
+ snprintf(b, 23, "%s-%s", cs->name, cs->impl);
+ off += snprintf(buf + off, size - off, "%-23s", b);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs1k);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs4k);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs16k);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs64k);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs256k);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs1m);
+ (void) snprintf(buf + off, size - off, "%8llu\n",
+ (u_longlong_t)cs->bs4m);
+
+ return (0);
+}
+
+static void *
+chksum_stat_kstat_addr(kstat_t *ksp, loff_t n)
+{
+ if (n < chksum_stat_cnt)
+ ksp->ks_private = (void *)(chksum_stat_data + n);
+ else
+ ksp->ks_private = NULL;
+
+ return (ksp->ks_private);
+}
+
+static void
+chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round,
+ uint64_t *result)
+{
+ hrtime_t start;
+ uint64_t run_bw, run_time_ns, run_count = 0, size = 0;
+ uint32_t l, loops = 0;
+ zio_cksum_t zcp;
+
+ switch (round) {
+ case 1: /* 1k */
+ size = 1<<10; loops = 128; break;
+ case 2: /* 4k */
+ size = 1<<12; loops = 64; break;
+ case 3: /* 16k */
+ size = 1<<14; loops = 32; break;
+ case 4: /* 64k */
+ size = 1<<16; loops = 16; break;
+ case 5: /* 256k */
+ size = 1<<18; loops = 8; break;
+ case 6: /* 1m */
+ size = 1<<20; loops = 4; break;
+ case 7: /* 4m */
+ size = 1<<22; loops = 1; break;
+ }
+
+ kpreempt_disable();
+ start = gethrtime();
+ do {
+ for (l = 0; l < loops; l++, run_count++)
+ cs->func(abd, size, ctx, &zcp);
+
+ run_time_ns = gethrtime() - start;
+ } while (run_time_ns < MSEC2NSEC(1));
+ kpreempt_enable();
+
+ run_bw = size * run_count * NANOSEC;
+ run_bw /= run_time_ns; /* B/s */
+ *result = run_bw/1024/1024; /* MiB/s */
+}
+
+static void
+chksum_benchit(chksum_stat_t *cs)
+{
+ abd_t *abd;
+ void *ctx = 0;
+ void *salt = &cs->salt.zcs_bytes;
+
+ /* allocate test memory via default abd interface */
+ abd = abd_alloc_linear(1<<22, B_FALSE);
+ memset(salt, 0, sizeof (cs->salt.zcs_bytes));
+ if (cs->init) {
+ ctx = cs->init(&cs->salt);
+ }
+
+ chksum_run(cs, abd, ctx, 1, &cs->bs1k);
+ chksum_run(cs, abd, ctx, 2, &cs->bs4k);
+ chksum_run(cs, abd, ctx, 3, &cs->bs16k);
+ chksum_run(cs, abd, ctx, 4, &cs->bs64k);
+ chksum_run(cs, abd, ctx, 5, &cs->bs256k);
+ chksum_run(cs, abd, ctx, 6, &cs->bs1m);
+ chksum_run(cs, abd, ctx, 7, &cs->bs4m);
+
+ /* free up temp memory */
+ if (cs->free) {
+ cs->free(ctx);
+ }
+ abd_free(abd);
+}
+
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+chksum_benchmark(void)
+{
+
+#ifndef _KERNEL
+ /* we need the benchmark only for the kernel module */
+ return;
+#endif
+
+ chksum_stat_t *cs;
+ int cbid = 0, id;
+ uint64_t max = 0;
+
+ /* space for the benchmark times */
+ chksum_stat_cnt = 4;
+ chksum_stat_cnt += blake3_get_impl_count();
+ chksum_stat_data = (chksum_stat_t *)kmem_zalloc(
+ sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP);
+
+ /* edonr */
+ cs = &chksum_stat_data[cbid++];
+ cs->init = abd_checksum_edonr_tmpl_init;
+ cs->func = abd_checksum_edonr_native;
+ cs->free = abd_checksum_edonr_tmpl_free;
+ cs->name = "edonr";
+ cs->impl = "generic";
+ chksum_benchit(cs);
+
+ /* skein */
+ cs = &chksum_stat_data[cbid++];
+ cs->init = abd_checksum_skein_tmpl_init;
+ cs->func = abd_checksum_skein_native;
+ cs->free = abd_checksum_skein_tmpl_free;
+ cs->name = "skein";
+ cs->impl = "generic";
+ chksum_benchit(cs);
+
+ /* sha256 */
+ cs = &chksum_stat_data[cbid++];
+ cs->init = 0;
+ cs->func = abd_checksum_SHA256;
+ cs->free = 0;
+ cs->name = "sha256";
+ cs->impl = "generic";
+ chksum_benchit(cs);
+
+ /* sha512 */
+ cs = &chksum_stat_data[cbid++];
+ cs->init = 0;
+ cs->func = abd_checksum_SHA512_native;
+ cs->free = 0;
+ cs->name = "sha512";
+ cs->impl = "generic";
+ chksum_benchit(cs);
+
+ /* blake3 */
+ for (id = 0; id < blake3_get_impl_count(); id++) {
+ blake3_set_impl_id(id);
+ cs = &chksum_stat_data[cbid++];
+ cs->init = abd_checksum_blake3_tmpl_init;
+ cs->func = abd_checksum_blake3_native;
+ cs->free = abd_checksum_blake3_tmpl_free;
+ cs->name = "blake3";
+ cs->impl = blake3_get_impl_name();
+ chksum_benchit(cs);
+ if (cs->bs256k > max) {
+ max = cs->bs256k;
+ blake3_set_impl_fastest(id);
+ }
+ }
+}
+
+void
+chksum_init(void)
+{
+#ifdef _KERNEL
+ blake3_per_cpu_ctx_init();
+#endif
+
+ /* Benchmark supported implementations */
+ chksum_benchmark();
+
+ /* Install kstats for all implementations */
+ chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+ if (chksum_kstat != NULL) {
+ chksum_kstat->ks_data = NULL;
+ chksum_kstat->ks_ndata = UINT32_MAX;
+ kstat_set_raw_ops(chksum_kstat,
+ chksum_stat_kstat_headers,
+ chksum_stat_kstat_data,
+ chksum_stat_kstat_addr);
+ kstat_install(chksum_kstat);
+ }
+
+ /* setup implementations */
+ blake3_setup_impl();
+}
+
+void
+chksum_fini(void)
+{
+ if (chksum_kstat != NULL) {
+ kstat_delete(chksum_kstat);
+ chksum_kstat = NULL;
+ }
+
+ if (chksum_stat_cnt) {
+ kmem_free(chksum_stat_data,
+ sizeof (chksum_stat_t) * chksum_stat_cnt);
+ chksum_stat_cnt = 0;
+ chksum_stat_data = 0;
+ }
+
+#ifdef _KERNEL
+ blake3_per_cpu_ctx_fini();
+#endif
+}
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
index b3f32d64f3ef..35aec5226233 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -1104,7 +1104,7 @@ zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
(void) innvl;
zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
- if (prop == ZPROP_INVAL) {
+ if (prop == ZPROP_USERPROP) {
if (!zfs_prop_user(zc->zc_value))
return (SET_ERROR(EINVAL));
return (zfs_secpolicy_write_perms(zc->zc_name,
@@ -2406,7 +2406,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
const char *strval = NULL;
int err = -1;
- if (prop == ZPROP_INVAL) {
+ if (prop == ZPROP_USERPROP) {
if (zfs_prop_userquota(propname))
return (zfs_prop_set_userquota(dsname, pair));
return (-1);
@@ -2577,7 +2577,7 @@ retry:
/* inherited properties are expected to be booleans */
if (nvpair_type(propval) != DATA_TYPE_BOOLEAN)
err = SET_ERROR(EINVAL);
- } else if (err == 0 && prop == ZPROP_INVAL) {
+ } else if (err == 0 && prop == ZPROP_USERPROP) {
if (zfs_prop_user(propname)) {
if (nvpair_type(propval) != DATA_TYPE_STRING)
err = SET_ERROR(EINVAL);
@@ -2853,11 +2853,11 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc)
* and reservation to the received or default values even though
* they are not considered inheritable.
*/
- if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+ if (prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop))
return (SET_ERROR(EINVAL));
}
- if (prop == ZPROP_INVAL) {
+ if (prop == ZPROP_USERPROP) {
if (!zfs_prop_user(propname))
return (SET_ERROR(EINVAL));
@@ -4488,7 +4488,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
uint64_t intval, compval;
int err;
- if (prop == ZPROP_INVAL) {
+ if (prop == ZPROP_USERPROP) {
if (zfs_prop_user(propname)) {
if ((err = zfs_secpolicy_write_perms(dsname,
ZFS_DELEG_PERM_USERPROP, cr)))
@@ -5034,7 +5034,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
/* -x property */
const char *name = nvpair_name(nvp);
zfs_prop_t prop = zfs_name_to_prop(name);
- if (prop != ZPROP_INVAL) {
+ if (prop != ZPROP_USERPROP) {
if (!zfs_prop_inheritable(prop))
continue;
} else if (!zfs_prop_user(name))
diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c
index d89e5765326f..3c5cdf604100 100644
--- a/sys/contrib/openzfs/module/zfs/zio_checksum.c
+++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c
@@ -195,6 +195,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
+ {{abd_checksum_blake3_native, abd_checksum_blake3_byteswap},
+ abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"},
};
/*
@@ -207,6 +211,8 @@ zio_checksum_to_feature(enum zio_checksum cksum)
VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
switch (cksum) {
+ case ZIO_CHECKSUM_BLAKE3:
+ return (SPA_FEATURE_BLAKE3);
case ZIO_CHECKSUM_SHA512:
return (SPA_FEATURE_SHA512);
case ZIO_CHECKSUM_SKEIN: