Code » limb » commit c1c3b65

Add BLAKE3 functions: blake3_{init,update,final}

author Olivier Brunel
2023-01-25 21:31:18 UTC
committer Olivier Brunel
2023-01-30 22:07:00 UTC
parent c6157411ee724b0ce9cae2baa43a04bb7b7bf1bb

Add BLAKE3 functions: blake3_{init,update,final}

Also blake3() as helper to call all 3 functions at once.

The BLAKE3 implementation is the official C implementation[1]:
Copyright (C) 2019-2020 Samuel Neves and Jack O'Connor
Released into the public domain with CC0 1.0.

[1]: https://github.com/BLAKE3-team/BLAKE3

doc/blake3.3.md +35 -0
include/blake3.h +246 -0
include/limb/blake3.h +43 -0
meta/AUTHORS +2 -0
meta/HISTORY +13 -0
meta/libs/limb +6 -0
project.mk +17 -0
src/blake3/blake3.c +11 -0
src/blake3/blake3_avx2.c +326 -0
src/blake3/blake3_avx2_x86-64_unix.S +1815 -0
src/blake3/blake3_avx512.c +1220 -0
src/blake3/blake3_avx512_x86-64_unix.S +2585 -0
src/blake3/blake3_dispatch.c +269 -0
src/blake3/blake3_impl.c +594 -0
src/blake3/blake3_portable.c +169 -0
src/blake3/blake3_sse2.c +566 -0
src/blake3/blake3_sse2_x86-64_unix.S +2291 -0
src/blake3/blake3_sse41.c +560 -0
src/blake3/blake3_sse41_x86-64_unix.S +2028 -0

diff --git a/doc/blake3.3.md b/doc/blake3.3.md
new file mode 100644
index 0000000..a95cd9f
--- /dev/null
+++ b/doc/blake3.3.md
@@ -0,0 +1,35 @@
+% limb manual
+% blake3(3)
+
+# NAME
+
+blake3_init, blake3_update, blake3_final, blake3 - compute the BLAKE3 digest of a given
+block of data
+
+# SYNOPSIS
+
+    #include <limb/blake3.h>
+
+```pre hl
+void blake3_init(blake3_ctx *<em>ctx</em>);
+void blake3_update(blake3_ctx *<em>ctx</em>, const void *<em>msg</em>, size_t <em>size</em>);
+void blake3_final(blake3_ctx *<em>ctx</em>, unsigned char * restrict <em>md</em>);
+
+void blake3(const void *<em>msg</em>, size_t <em>size</em>, unsigned char * restrict <em>md</em>);
+```
+
+# DESCRIPTION
+
+The `blake3_init`() function initializes the given blake3 context `ctx` to
+calculate a 256-bit BLAKE3 digest.
+
+The `blake3_update`() function feeds the chunk of data pointed to by `msg`, of
+length `size` (in bytes), into the given `ctx` to be hashed. This function can
+be called repeatedly, as many times as needed.
+
+The `blake3_final`() function stores the calculated hash from `ctx` in binary
+form into `md`, which must be able to store 32 bytes.
+
+As a convenience, the `blake3`() function computes the BLAKE3 digest of a given
+`msg` of length `size` in a single call. The 32-byte digest is stored in `md`.
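
For a quick sense of how the three calls fit together, here is a minimal caller of the API documented above (illustrative only; the hex printing is incidental):

```c
#include <stdio.h>
#include <limb/blake3.h>

int main(void)
{
    static const char msg[] = "hello world";
    unsigned char md[BLAKE3_OUT_LEN];  /* 32 bytes */
    blake3_ctx ctx;

    /* Streaming form: feed the data in two pieces. */
    blake3_init(&ctx);
    blake3_update(&ctx, msg, 5);
    blake3_update(&ctx, msg + 5, sizeof(msg) - 1 - 5);
    blake3_final(&ctx, md);

    /* One-shot helper; produces the same digest. */
    blake3(msg, sizeof(msg) - 1, md);

    for (size_t i = 0; i < sizeof(md); ++i)
        printf("%02x", md[i]);
    putchar('\n');
    return 0;
}
```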
diff --git a/include/blake3.h b/include/blake3.h
new file mode 100644
index 0000000..c5ca3a6
--- /dev/null
+++ b/include/blake3.h
@@ -0,0 +1,246 @@
+#ifndef LIMB_BLAKE3_BLAKE3_H
+#define LIMB_BLAKE3_BLAKE3_H
+
+#include "limb/blake3.h"
+
+/* internal flags */
+enum blake3_flags {
+  CHUNK_START         = 1 << 0,
+  CHUNK_END           = 1 << 1,
+  PARENT              = 1 << 2,
+  ROOT                = 1 << 3,
+  KEYED_HASH          = 1 << 4,
+  DERIVE_KEY_CONTEXT  = 1 << 5,
+  DERIVE_KEY_MATERIAL = 1 << 6,
+};
+
+#define INLINE static inline __attribute__((always_inline))
+
+#if defined(__x86_64__) || defined(_M_X64) 
+#define IS_X86
+#define IS_X86_64
+#endif
+
+#if defined(__i386__) || defined(_M_IX86)
+#define IS_X86
+#define IS_X86_32
+#endif
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define IS_AARCH64
+#endif
+
+#if !defined(BLAKE3_USE_NEON)
+  /* If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness */
+  #if defined(IS_AARCH64)
+    #define BLAKE3_USE_NEON 1
+  #else
+    #define BLAKE3_USE_NEON 0
+  #endif
+#endif
+
+#if defined(IS_X86)
+#define MAX_SIMD_DEGREE 16
+#elif BLAKE3_USE_NEON == 1
+#define MAX_SIMD_DEGREE 4
+#else
+#define MAX_SIMD_DEGREE 1
+#endif
+
+/* There are some places where we want a static size that's equal to the
+ * MAX_SIMD_DEGREE, but also at least 2. */
+#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
+
+static const u32 IV[8] = {
+    0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+    0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static const u8 MSG_SCHEDULE[7][16] = {
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    {  2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8 },
+    {  3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1 },
+    { 10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6 },
+    { 12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4 },
+    {  9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7 },
+    { 11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13 },
+};
+
+/* Count the number of 1 bits. */
+INLINE unsigned int
+popcnt(u64 x)
+{
+#if defined(__GNUC__) || defined(__clang__)
+    return (unsigned int) __builtin_popcountll(x);
+#else
+    unsigned int count = 0;
+    while (x != 0) {
+        count += 1;
+        x &= x - 1; /* clear the lowest set bit (Kernighan's method) */
+    }
+    return count;
+#endif
+}
+
+/* Largest power of two less than or equal to x. As a special case, returns 1
+ * when x is 0. */
+INLINE u64
+round_down_to_power_of_2(u64 x)
+{
+    return 1ULL << (msb64(x | 1) - 1);
+}
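
A few illustrative values, assuming `msb64()` returns the 1-based index of the highest set bit (which is what the formula above implies):

```c
/* round_down_to_power_of_2(1)  == 1    msb64(1)  == 1, 1ULL << 0
 * round_down_to_power_of_2(12) == 8    msb64(12) == 4, 1ULL << 3
 * round_down_to_power_of_2(0)  == 1    0 | 1 == 1, the special case */
```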
+
+INLINE u32
+counter_low(u64 counter)
+{
+    return (u32) counter;
+}
+
+INLINE u32
+counter_high(u64 counter)
+{
+    return (u32) (counter >> 32);
+}
+
+INLINE u32
+load32(const void *src)
+{
+    const u8 *p = (const u8 *) src;
+    return ((u32)(p[0])      ) | ((u32)(p[1]) <<  8)
+        |  ((u32)(p[2]) << 16) | ((u32)(p[3]) << 24);
+}
+
+INLINE void
+load_key_words(const u8 key[BLAKE3_KEY_LEN], u32 key_words[8])
+{
+    key_words[0] = load32(&key[0 * 4]);
+    key_words[1] = load32(&key[1 * 4]);
+    key_words[2] = load32(&key[2 * 4]);
+    key_words[3] = load32(&key[3 * 4]);
+    key_words[4] = load32(&key[4 * 4]);
+    key_words[5] = load32(&key[5 * 4]);
+    key_words[6] = load32(&key[6 * 4]);
+    key_words[7] = load32(&key[7 * 4]);
+}
+
+INLINE void
+store32(void *dst, u32 w)
+{
+    u8 *p = (u8 *) dst;
+    p[0] = (u8)  w;
+    p[1] = (u8) (w >> 8);
+    p[2] = (u8) (w >> 16);
+    p[3] = (u8) (w >> 24);
+}
+
+INLINE void
+store_cv_words(u8 bytes_out[32], u32 cv_words[8])
+{
+    store32(&bytes_out[0 * 4], cv_words[0]);
+    store32(&bytes_out[1 * 4], cv_words[1]);
+    store32(&bytes_out[2 * 4], cv_words[2]);
+    store32(&bytes_out[3 * 4], cv_words[3]);
+    store32(&bytes_out[4 * 4], cv_words[4]);
+    store32(&bytes_out[5 * 4], cv_words[5]);
+    store32(&bytes_out[6 * 4], cv_words[6]);
+    store32(&bytes_out[7 * 4], cv_words[7]);
+}
+
+void blake3_compress_in_place(u32 cv[8],
+                              const u8 block[BLAKE3_BLOCK_LEN],
+                              u8 block_len, u64 counter, u8 flags);
+
+void blake3_compress_xof(const u32 cv[8],
+                         const u8 block[BLAKE3_BLOCK_LEN],
+                         u8 block_len, u64 counter, u8 flags, u8 out[64]);
+
+void blake3_hash_many(const u8 *const *inputs, size_t num_inputs,
+                      size_t blocks, const u32 key[8], u64 counter,
+                      char increment_counter, u8 flags,
+                      u8 flags_start, u8 flags_end, u8 *out);
+
+size_t blake3_simd_degree(void);
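
These entry points are bound at runtime to one of the implementations declared below by `src/blake3/blake3_dispatch.c`. As a rough, hypothetical sketch of that kind of CPU-feature dispatch, and not the project's actual code:

```c
/* Hypothetical sketch only; the real selection logic lives in
 * src/blake3/blake3_dispatch.c and differs in detail. */
#if defined(__GNUC__) && defined(IS_X86)
static size_t
simd_degree_sketch(void)
{
    if (__builtin_cpu_supports("avx512f")) return 16;
    if (__builtin_cpu_supports("avx2"))    return 8;
    if (__builtin_cpu_supports("sse2"))    return 4;
    return 1;
}
#endif
```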
+
+
+// Declarations for implementation-specific functions.
+void blake3_compress_in_place_portable(u32 cv[8],
+                                       const u8 block[BLAKE3_BLOCK_LEN],
+                                       u8 block_len, u64 counter, u8 flags);
+
+void blake3_compress_xof_portable(const u32 cv[8],
+                                  const u8 block[BLAKE3_BLOCK_LEN],
+                                  u8 block_len, u64 counter, u8 flags, u8 out[64]);
+
+void blake3_hash_many_portable(const u8 *const *inputs, size_t num_inputs,
+                               size_t blocks, const u32 key[8],
+                               u64 counter, char increment_counter,
+                               u8 flags, u8 flags_start,
+                               u8 flags_end, u8 *out);
+
+#if defined(IS_X86)
+#if !defined(BLAKE3_NO_SSE2)
+void blake3_compress_in_place_sse2(u32 cv[8],
+                                   const u8 block[BLAKE3_BLOCK_LEN],
+                                   u8 block_len, u64 counter,
+                                   u8 flags);
+void blake3_compress_xof_sse2(const u32 cv[8],
+                              const u8 block[BLAKE3_BLOCK_LEN],
+                              u8 block_len, u64 counter,
+                              u8 flags, u8 out[64]);
+void blake3_hash_many_sse2(const u8 *const *inputs, size_t num_inputs,
+                           size_t blocks, const u32 key[8],
+                           u64 counter, char increment_counter,
+                           u8 flags, u8 flags_start,
+                           u8 flags_end, u8 *out);
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+void blake3_compress_in_place_sse41(u32 cv[8],
+                                    const u8 block[BLAKE3_BLOCK_LEN],
+                                    u8 block_len, u64 counter,
+                                    u8 flags);
+void blake3_compress_xof_sse41(const u32 cv[8],
+                               const u8 block[BLAKE3_BLOCK_LEN],
+                               u8 block_len, u64 counter,
+                               u8 flags, u8 out[64]);
+void blake3_hash_many_sse41(const u8 *const *inputs, size_t num_inputs,
+                            size_t blocks, const u32 key[8],
+                            u64 counter, char increment_counter,
+                            u8 flags, u8 flags_start,
+                            u8 flags_end, u8 *out);
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+void blake3_hash_many_avx2(const u8 *const *inputs, size_t num_inputs,
+                           size_t blocks, const u32 key[8],
+                           u64 counter, char increment_counter,
+                           u8 flags, u8 flags_start,
+                           u8 flags_end, u8 *out);
+#endif
+#if !defined(BLAKE3_NO_AVX512)
+void blake3_compress_in_place_avx512(u32 cv[8],
+                                     const u8 block[BLAKE3_BLOCK_LEN],
+                                     u8 block_len, u64 counter,
+                                     u8 flags);
+
+void blake3_compress_xof_avx512(const u32 cv[8],
+                                const u8 block[BLAKE3_BLOCK_LEN],
+                                u8 block_len, u64 counter,
+                                u8 flags, u8 out[64]);
+
+void blake3_hash_many_avx512(const u8 *const *inputs, size_t num_inputs,
+                             size_t blocks, const u32 key[8],
+                             u64 counter, char increment_counter,
+                             u8 flags, u8 flags_start,
+                             u8 flags_end, u8 *out);
+#endif
+#endif
+
+#if BLAKE3_USE_NEON == 1
+void blake3_hash_many_neon(const u8 *const *inputs, size_t num_inputs,
+                           size_t blocks, const u32 key[8],
+                           u64 counter, char increment_counter,
+                           u8 flags, u8 flags_start,
+                           u8 flags_end, u8 *out);
+#endif
+
+
+#endif /* LIMB_BLAKE3_BLAKE3_H */
diff --git a/include/limb/blake3.h b/include/limb/blake3.h
new file mode 100644
index 0000000..1520f5a
--- /dev/null
+++ b/include/limb/blake3.h
@@ -0,0 +1,43 @@
+#ifndef LIMB_BLAKE3_H
+#define LIMB_BLAKE3_H
+
+#include <string.h> /* size_t */
+#include "limb/int.h"
+
+#define BLAKE3_KEY_LEN 32
+#define BLAKE3_OUT_LEN 32
+#define BLAKE3_BLOCK_LEN 64
+#define BLAKE3_CHUNK_LEN 1024
+#define BLAKE3_MAX_DEPTH 54
+
+/* This struct is a private implementation detail. It has to be here because
+ * it's part of blake3_hasher below. */
+typedef struct {
+  u32 cv[8];
+  u64 chunk_counter;
+  u8 buf[BLAKE3_BLOCK_LEN];
+  u8 buf_len;
+  u8 blocks_compressed;
+  u8 flags;
+} blake3_chunk_state;
+
+typedef struct {
+  u32 key[8];
+  blake3_chunk_state chunk;
+  u8 cv_stack_len;
+  /* The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
+   * with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
+   * requires a 4th entry, rather than merging everything down to 1, because we
+   * don't know whether more input is coming. This is different from how the
+   * reference implementation does things.
+   */
+  u8 cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
+} blake3_ctx;
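
One way to sanity-check the `+ 1` in that sizing (a back-of-the-envelope reading, not normative):

```c
/* With lazy merging, after n whole chunks the stack holds
 * popcnt(n - 1) + 1 chaining values: 7 chunks -> popcnt(6) + 1 == 3,
 * 8 chunks -> popcnt(7) + 1 == 4, matching the comment above. The chunk
 * counter can have at most BLAKE3_MAX_DEPTH set bits, so the worst case
 * is BLAKE3_MAX_DEPTH + 1 entries of BLAKE3_OUT_LEN bytes each. */
```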
+
+void blake3_init(blake3_ctx *ctx);
+void blake3_update(blake3_ctx *ctx, const void *msg, size_t size);
+void blake3_final(blake3_ctx *ctx, unsigned char * restrict md);
+
+void blake3(const void *msg, size_t size, unsigned char * restrict md);
+
+#endif /* LIMB_BLAKE3_H */
diff --git a/meta/AUTHORS b/meta/AUTHORS
index 7c47e4b..10768e8 100644
--- a/meta/AUTHORS
+++ b/meta/AUTHORS
@@ -3,3 +3,5 @@ Main author:
 
 Contributors:
 * Aleksey Kravchenko <rhash.admin@gmail.com> [sha3]
+* Samuel Neves [blake3]
+* Jack O'Connor [blake3]
diff --git a/meta/HISTORY b/meta/HISTORY
index e4dee8c..bc580dd 100644
--- a/meta/HISTORY
+++ b/meta/HISTORY
@@ -1,5 +1,16 @@
 # Current development
 
+- Add BLAKE3 functions: blake3_{init,update,final}
+
+  Also blake3() as helper to call all 3 functions at once.
+
+  The BLAKE3 implementation is the official C implementation:
+  Copyright (C) 2019-2020 Samuel Neves and Jack O'Connor
+  Released into the public domain with CC0 1.0.
+
+  https://github.com/BLAKE3-team/BLAKE3
+
+
 - Add SHA3 functions: sha3_{init,update,final}
 
   Also sha3() as helper to call all 3 functions at once.
@@ -10,8 +21,10 @@
 
   [RHash]: https://github.com/rhash/RHash
 
+
 - Add msb64() to find the most significant bit set.
 
+
 # version 0.0.1 [released on 2023-01-16]
 
 - First release.
diff --git a/meta/libs/limb b/meta/libs/limb
index 120aced..ab2f38a 100644
--- a/meta/libs/limb
+++ b/meta/libs/limb
@@ -14,5 +14,11 @@ src/sha3/sha3_final.o
 src/sha3/sha3_init.o
 src/sha3/sha3.o
 src/sha3/sha3_update.o
+# BLAKE3
+src/blake3/blake3.o
+src/blake3/blake3_impl.o
+src/blake3/blake3_dispatch.o
+src/blake3/blake3_portable.o
+$$(BLAKE3_OPTIMIZ)
 # skalibs dependency
 skalibs
diff --git a/project.mk b/project.mk
index 63a3fe3..f72ea69 100644
--- a/project.mk
+++ b/project.mk
@@ -1 +1,18 @@
 LIBS = limb
+
+ifeq ($(BITS),64)
+BLAKE3_OPTIMIZ := src/blake3/blake3_avx2_x86-64_unix.o \
+				  src/blake3/blake3_avx512_x86-64_unix.o \
+				  src/blake3/blake3_sse2_x86-64_unix.o \
+				  src/blake3/blake3_sse41_x86-64_unix.o
+BLAKE3_OPTIMIZ_so := $(BLAKE3_OPTIMIZ)
+else
+BLAKE3_OPTIMIZ := src/blake3/blake3_avx2.o src/blake3/blake3_avx512.o \
+				  src/blake3/blake3_sse2.o src/blake3/blake3_sse41.o
+BLAKE3_OPTIMIZ_so := $(BLAKE3_OPTIMIZ:.o=.lo)
+
+CFLAGS_blake3/blake3_avx2 = -mavx2
+CFLAGS_blake3/blake3_avx512 = -mavx512f -mavx512vl -mavx512bw
+CFLAGS_blake3/blake3_sse2 = -msse2
+CFLAGS_blake3/blake3_sse41 = -msse4.1
+endif
diff --git a/src/blake3/blake3.c b/src/blake3/blake3.c
new file mode 100644
index 0000000..f74933a
--- /dev/null
+++ b/src/blake3/blake3.c
@@ -0,0 +1,11 @@
+#include "blake3.h"
+
+void
+blake3(const void *msg, size_t size, unsigned char * restrict md)
+{
+    blake3_ctx ctx;
+
+    blake3_init(&ctx);
+    blake3_update(&ctx, msg, size);
+    blake3_final(&ctx, md);
+}
diff --git a/src/blake3/blake3_avx2.c b/src/blake3/blake3_avx2.c
new file mode 100644
index 0000000..f9590d6
--- /dev/null
+++ b/src/blake3/blake3_avx2.c
@@ -0,0 +1,326 @@
+#include "blake3.h"
+
+#include <immintrin.h>
+
+#define DEGREE 8
+
+INLINE __m256i loadu(const u8 src[32]) {
+  return _mm256_loadu_si256((const __m256i *)src);
+}
+
+INLINE void storeu(__m256i src, u8 dest[32]) {
+  _mm256_storeu_si256((__m256i *)dest, src);
+}
+
+INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
+
+// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
+
+INLINE __m256i set1(u32 x) { return _mm256_set1_epi32((int32_t)x); }
+
+INLINE __m256i rot16(__m256i x) {
+  return _mm256_shuffle_epi8(
+      x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
+                         13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
+}
+
+INLINE __m256i rot12(__m256i x) {
+  return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
+}
+
+INLINE __m256i rot8(__m256i x) {
+  return _mm256_shuffle_epi8(
+      x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
+                         12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
+}
+
+INLINE __m256i rot7(__m256i x) {
+  return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
+}
+
+INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) {
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[15] = rot16(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot12(v[4]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[15] = rot8(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot7(v[4]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot16(v[15]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[4] = rot12(v[4]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot8(v[15]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+  v[4] = rot7(v[4]);
+}
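
`round_fn` above is the standard BLAKE3 round run eight lanes wide: four column steps, then four diagonal steps, each the usual G function with rotations 16/12/8/7. For reference, a scalar sketch of one G application:

```c
#include <stddef.h>
#include <stdint.h>

static inline uint32_t rotr32(uint32_t w, unsigned c) {
  return (w >> c) | (w << (32 - c));
}

/* One scalar G step on state words a, b, c, d with message words x, y.
 * round_fn() does this for all four columns, then all four diagonals,
 * with every variable holding eight lanes at once. */
static inline void g(uint32_t v[16], size_t a, size_t b, size_t c, size_t d,
                     uint32_t x, uint32_t y) {
  v[a] = v[a] + v[b] + x;
  v[d] = rotr32(v[d] ^ v[a], 16);
  v[c] = v[c] + v[d];
  v[b] = rotr32(v[b] ^ v[c], 12);
  v[a] = v[a] + v[b] + y;
  v[d] = rotr32(v[d] ^ v[a], 8);
  v[c] = v[c] + v[d];
  v[b] = rotr32(v[b] ^ v[c], 7);
}
```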
+
+INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
+  // is 22/33/66/77.
+  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
+  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
+  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
+  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
+  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
+  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
+  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
+  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
+
+  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
+  // 11/33.
+  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
+  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
+  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
+  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
+  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
+  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
+  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
+  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
+
+  // Interleave 128-bit lanes.
+  vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
+  vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
+  vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
+  vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
+  vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
+  vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
+  vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
+  vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
+}
+
+INLINE void transpose_msg_vecs(const u8 *const *inputs,
+                               size_t block_offset, __m256i out[16]) {
+  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
+  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
+  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
+  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
+  out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
+  out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
+  out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
+  out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
+  out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
+  out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
+  out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
+  out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
+  out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
+  out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
+  out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
+  out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
+  for (size_t i = 0; i < 8; ++i) {
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
+  }
+  transpose_vecs(&out[0]);
+  transpose_vecs(&out[8]);
+}
+
+INLINE void load_counters(u64 counter, char increment_counter,
+                          __m256i *out_lo, __m256i *out_hi) {
+  const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
+  const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  const __m256i add1 = _mm256_and_si256(mask, add0);
+  __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
+  __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), 
+                                     _mm256_xor_si256(   l, _mm256_set1_epi32(0x80000000)));
+  __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
+  *out_lo = l;
+  *out_hi = h;
+}
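
Since AVX2 has no unsigned 32-bit compare, `load_counters` flips the sign bit of both operands and uses a signed compare to detect the low-word wrap-around. A scalar sketch of what one lane computes:

```c
/* Per-lane equivalent of load_counters() (illustrative sketch). */
static inline void load_counter_lane(u64 counter, char increment_counter,
                                     u32 lane, u32 *lo, u32 *hi) {
  u32 add = increment_counter ? lane : 0;
  u32 l = (u32)counter + add;      /* may wrap around */
  u32 carry = add > l;             /* wrapped iff the addend exceeds the sum */
  *lo = l;
  *hi = (u32)(counter >> 32) + carry;
}
```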
+
+static
+void blake3_hash8_avx2(const u8 *const *inputs, size_t blocks,
+                       const u32 key[8], u64 counter,
+                       char increment_counter, u8 flags,
+                       u8 flags_start, u8 flags_end, u8 *out) {
+  __m256i h_vecs[8] = {
+      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
+      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
+  };
+  __m256i counter_low_vec, counter_high_vec;
+  load_counters(counter, increment_counter, &counter_low_vec,
+                &counter_high_vec);
+  u8 block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN);
+    __m256i block_flags_vec = set1(block_flags);
+    __m256i msg_vecs[16];
+    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m256i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
+        set1(IV[0]),     set1(IV[1]),      set1(IV[2]),   set1(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+    };
+    round_fn(v, msg_vecs, 0);
+    round_fn(v, msg_vecs, 1);
+    round_fn(v, msg_vecs, 2);
+    round_fn(v, msg_vecs, 3);
+    round_fn(v, msg_vecs, 4);
+    round_fn(v, msg_vecs, 5);
+    round_fn(v, msg_vecs, 6);
+    h_vecs[0] = xorv(v[0], v[8]);
+    h_vecs[1] = xorv(v[1], v[9]);
+    h_vecs[2] = xorv(v[2], v[10]);
+    h_vecs[3] = xorv(v[3], v[11]);
+    h_vecs[4] = xorv(v[4], v[12]);
+    h_vecs[5] = xorv(v[5], v[13]);
+    h_vecs[6] = xorv(v[6], v[14]);
+    h_vecs[7] = xorv(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs(h_vecs);
+  storeu(h_vecs[0], &out[0 * sizeof(__m256i)]);
+  storeu(h_vecs[1], &out[1 * sizeof(__m256i)]);
+  storeu(h_vecs[2], &out[2 * sizeof(__m256i)]);
+  storeu(h_vecs[3], &out[3 * sizeof(__m256i)]);
+  storeu(h_vecs[4], &out[4 * sizeof(__m256i)]);
+  storeu(h_vecs[5], &out[5 * sizeof(__m256i)]);
+  storeu(h_vecs[6], &out[6 * sizeof(__m256i)]);
+  storeu(h_vecs[7], &out[7 * sizeof(__m256i)]);
+}
+
+#if !defined(BLAKE3_NO_SSE41)
+void blake3_hash_many_sse41(const u8 *const *inputs, size_t num_inputs,
+                            size_t blocks, const u32 key[8],
+                            u64 counter, char increment_counter,
+                            u8 flags, u8 flags_start,
+                            u8 flags_end, u8 *out);
+#else
+void blake3_hash_many_portable(const u8 *const *inputs, size_t num_inputs,
+                               size_t blocks, const u32 key[8],
+                               u64 counter, char increment_counter,
+                               u8 flags, u8 flags_start,
+                               u8 flags_end, u8 *out);
+#endif
+
+void blake3_hash_many_avx2(const u8 *const *inputs, size_t num_inputs,
+                           size_t blocks, const u32 key[8],
+                           u64 counter, char increment_counter,
+                           u8 flags, u8 flags_start,
+                           u8 flags_end, u8 *out) {
+  while (num_inputs >= DEGREE) {
+    blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += DEGREE;
+    }
+    inputs += DEGREE;
+    num_inputs -= DEGREE;
+    out = &out[DEGREE * BLAKE3_OUT_LEN];
+  }
+#if !defined(BLAKE3_NO_SSE41)
+  blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+                         increment_counter, flags, flags_start, flags_end, out);
+#else
+  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+                            increment_counter, flags, flags_start, flags_end,
+                            out);
+#endif
+}
diff --git a/src/blake3/blake3_avx2_x86-64_unix.S b/src/blake3/blake3_avx2_x86-64_unix.S
new file mode 100644
index 0000000..812bb85
--- /dev/null
+++ b/src/blake3/blake3_avx2_x86-64_unix.S
@@ -0,0 +1,1815 @@
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global _blake3_hash_many_avx2
+.global blake3_hash_many_avx2
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
+        .p2align  6
+_blake3_hash_many_avx2:
+blake3_hash_many_avx2:
+        _CET_ENDBR
+        push    r15
+        push    r14
+        push    r13
+        push    r12
+        push    rbx
+        push    rbp
+        mov     rbp, rsp
+        sub     rsp, 680
+        and     rsp, 0xFFFFFFFFFFFFFFC0
+        neg     r9d
+        vmovd   xmm0, r9d
+        vpbroadcastd ymm0, xmm0
+        vmovdqa ymmword ptr [rsp+0x280], ymm0
+        vpand   ymm1, ymm0, ymmword ptr [ADD0+rip]
+        vpand   ymm2, ymm0, ymmword ptr [ADD1+rip]
+        vmovdqa ymmword ptr [rsp+0x220], ymm2
+        vmovd   xmm2, r8d
+        vpbroadcastd ymm2, xmm2
+        vpaddd  ymm2, ymm2, ymm1
+        vmovdqa ymmword ptr [rsp+0x240], ymm2
+        vpxor   ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+        vpxor   ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip]
+        vpcmpgtd ymm2, ymm1, ymm2
+        shr     r8, 32
+        vmovd   xmm3, r8d
+        vpbroadcastd ymm3, xmm3
+        vpsubd  ymm3, ymm3, ymm2
+        vmovdqa ymmword ptr [rsp+0x260], ymm3
+        shl     rdx, 6
+        mov     qword ptr [rsp+0x2A0], rdx
+        cmp     rsi, 8
+        jc      3f
+2:
+        vpbroadcastd ymm0, dword ptr [rcx]
+        vpbroadcastd ymm1, dword ptr [rcx+0x4]
+        vpbroadcastd ymm2, dword ptr [rcx+0x8]
+        vpbroadcastd ymm3, dword ptr [rcx+0xC]
+        vpbroadcastd ymm4, dword ptr [rcx+0x10]
+        vpbroadcastd ymm5, dword ptr [rcx+0x14]
+        vpbroadcastd ymm6, dword ptr [rcx+0x18]
+        vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        mov     r10, qword ptr [rdi+0x10]
+        mov     r11, qword ptr [rdi+0x18]
+        mov     r12, qword ptr [rdi+0x20]
+        mov     r13, qword ptr [rdi+0x28]
+        mov     r14, qword ptr [rdi+0x30]
+        mov     r15, qword ptr [rdi+0x38]
+        movzx   eax, byte ptr [rbp+0x38]
+        movzx   ebx, byte ptr [rbp+0x40]
+        or      eax, ebx
+        xor     edx, edx
+.p2align  5
+9:
+        movzx   ebx, byte ptr [rbp+0x48]
+        or      ebx, eax
+        add     rdx, 64
+        cmp     rdx, qword ptr [rsp+0x2A0]
+        cmove   eax, ebx
+        mov     dword ptr [rsp+0x200], eax
+        vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+        vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+        vunpcklpd ymm12, ymm8, ymm9
+        vunpckhpd ymm13, ymm8, ymm9
+        vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+        vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+        vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+        vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+        vunpcklpd ymm14, ymm10, ymm11
+        vunpckhpd ymm15, ymm10, ymm11
+        vshufps ymm8, ymm12, ymm14, 136
+        vmovaps ymmword ptr [rsp], ymm8
+        vshufps ymm9, ymm12, ymm14, 221
+        vmovaps ymmword ptr [rsp+0x20], ymm9
+        vshufps ymm10, ymm13, ymm15, 136
+        vmovaps ymmword ptr [rsp+0x40], ymm10
+        vshufps ymm11, ymm13, ymm15, 221
+        vmovaps ymmword ptr [rsp+0x60], ymm11
+        vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+        vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+        vunpcklpd ymm12, ymm8, ymm9
+        vunpckhpd ymm13, ymm8, ymm9
+        vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+        vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+        vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+        vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+        vunpcklpd ymm14, ymm10, ymm11
+        vunpckhpd ymm15, ymm10, ymm11
+        vshufps ymm8, ymm12, ymm14, 136
+        vmovaps ymmword ptr [rsp+0x80], ymm8
+        vshufps ymm9, ymm12, ymm14, 221
+        vmovaps ymmword ptr [rsp+0xA0], ymm9
+        vshufps ymm10, ymm13, ymm15, 136
+        vmovaps ymmword ptr [rsp+0xC0], ymm10
+        vshufps ymm11, ymm13, ymm15, 221
+        vmovaps ymmword ptr [rsp+0xE0], ymm11
+        vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+        vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+        vunpcklpd ymm12, ymm8, ymm9
+        vunpckhpd ymm13, ymm8, ymm9
+        vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+        vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+        vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+        vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+        vunpcklpd ymm14, ymm10, ymm11
+        vunpckhpd ymm15, ymm10, ymm11
+        vshufps ymm8, ymm12, ymm14, 136
+        vmovaps ymmword ptr [rsp+0x100], ymm8
+        vshufps ymm9, ymm12, ymm14, 221
+        vmovaps ymmword ptr [rsp+0x120], ymm9
+        vshufps ymm10, ymm13, ymm15, 136
+        vmovaps ymmword ptr [rsp+0x140], ymm10
+        vshufps ymm11, ymm13, ymm15, 221
+        vmovaps ymmword ptr [rsp+0x160], ymm11
+        vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+        vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+        vunpcklpd ymm12, ymm8, ymm9
+        vunpckhpd ymm13, ymm8, ymm9
+        vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+        vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+        vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+        vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+        vunpcklpd ymm14, ymm10, ymm11
+        vunpckhpd ymm15, ymm10, ymm11
+        vshufps ymm8, ymm12, ymm14, 136
+        vmovaps ymmword ptr [rsp+0x180], ymm8
+        vshufps ymm9, ymm12, ymm14, 221
+        vmovaps ymmword ptr [rsp+0x1A0], ymm9
+        vshufps ymm10, ymm13, ymm15, 136
+        vmovaps ymmword ptr [rsp+0x1C0], ymm10
+        vshufps ymm11, ymm13, ymm15, 221
+        vmovaps ymmword ptr [rsp+0x1E0], ymm11
+        vpbroadcastd ymm15, dword ptr [rsp+0x200]
+        prefetcht0 [r8+rdx+0x80]
+        prefetcht0 [r12+rdx+0x80]
+        prefetcht0 [r9+rdx+0x80]
+        prefetcht0 [r13+rdx+0x80]
+        prefetcht0 [r10+rdx+0x80]
+        prefetcht0 [r14+rdx+0x80]
+        prefetcht0 [r11+rdx+0x80]
+        prefetcht0 [r15+rdx+0x80]
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x40]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x80]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0xC0]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm0, ymmword ptr [rsp+0x240]
+        vpxor   ymm13, ymm1, ymmword ptr [rsp+0x260]
+        vpxor   ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip]
+        vpxor   ymm15, ymm3, ymm15
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip]
+        vpaddd  ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip]
+        vpaddd  ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip]
+        vpaddd  ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip]
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x20]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x60]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0xA0]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0xE0]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x100]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x140]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x180]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x120]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x160]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x40]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x60]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0xE0]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x80]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0xC0]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x140]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x20]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x180]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x120]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x160]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0xA0]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x100]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x60]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x140]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0xE0]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x80]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x180]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x40]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0xC0]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x120]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x160]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x100]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0xA0]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x20]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x140]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x180]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0xE0]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x120]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x60]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x80]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x160]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0xA0]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x20]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x40]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x100]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0xC0]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x180]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x120]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x160]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x140]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x100]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0xE0]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0xA0]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0xC0]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x40]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x60]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x20]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x80]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x120]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x160]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x100]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0xA0]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x180]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x20]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x40]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x80]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x60]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x140]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0xC0]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0xE0]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x160]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0xA0]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x20]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x100]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x1E0]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x120]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0xC0]
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxor   ymm12, ymm12, ymm0
+        vpxor   ymm13, ymm13, ymm1
+        vpxor   ymm14, ymm14, ymm2
+        vpxor   ymm15, ymm15, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpshufb ymm15, ymm15, ymm8
+        vpaddd  ymm8, ymm12, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxor   ymm4, ymm4, ymm8
+        vpxor   ymm5, ymm5, ymm9
+        vpxor   ymm6, ymm6, ymm10
+        vpxor   ymm7, ymm7, ymm11
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x40]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x60]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0xE0]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vmovdqa ymmword ptr [rsp+0x200], ymm8
+        vpsrld  ymm8, ymm5, 12
+        vpslld  ymm5, ymm5, 20
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 12
+        vpslld  ymm6, ymm6, 20
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 12
+        vpslld  ymm7, ymm7, 20
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 12
+        vpslld  ymm4, ymm4, 20
+        vpor    ymm4, ymm4, ymm8
+        vpaddd  ymm0, ymm0, ymmword ptr [rsp+0x140]
+        vpaddd  ymm1, ymm1, ymmword ptr [rsp+0x180]
+        vpaddd  ymm2, ymm2, ymmword ptr [rsp+0x80]
+        vpaddd  ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxor   ymm15, ymm15, ymm0
+        vpxor   ymm12, ymm12, ymm1
+        vpxor   ymm13, ymm13, ymm2
+        vpxor   ymm14, ymm14, ymm3
+        vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+        vpshufb ymm15, ymm15, ymm8
+        vpshufb ymm12, ymm12, ymm8
+        vpshufb ymm13, ymm13, ymm8
+        vpshufb ymm14, ymm14, ymm8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm13, ymmword ptr [rsp+0x200]
+        vpaddd  ymm9, ymm9, ymm14
+        vpxor   ymm5, ymm5, ymm10
+        vpxor   ymm6, ymm6, ymm11
+        vpxor   ymm7, ymm7, ymm8
+        vpxor   ymm4, ymm4, ymm9
+        vpxor   ymm0, ymm0, ymm8
+        vpxor   ymm1, ymm1, ymm9
+        vpxor   ymm2, ymm2, ymm10
+        vpxor   ymm3, ymm3, ymm11
+        vpsrld  ymm8, ymm5, 7
+        vpslld  ymm5, ymm5, 25
+        vpor    ymm5, ymm5, ymm8
+        vpsrld  ymm8, ymm6, 7
+        vpslld  ymm6, ymm6, 25
+        vpor    ymm6, ymm6, ymm8
+        vpsrld  ymm8, ymm7, 7
+        vpslld  ymm7, ymm7, 25
+        vpor    ymm7, ymm7, ymm8
+        vpsrld  ymm8, ymm4, 7
+        vpslld  ymm4, ymm4, 25
+        vpor    ymm4, ymm4, ymm8
+        vpxor   ymm4, ymm4, ymm12
+        vpxor   ymm5, ymm5, ymm13
+        vpxor   ymm6, ymm6, ymm14
+        vpxor   ymm7, ymm7, ymm15
+        movzx   eax, byte ptr [rbp+0x38]
+        jne     9b
+        mov     rbx, qword ptr [rbp+0x50]
+        vunpcklps ymm8, ymm0, ymm1
+        vunpcklps ymm9, ymm2, ymm3
+        vunpckhps ymm10, ymm0, ymm1
+        vunpcklps ymm11, ymm4, ymm5
+        vunpcklps ymm0, ymm6, ymm7
+        vshufps ymm12, ymm8, ymm9, 78
+        vblendps ymm1, ymm8, ymm12, 0xCC
+        vshufps ymm8, ymm11, ymm0, 78
+        vunpckhps ymm13, ymm2, ymm3
+        vblendps ymm2, ymm11, ymm8, 0xCC
+        vblendps ymm3, ymm12, ymm9, 0xCC
+        vperm2f128 ymm12, ymm1, ymm2, 0x20
+        vmovups ymmword ptr [rbx], ymm12
+        vunpckhps ymm14, ymm4, ymm5
+        vblendps ymm4, ymm8, ymm0, 0xCC
+        vunpckhps ymm15, ymm6, ymm7
+        vperm2f128 ymm7, ymm3, ymm4, 0x20
+        vmovups ymmword ptr [rbx+0x20], ymm7
+        vshufps ymm5, ymm10, ymm13, 78
+        vblendps ymm6, ymm5, ymm13, 0xCC
+        vshufps ymm13, ymm14, ymm15, 78
+        vblendps ymm10, ymm10, ymm5, 0xCC
+        vblendps ymm14, ymm14, ymm13, 0xCC
+        vperm2f128 ymm8, ymm10, ymm14, 0x20
+        vmovups ymmword ptr [rbx+0x40], ymm8
+        vblendps ymm15, ymm13, ymm15, 0xCC
+        vperm2f128 ymm13, ymm6, ymm15, 0x20
+        vmovups ymmword ptr [rbx+0x60], ymm13
+        vperm2f128 ymm9, ymm1, ymm2, 0x31
+        vperm2f128 ymm11, ymm3, ymm4, 0x31
+        vmovups ymmword ptr [rbx+0x80], ymm9
+        vperm2f128 ymm14, ymm10, ymm14, 0x31
+        vperm2f128 ymm15, ymm6, ymm15, 0x31
+        vmovups ymmword ptr [rbx+0xA0], ymm11
+        vmovups ymmword ptr [rbx+0xC0], ymm14
+        vmovups ymmword ptr [rbx+0xE0], ymm15
+        vmovdqa ymm0, ymmword ptr [rsp+0x220]
+        vpaddd  ymm1, ymm0, ymmword ptr [rsp+0x240]
+        vmovdqa ymmword ptr [rsp+0x240], ymm1
+        vpxor   ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip]
+        vpxor   ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+        vpcmpgtd ymm2, ymm0, ymm2
+        vmovdqa ymm0, ymmword ptr [rsp+0x260]
+        vpsubd  ymm2, ymm0, ymm2
+        vmovdqa ymmword ptr [rsp+0x260], ymm2
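+        /* CMP_MSB_MASK flips the sign bit of the old and new low counter
+           words so that the signed vpcmpgtd acts as an unsigned compare:
+           lanes whose low word wrapped yield -1, which the vpsubd above
+           turns into a +1 carry into the high counter word. */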
+        add     rdi, 64
+        add     rbx, 256
+        mov     qword ptr [rbp+0x50], rbx
+        sub     rsi, 8
+        cmp     rsi, 8
+        jnc     2b
+        test    rsi, rsi
+        jnz     3f
+4:
+        vzeroupper
+        mov     rsp, rbp
+        pop     rbp
+        pop     rbx
+        pop     r12
+        pop     r13
+        pop     r14
+        pop     r15
+        ret
+.p2align  5
+3:
+        mov     rbx, qword ptr [rbp+0x50]
+        mov     r15, qword ptr [rsp+0x2A0]
+        movzx   r13d, byte ptr [rbp+0x38]
+        movzx   r12d, byte ptr [rbp+0x48]
+        test    rsi, 0x4
+        je      3f
+        vbroadcasti128 ymm0, xmmword ptr [rcx]
+        vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+        vmovdqa ymm8, ymm0
+        vmovdqa ymm9, ymm1
+        vbroadcasti128 ymm12, xmmword ptr [rsp+0x240]
+        vbroadcasti128 ymm13, xmmword ptr [rsp+0x260]
+        vpunpckldq ymm14, ymm12, ymm13
+        vpunpckhdq ymm15, ymm12, ymm13
+        vpermq  ymm14, ymm14, 0x50
+        vpermq  ymm15, ymm15, 0x50
+        vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+        vpblendd ymm14, ymm14, ymm12, 0x44
+        vpblendd ymm15, ymm15, ymm12, 0x44
+        vmovdqa ymmword ptr [rsp], ymm14
+        vmovdqa ymmword ptr [rsp+0x20], ymm15
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        mov     r10, qword ptr [rdi+0x10]
+        mov     r11, qword ptr [rdi+0x18]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
+.p2align  5
+2:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        mov     dword ptr [rsp+0x200], eax
+        vmovups ymm2, ymmword ptr [r8+rdx-0x40]
+        vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01
+        vmovups ymm3, ymmword ptr [r8+rdx-0x30]
+        vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01
+        vshufps ymm4, ymm2, ymm3, 136
+        vshufps ymm5, ymm2, ymm3, 221
+        vmovups ymm2, ymmword ptr [r8+rdx-0x20]
+        vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01
+        vmovups ymm3, ymmword ptr [r8+rdx-0x10]
+        vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01
+        vshufps ymm6, ymm2, ymm3, 136
+        vshufps ymm7, ymm2, ymm3, 221
+        vpshufd ymm6, ymm6, 0x93
+        vpshufd ymm7, ymm7, 0x93
+        vmovups ymm10, ymmword ptr [r10+rdx-0x40]
+        vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01
+        vmovups ymm11, ymmword ptr [r10+rdx-0x30]
+        vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01
+        vshufps ymm12, ymm10, ymm11, 136
+        vshufps ymm13, ymm10, ymm11, 221
+        vmovups ymm10, ymmword ptr [r10+rdx-0x20]
+        vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01
+        vmovups ymm11, ymmword ptr [r10+rdx-0x10]
+        vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01
+        vshufps ymm14, ymm10, ymm11, 136
+        vshufps ymm15, ymm10, ymm11, 221
+        vpshufd ymm14, ymm14, 0x93
+        vpshufd ymm15, ymm15, 0x93
+        prefetcht0 [r8+rdx+0x80]
+        prefetcht0 [r9+rdx+0x80]
+        prefetcht0 [r10+rdx+0x80]
+        prefetcht0 [r11+rdx+0x80]
+        vpbroadcastd ymm2, dword ptr [rsp+0x200]
+        vmovdqa ymm3, ymmword ptr [rsp]
+        vmovdqa ymm11, ymmword ptr [rsp+0x20]
+        vpblendd ymm3, ymm3, ymm2, 0x88
+        vpblendd ymm11, ymm11, ymm2, 0x88
+        vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+        vmovdqa ymm10, ymm2
+        mov     al, 7
+9:
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm8, ymm8, ymm12
+        vmovdqa ymmword ptr [rsp+0x40], ymm4
+        nop
+        vmovdqa ymmword ptr [rsp+0x60], ymm12
+        nop
+        vpaddd  ymm0, ymm0, ymm1
+        vpaddd  ymm8, ymm8, ymm9
+        vpxor   ymm3, ymm3, ymm0
+        vpxor   ymm11, ymm11, ymm8
+        vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+        vpshufb ymm3, ymm3, ymm4
+        vpshufb ymm11, ymm11, ymm4
+        vpaddd  ymm2, ymm2, ymm3
+        vpaddd  ymm10, ymm10, ymm11
+        vpxor   ymm1, ymm1, ymm2
+        vpxor   ymm9, ymm9, ymm10
+        vpsrld  ymm4, ymm1, 12
+        vpslld  ymm1, ymm1, 20
+        vpor    ymm1, ymm1, ymm4
+        vpsrld  ymm4, ymm9, 12
+        vpslld  ymm9, ymm9, 20
+        vpor    ymm9, ymm9, ymm4
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm0, ymm0, ymm1
+        vpaddd  ymm8, ymm8, ymm9
+        vmovdqa ymmword ptr [rsp+0x80], ymm5
+        vmovdqa ymmword ptr [rsp+0xA0], ymm13
+        vpxor   ymm3, ymm3, ymm0
+        vpxor   ymm11, ymm11, ymm8
+        vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+        vpshufb ymm3, ymm3, ymm4
+        vpshufb ymm11, ymm11, ymm4
+        vpaddd  ymm2, ymm2, ymm3
+        vpaddd  ymm10, ymm10, ymm11
+        vpxor   ymm1, ymm1, ymm2
+        vpxor   ymm9, ymm9, ymm10
+        vpsrld  ymm4, ymm1, 7
+        vpslld  ymm1, ymm1, 25
+        vpor    ymm1, ymm1, ymm4
+        vpsrld  ymm4, ymm9, 7
+        vpslld  ymm9, ymm9, 25
+        vpor    ymm9, ymm9, ymm4
+        vpshufd ymm0, ymm0, 0x93
+        vpshufd ymm8, ymm8, 0x93
+        vpshufd ymm3, ymm3, 0x4E
+        vpshufd ymm11, ymm11, 0x4E
+        vpshufd ymm2, ymm2, 0x39
+        vpshufd ymm10, ymm10, 0x39
+        vpaddd  ymm0, ymm0, ymm6
+        vpaddd  ymm8, ymm8, ymm14
+        vpaddd  ymm0, ymm0, ymm1
+        vpaddd  ymm8, ymm8, ymm9
+        vpxor   ymm3, ymm3, ymm0
+        vpxor   ymm11, ymm11, ymm8
+        vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+        vpshufb ymm3, ymm3, ymm4
+        vpshufb ymm11, ymm11, ymm4
+        vpaddd  ymm2, ymm2, ymm3
+        vpaddd  ymm10, ymm10, ymm11
+        vpxor   ymm1, ymm1, ymm2
+        vpxor   ymm9, ymm9, ymm10
+        vpsrld  ymm4, ymm1, 12
+        vpslld  ymm1, ymm1, 20
+        vpor    ymm1, ymm1, ymm4
+        vpsrld  ymm4, ymm9, 12
+        vpslld  ymm9, ymm9, 20
+        vpor    ymm9, ymm9, ymm4
+        vpaddd  ymm0, ymm0, ymm7
+        vpaddd  ymm8, ymm8, ymm15
+        vpaddd  ymm0, ymm0, ymm1
+        vpaddd  ymm8, ymm8, ymm9
+        vpxor   ymm3, ymm3, ymm0
+        vpxor   ymm11, ymm11, ymm8
+        vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+        vpshufb ymm3, ymm3, ymm4
+        vpshufb ymm11, ymm11, ymm4
+        vpaddd  ymm2, ymm2, ymm3
+        vpaddd  ymm10, ymm10, ymm11
+        vpxor   ymm1, ymm1, ymm2
+        vpxor   ymm9, ymm9, ymm10
+        vpsrld  ymm4, ymm1, 7
+        vpslld  ymm1, ymm1, 25
+        vpor    ymm1, ymm1, ymm4
+        vpsrld  ymm4, ymm9, 7
+        vpslld  ymm9, ymm9, 25
+        vpor    ymm9, ymm9, ymm4
+        vpshufd ymm0, ymm0, 0x39
+        vpshufd ymm8, ymm8, 0x39
+        vpshufd ymm3, ymm3, 0x4E
+        vpshufd ymm11, ymm11, 0x4E
+        vpshufd ymm2, ymm2, 0x93
+        vpshufd ymm10, ymm10, 0x93
+        dec     al
+        je      9f
+        vmovdqa ymm4, ymmword ptr [rsp+0x40]
+        vmovdqa ymm5, ymmword ptr [rsp+0x80]
+        vshufps ymm12, ymm4, ymm5, 214
+        vpshufd ymm13, ymm4, 0x0F
+        vpshufd ymm4, ymm12, 0x39
+        vshufps ymm12, ymm6, ymm7, 250
+        vpblendd ymm13, ymm13, ymm12, 0xAA
+        vpunpcklqdq ymm12, ymm7, ymm5
+        vpblendd ymm12, ymm12, ymm6, 0x88
+        vpshufd ymm12, ymm12, 0x78
+        vpunpckhdq ymm5, ymm5, ymm7
+        vpunpckldq ymm6, ymm6, ymm5
+        vpshufd ymm7, ymm6, 0x1E
+        vmovdqa ymmword ptr [rsp+0x40], ymm13
+        vmovdqa ymmword ptr [rsp+0x80], ymm12
+        vmovdqa ymm12, ymmword ptr [rsp+0x60]
+        vmovdqa ymm13, ymmword ptr [rsp+0xA0]
+        vshufps ymm5, ymm12, ymm13, 214
+        vpshufd ymm6, ymm12, 0x0F
+        vpshufd ymm12, ymm5, 0x39
+        vshufps ymm5, ymm14, ymm15, 250
+        vpblendd ymm6, ymm6, ymm5, 0xAA
+        vpunpcklqdq ymm5, ymm15, ymm13
+        vpblendd ymm5, ymm5, ymm14, 0x88
+        vpshufd ymm5, ymm5, 0x78
+        vpunpckhdq ymm13, ymm13, ymm15
+        vpunpckldq ymm14, ymm14, ymm13
+        vpshufd ymm15, ymm14, 0x1E
+        vmovdqa ymm13, ymm6
+        vmovdqa ymm14, ymm5
+        vmovdqa ymm5, ymmword ptr [rsp+0x40]
+        vmovdqa ymm6, ymmword ptr [rsp+0x80]
+        jmp     9b
+9:
+        vpxor   ymm0, ymm0, ymm2
+        vpxor   ymm1, ymm1, ymm3
+        vpxor   ymm8, ymm8, ymm10
+        vpxor   ymm9, ymm9, ymm11
+        mov     eax, r13d
+        cmp     rdx, r15
+        jne     2b
+        vmovdqu xmmword ptr [rbx], xmm0
+        vmovdqu xmmword ptr [rbx+0x10], xmm1
+        vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+        vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+        vmovdqu xmmword ptr [rbx+0x40], xmm8
+        vmovdqu xmmword ptr [rbx+0x50], xmm9
+        vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01
+        vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01
+        vmovaps xmm8, xmmword ptr [rsp+0x280]
+        vmovaps xmm0, xmmword ptr [rsp+0x240]
+        vmovaps xmm1, xmmword ptr [rsp+0x250]
+        vmovaps xmm2, xmmword ptr [rsp+0x260]
+        vmovaps xmm3, xmmword ptr [rsp+0x270]
+        vblendvps xmm0, xmm0, xmm1, xmm8
+        vblendvps xmm2, xmm2, xmm3, xmm8
+        vmovaps xmmword ptr [rsp+0x240], xmm0
+        vmovaps xmmword ptr [rsp+0x260], xmm2
+        add     rbx, 128
+        add     rdi, 32
+        sub     rsi, 4
+3:
+        test    rsi, 0x2
+        je      3f
+        vbroadcasti128 ymm0, xmmword ptr [rcx]
+        vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+        vmovd   xmm13, dword ptr [rsp+0x240]
+        vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1
+        vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        vmovd   xmm14, dword ptr [rsp+0x244]
+        vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1
+        vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        vinserti128 ymm13, ymm13, xmm14, 0x01
+        vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
+        vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
+.p2align  5
+2:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        mov     dword ptr [rsp+0x200], eax
+        vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+        vpbroadcastd ymm8, dword ptr [rsp+0x200]
+        vpblendd ymm3, ymm13, ymm8, 0x88
+        vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+        vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+        vshufps ymm4, ymm8, ymm9, 136
+        vshufps ymm5, ymm8, ymm9, 221
+        vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+        vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+        vshufps ymm6, ymm8, ymm9, 136
+        vshufps ymm7, ymm8, ymm9, 221
+        vpshufd ymm6, ymm6, 0x93
+        vpshufd ymm7, ymm7, 0x93
+        mov     al, 7
+9:
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm0, ymm0, ymm1
+        vpxor   ymm3, ymm3, ymm0
+        vpshufb ymm3, ymm3, ymm14
+        vpaddd  ymm2, ymm2, ymm3
+        vpxor   ymm1, ymm1, ymm2
+        vpsrld  ymm8, ymm1, 12
+        vpslld  ymm1, ymm1, 20
+        vpor    ymm1, ymm1, ymm8
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm0, ymm0, ymm1
+        vpxor   ymm3, ymm3, ymm0
+        vpshufb ymm3, ymm3, ymm15
+        vpaddd  ymm2, ymm2, ymm3
+        vpxor   ymm1, ymm1, ymm2
+        vpsrld  ymm8, ymm1, 7
+        vpslld  ymm1, ymm1, 25
+        vpor    ymm1, ymm1, ymm8
+        vpshufd ymm0, ymm0, 0x93
+        vpshufd ymm3, ymm3, 0x4E
+        vpshufd ymm2, ymm2, 0x39
+        vpaddd  ymm0, ymm0, ymm6
+        vpaddd  ymm0, ymm0, ymm1
+        vpxor   ymm3, ymm3, ymm0
+        vpshufb ymm3, ymm3, ymm14
+        vpaddd  ymm2, ymm2, ymm3
+        vpxor   ymm1, ymm1, ymm2
+        vpsrld  ymm8, ymm1, 12
+        vpslld  ymm1, ymm1, 20
+        vpor    ymm1, ymm1, ymm8
+        vpaddd  ymm0, ymm0, ymm7
+        vpaddd  ymm0, ymm0, ymm1
+        vpxor   ymm3, ymm3, ymm0
+        vpshufb ymm3, ymm3, ymm15
+        vpaddd  ymm2, ymm2, ymm3
+        vpxor   ymm1, ymm1, ymm2
+        vpsrld  ymm8, ymm1, 7
+        vpslld  ymm1, ymm1, 25
+        vpor    ymm1, ymm1, ymm8
+        vpshufd ymm0, ymm0, 0x39
+        vpshufd ymm3, ymm3, 0x4E
+        vpshufd ymm2, ymm2, 0x93
+        dec     al
+        jz      9f
+        vshufps ymm8, ymm4, ymm5, 214
+        vpshufd ymm9, ymm4, 0x0F
+        vpshufd ymm4, ymm8, 0x39
+        vshufps ymm8, ymm6, ymm7, 250
+        vpblendd ymm9, ymm9, ymm8, 0xAA
+        vpunpcklqdq ymm8, ymm7, ymm5
+        vpblendd ymm8, ymm8, ymm6, 0x88
+        vpshufd ymm8, ymm8, 0x78
+        vpunpckhdq ymm5, ymm5, ymm7
+        vpunpckldq ymm6, ymm6, ymm5
+        vpshufd ymm7, ymm6, 0x1E
+        vmovdqa ymm5, ymm9
+        vmovdqa ymm6, ymm8
+        jmp     9b
+9:
+        vpxor   ymm0, ymm0, ymm2
+        vpxor   ymm1, ymm1, ymm3
+        mov     eax, r13d
+        cmp     rdx, r15
+        jne     2b
+        vmovdqu xmmword ptr [rbx], xmm0
+        vmovdqu xmmword ptr [rbx+0x10], xmm1
+        vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+        vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+        vmovaps ymm8, ymmword ptr [rsp+0x280]
+        vmovaps ymm0, ymmword ptr [rsp+0x240]
+        vmovups ymm1, ymmword ptr [rsp+0x248]
+        vmovaps ymm2, ymmword ptr [rsp+0x260]
+        vmovups ymm3, ymmword ptr [rsp+0x268]
+        vblendvps ymm0, ymm0, ymm1, ymm8
+        vblendvps ymm2, ymm2, ymm3, ymm8
+        vmovaps ymmword ptr [rsp+0x240], ymm0
+        vmovaps ymmword ptr [rsp+0x260], ymm2
+        add     rbx, 64
+        add     rdi, 16
+        sub     rsi, 2
+3:
+        test    rsi, 0x1
+        je      4b
+        vmovdqu xmm0, xmmword ptr [rcx]
+        vmovdqu xmm1, xmmword ptr [rcx+0x10]
+        vmovd   xmm3, dword ptr [rsp+0x240]
+        vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1
+        vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        vmovdqa xmm14, xmmword ptr [ROT16+rip]
+        vmovdqa xmm15, xmmword ptr [ROT8+rip]
+        mov     r8, qword ptr [rdi]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
+.p2align  5
+2:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip]
+        vmovdqa xmm3, xmm13
+        vpinsrd xmm3, xmm3, eax, 3
+        vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+        vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+        vshufps xmm4, xmm8, xmm9, 136
+        vshufps xmm5, xmm8, xmm9, 221
+        vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+        vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+        vshufps xmm6, xmm8, xmm9, 136
+        vshufps xmm7, xmm8, xmm9, 221
+        vpshufd xmm6, xmm6, 0x93
+        vpshufd xmm7, xmm7, 0x93
+        mov     al, 7
+9:
+        vpaddd  xmm0, xmm0, xmm4
+        vpaddd  xmm0, xmm0, xmm1
+        vpxor   xmm3, xmm3, xmm0
+        vpshufb xmm3, xmm3, xmm14
+        vpaddd  xmm2, xmm2, xmm3
+        vpxor   xmm1, xmm1, xmm2
+        vpsrld  xmm8, xmm1, 12
+        vpslld  xmm1, xmm1, 20
+        vpor    xmm1, xmm1, xmm8
+        vpaddd  xmm0, xmm0, xmm5
+        vpaddd  xmm0, xmm0, xmm1
+        vpxor   xmm3, xmm3, xmm0
+        vpshufb xmm3, xmm3, xmm15
+        vpaddd  xmm2, xmm2, xmm3
+        vpxor   xmm1, xmm1, xmm2
+        vpsrld  xmm8, xmm1, 7
+        vpslld  xmm1, xmm1, 25
+        vpor    xmm1, xmm1, xmm8
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd  xmm0, xmm0, xmm6
+        vpaddd  xmm0, xmm0, xmm1
+        vpxor   xmm3, xmm3, xmm0
+        vpshufb xmm3, xmm3, xmm14
+        vpaddd  xmm2, xmm2, xmm3
+        vpxor   xmm1, xmm1, xmm2
+        vpsrld  xmm8, xmm1, 12
+        vpslld  xmm1, xmm1, 20
+        vpor    xmm1, xmm1, xmm8
+        vpaddd  xmm0, xmm0, xmm7
+        vpaddd  xmm0, xmm0, xmm1
+        vpxor   xmm3, xmm3, xmm0
+        vpshufb xmm3, xmm3, xmm15
+        vpaddd  xmm2, xmm2, xmm3
+        vpxor   xmm1, xmm1, xmm2
+        vpsrld  xmm8, xmm1, 7
+        vpslld  xmm1, xmm1, 25
+        vpor    xmm1, xmm1, xmm8
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        dec     al
+        jz      9f
+        vshufps xmm8, xmm4, xmm5, 214
+        vpshufd xmm9, xmm4, 0x0F
+        vpshufd xmm4, xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        jmp     9b
+9:
+        vpxor   xmm0, xmm0, xmm2
+        vpxor   xmm1, xmm1, xmm3
+        mov     eax, r13d
+        cmp     rdx, r15
+        jne     2b
+        vmovdqu xmmword ptr [rbx], xmm0
+        vmovdqu xmmword ptr [rbx+0x10], xmm1
+        jmp     4b
+
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align  6
+ADD0:
+        .long  0, 1, 2, 3, 4, 5, 6, 7
+ADD1:
+        .long  8, 8, 8, 8, 8, 8, 8, 8
+BLAKE3_IV_0:
+        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:
+        .long  0x00000040, 0x00000040, 0x00000040, 0x00000040
+        .long  0x00000040, 0x00000040, 0x00000040, 0x00000040
+ROT16:
+        .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8:
+        .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
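+/* ROT16 and ROT8 are vpshufb masks: rotating each 32-bit lane by 16 or 8
+   bits is byte-aligned, so it is done as a single byte shuffle instead of
+   the shift/shift/or sequence needed for the 12- and 7-bit rotations. */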
+CMP_MSB_MASK:
+        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
+        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
+BLAKE3_IV:
+        .long  0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+
diff --git a/src/blake3/blake3_avx512.c b/src/blake3/blake3_avx512.c
new file mode 100644
index 0000000..69f325c
--- /dev/null
+++ b/src/blake3/blake3_avx512.c
@@ -0,0 +1,1220 @@
+#include "blake3.h"
+
+#include <immintrin.h>
+
+#define _mm_shuffle_ps2(a, b, c)                                               \
+  (_mm_castps_si128(                                                           \
+      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
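+
+// The float shuffle is borrowed here because SSE/AVX2 have no two-source
+// 32-bit integer shuffle; the casts are free, and a single shufps picks
+// two lanes from each of its two integer inputs.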
+
+INLINE __m128i loadu_128(const u8 src[16]) {
+  return _mm_loadu_si128((const __m128i *)src);
+}
+
+INLINE __m256i loadu_256(const u8 src[32]) {
+  return _mm256_loadu_si256((const __m256i *)src);
+}
+
+INLINE __m512i loadu_512(const u8 src[64]) {
+  return _mm512_loadu_si512((const __m512i *)src);
+}
+
+INLINE void storeu_128(__m128i src, u8 dest[16]) {
+  _mm_storeu_si128((__m128i *)dest, src);
+}
+
+INLINE void storeu_256(__m256i src, u8 dest[32]) {

+  _mm256_storeu_si256((__m256i *)dest, src);
+}
+
+INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
+
+INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
+
+INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); }
+
+INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
+
+INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
+
+INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); }
+
+INLINE __m128i set1_128(u32 x) { return _mm_set1_epi32((int32_t)x); }
+
+INLINE __m256i set1_256(u32 x) { return _mm256_set1_epi32((int32_t)x); }
+
+INLINE __m512i set1_512(u32 x) { return _mm512_set1_epi32((int32_t)x); }
+
+INLINE __m128i set4(u32 a, u32 b, u32 c, u32 d) {
+  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
+}
+
+INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); }
+
+INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); }
+
+INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); }
+
+INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); }
+
+INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); }
+
+INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); }
+
+INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); }
+
+INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); }
+
+INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); }
+
+INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); }
+
+INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); }
+
+INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); }
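+
+// With AVX-512VL the rotations above are single vprord instructions; the
+// SSE2, SSE4.1 and AVX2 paths have to emulate them with shift/shift/or
+// (or a byte shuffle for the 16- and 8-bit cases).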
+
+/*
+ * ----------------------------------------------------------------------------
+ * compress_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = add_128(add_128(*row0, m), *row1);
+  *row3 = xor_128(*row3, *row0);
+  *row3 = rot16_128(*row3);
+  *row2 = add_128(*row2, *row3);
+  *row1 = xor_128(*row1, *row2);
+  *row1 = rot12_128(*row1);
+}
+
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = add_128(add_128(*row0, m), *row1);
+  *row3 = xor_128(*row3, *row0);
+  *row3 = rot8_128(*row3);
+  *row2 = add_128(*row2, *row3);
+  *row1 = xor_128(*row1, *row2);
+  *row1 = rot7_128(*row1);
+}
+
+// Note the optimization here of leaving row1 as the unrotated row, rather than
+// row0. All the message loads below are adjusted to compensate for this. See
+// discussion at https://github.com/sneves/blake2-avx2/pull/4
+INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
+}
+
+INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
+}
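+
+// Between diagonalize() and undiagonalize(), the g1/g2 calls mix the
+// diagonals of the 4x4 state rather than its columns: the lane rotations
+// bring each diagonal into a single lane position, with row1 left
+// unrotated per the optimization noted above.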
+
+INLINE void compress_pre(__m128i rows[4], const u32 cv[8],
+                         const u8 block[BLAKE3_BLOCK_LEN],
+                         u8 block_len, u64 counter, u8 flags) {
+  rows[0] = loadu_128((u8 *)&cv[0]);
+  rows[1] = loadu_128((u8 *)&cv[4]);
+  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
+  rows[3] = set4(counter_low(counter), counter_high(counter),
+                 (u32)block_len, (u32)flags);
+
+  __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
+  __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
+  __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
+  __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);
+
+  __m128i t0, t1, t2, t3, tt;
+
+  // Round 1. The first round permutes the message words from the original
+  // input order into the groups that get mixed in parallel.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
+  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
+  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 2. This round and all following rounds apply a fixed permutation
+  // to the message words from the round before.
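+  // (That permutation is 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14,
+  // 15, 8; the shuffle/blend sequence below applies it to m0..m3 in the
+  // lane grouping set up by round 1, without a round trip through memory.)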
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 3
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 4
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 5
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 6
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 7
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+}
+
+void blake3_compress_xof_avx512(const u32 cv[8],
+                                const u8 block[BLAKE3_BLOCK_LEN],
+                                u8 block_len, u64 counter,
+                                u8 flags, u8 out[64]) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu_128(xor_128(rows[0], rows[2]), &out[0]);
+  storeu_128(xor_128(rows[1], rows[3]), &out[16]);
+  storeu_128(xor_128(rows[2], loadu_128((u8 *)&cv[0])), &out[32]);
+  storeu_128(xor_128(rows[3], loadu_128((u8 *)&cv[4])), &out[48]);
+}
+
+void blake3_compress_in_place_avx512(u32 cv[8],
+                                     const u8 block[BLAKE3_BLOCK_LEN],
+                                     u8 block_len, u64 counter,
+                                     u8 flags) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu_128(xor_128(rows[0], rows[2]), (u8 *)&cv[0]);
+  storeu_128(xor_128(rows[1], rows[3]), (u8 *)&cv[4]);
+}
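+
+// The xof variant emits a full 64-byte block for extendable output: the
+// lower half is the ordinary compression result, and the upper half xors
+// rows 2 and 3 with the input chaining value. The in_place variant keeps
+// only the 32-byte chaining value.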
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash4_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[15] = rot16_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot12_128(v[4]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[15] = rot8_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot7_128(v[4]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot16_128(v[15]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[4] = rot12_128(v[4]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot8_128(v[15]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+  v[4] = rot7_128(v[4]);
+}
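+
+// round_fn4 runs one BLAKE3 round on four states at once: the first two
+// half-rounds mix the columns, the last two mix the diagonals (hence the
+// rotated v[4..7]/v[12..15] indices), with MSG_SCHEDULE[r] giving the
+// message word order for round r.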
+
+INLINE void transpose_vecs_128(__m128i vecs[4]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
+  // 22/33. Note that this doesn't split the vector into two lanes, as the
+  // AVX2 counterparts do.
+  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+
+  // Interleave 64-bit lanes.
+  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
+  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
+  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
+  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
+
+  vecs[0] = abcd_0;
+  vecs[1] = abcd_1;
+  vecs[2] = abcd_2;
+  vecs[3] = abcd_3;
+}
+
+INLINE void transpose_msg_vecs4(const u8 *const *inputs,
+                                size_t block_offset, __m128i out[16]) {
+  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
+  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
+  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
+  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
+  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
+  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
+  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
+  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
+  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
+  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
+  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
+  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
+  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
+  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
+  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
+  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
+  for (size_t i = 0; i < 4; ++i) {
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
+  }
+  transpose_vecs_128(&out[0]);
+  transpose_vecs_128(&out[4]);
+  transpose_vecs_128(&out[8]);
+  transpose_vecs_128(&out[12]);
+}
+
+INLINE void load_counters4(u64 counter, char increment_counter,
+                           __m128i *out_lo, __m128i *out_hi) {
+  u64 mask = (increment_counter ? ~0 : 0);
+  __m256i mask_vec = _mm256_set1_epi64x(mask);
+  __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
+  deltas = _mm256_and_si256(mask_vec, deltas);
+  __m256i counters =
+      _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
+  *out_lo = _mm256_cvtepi64_epi32(counters);
+  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
+}
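+
+/*
+ * Per lane, load_counters4 is equivalent to this scalar sketch: lane i of
+ * *out_lo and *out_hi receives the low and high 32 bits of the i-th
+ * counter, with the per-lane increment masked off when increment_counter
+ * is zero.
+ *
+ *     for (size_t i = 0; i < 4; i++) {
+ *         u64 c = counter + (increment_counter ? i : 0);
+ *         lo[i] = (u32)c;
+ *         hi[i] = (u32)(c >> 32);
+ *     }
+ */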
+
+static
+void blake3_hash4_avx512(const u8 *const *inputs, size_t blocks,
+                         const u32 key[8], u64 counter,
+                         char increment_counter, u8 flags,
+                         u8 flags_start, u8 flags_end, u8 *out) {
+  __m128i h_vecs[8] = {
+      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
+      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
+  };
+  __m128i counter_low_vec, counter_high_vec;
+  load_counters4(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
+  u8 block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
+    __m128i block_flags_vec = set1_128(block_flags);
+    __m128i msg_vecs[16];
+    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m128i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+    };
+    round_fn4(v, msg_vecs, 0);
+    round_fn4(v, msg_vecs, 1);
+    round_fn4(v, msg_vecs, 2);
+    round_fn4(v, msg_vecs, 3);
+    round_fn4(v, msg_vecs, 4);
+    round_fn4(v, msg_vecs, 5);
+    round_fn4(v, msg_vecs, 6);
+    h_vecs[0] = xor_128(v[0], v[8]);
+    h_vecs[1] = xor_128(v[1], v[9]);
+    h_vecs[2] = xor_128(v[2], v[10]);
+    h_vecs[3] = xor_128(v[3], v[11]);
+    h_vecs[4] = xor_128(v[4], v[12]);
+    h_vecs[5] = xor_128(v[5], v[13]);
+    h_vecs[6] = xor_128(v[6], v[14]);
+    h_vecs[7] = xor_128(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs_128(&h_vecs[0]);
+  transpose_vecs_128(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
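+  // Concretely, h_vecs[i] now holds words 0..3 of output i and h_vecs[4+i]
+  // holds words 4..7, so each pair of stores below writes one output's 32
+  // bytes contiguously.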
+  storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
+  storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
+  storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
+  storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
+  storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
+  storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
+  storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
+  storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash8_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
+  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_256(v[0], v[4]);
+  v[1] = add_256(v[1], v[5]);
+  v[2] = add_256(v[2], v[6]);
+  v[3] = add_256(v[3], v[7]);
+  v[12] = xor_256(v[12], v[0]);
+  v[13] = xor_256(v[13], v[1]);
+  v[14] = xor_256(v[14], v[2]);
+  v[15] = xor_256(v[15], v[3]);
+  v[12] = rot16_256(v[12]);
+  v[13] = rot16_256(v[13]);
+  v[14] = rot16_256(v[14]);
+  v[15] = rot16_256(v[15]);
+  v[8] = add_256(v[8], v[12]);
+  v[9] = add_256(v[9], v[13]);
+  v[10] = add_256(v[10], v[14]);
+  v[11] = add_256(v[11], v[15]);
+  v[4] = xor_256(v[4], v[8]);
+  v[5] = xor_256(v[5], v[9]);
+  v[6] = xor_256(v[6], v[10]);
+  v[7] = xor_256(v[7], v[11]);
+  v[4] = rot12_256(v[4]);
+  v[5] = rot12_256(v[5]);
+  v[6] = rot12_256(v[6]);
+  v[7] = rot12_256(v[7]);
+  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_256(v[0], v[4]);
+  v[1] = add_256(v[1], v[5]);
+  v[2] = add_256(v[2], v[6]);
+  v[3] = add_256(v[3], v[7]);
+  v[12] = xor_256(v[12], v[0]);
+  v[13] = xor_256(v[13], v[1]);
+  v[14] = xor_256(v[14], v[2]);
+  v[15] = xor_256(v[15], v[3]);
+  v[12] = rot8_256(v[12]);
+  v[13] = rot8_256(v[13]);
+  v[14] = rot8_256(v[14]);
+  v[15] = rot8_256(v[15]);
+  v[8] = add_256(v[8], v[12]);
+  v[9] = add_256(v[9], v[13]);
+  v[10] = add_256(v[10], v[14]);
+  v[11] = add_256(v[11], v[15]);
+  v[4] = xor_256(v[4], v[8]);
+  v[5] = xor_256(v[5], v[9]);
+  v[6] = xor_256(v[6], v[10]);
+  v[7] = xor_256(v[7], v[11]);
+  v[4] = rot7_256(v[4]);
+  v[5] = rot7_256(v[5]);
+  v[6] = rot7_256(v[6]);
+  v[7] = rot7_256(v[7]);
+
+  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_256(v[0], v[5]);
+  v[1] = add_256(v[1], v[6]);
+  v[2] = add_256(v[2], v[7]);
+  v[3] = add_256(v[3], v[4]);
+  v[15] = xor_256(v[15], v[0]);
+  v[12] = xor_256(v[12], v[1]);
+  v[13] = xor_256(v[13], v[2]);
+  v[14] = xor_256(v[14], v[3]);
+  v[15] = rot16_256(v[15]);
+  v[12] = rot16_256(v[12]);
+  v[13] = rot16_256(v[13]);
+  v[14] = rot16_256(v[14]);
+  v[10] = add_256(v[10], v[15]);
+  v[11] = add_256(v[11], v[12]);
+  v[8] = add_256(v[8], v[13]);
+  v[9] = add_256(v[9], v[14]);
+  v[5] = xor_256(v[5], v[10]);
+  v[6] = xor_256(v[6], v[11]);
+  v[7] = xor_256(v[7], v[8]);
+  v[4] = xor_256(v[4], v[9]);
+  v[5] = rot12_256(v[5]);
+  v[6] = rot12_256(v[6]);
+  v[7] = rot12_256(v[7]);
+  v[4] = rot12_256(v[4]);
+  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_256(v[0], v[5]);
+  v[1] = add_256(v[1], v[6]);
+  v[2] = add_256(v[2], v[7]);
+  v[3] = add_256(v[3], v[4]);
+  v[15] = xor_256(v[15], v[0]);
+  v[12] = xor_256(v[12], v[1]);
+  v[13] = xor_256(v[13], v[2]);
+  v[14] = xor_256(v[14], v[3]);
+  v[15] = rot8_256(v[15]);
+  v[12] = rot8_256(v[12]);
+  v[13] = rot8_256(v[13]);
+  v[14] = rot8_256(v[14]);
+  v[10] = add_256(v[10], v[15]);
+  v[11] = add_256(v[11], v[12]);
+  v[8] = add_256(v[8], v[13]);
+  v[9] = add_256(v[9], v[14]);
+  v[5] = xor_256(v[5], v[10]);
+  v[6] = xor_256(v[6], v[11]);
+  v[7] = xor_256(v[7], v[8]);
+  v[4] = xor_256(v[4], v[9]);
+  v[5] = rot7_256(v[5]);
+  v[6] = rot7_256(v[6]);
+  v[7] = rot7_256(v[7]);
+  v[4] = rot7_256(v[4]);
+}
+
+INLINE void transpose_vecs_256(__m256i vecs[8]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
+  // is 22/33/66/77.
+  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
+  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
+  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
+  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
+  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
+  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
+  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
+  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
+
+  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
+  // 11/33.
+  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
+  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
+  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
+  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
+  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
+  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
+  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
+  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
+
+  // Interleave 128-bit lanes.
+  vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
+  vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
+  vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
+  vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
+  vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
+  vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
+  vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
+  vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
+}
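+
+/*
+ * Note that the AVX2 unpacks above operate independently on the two 128-bit
+ * halves of each register, so they only transpose within those halves; the
+ * final _mm256_permute2x128_si256 pass exchanges halves across registers to
+ * complete the 8x8 transpose.
+ */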
+
+INLINE void transpose_msg_vecs8(const u8 *const *inputs,
+                                size_t block_offset, __m256i out[16]) {
+  out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
+  out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
+  out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
+  out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
+  out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
+  out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
+  out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
+  out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
+  out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
+  out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
+  out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
+  out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
+  out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
+  out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
+  out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
+  out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
+  for (size_t i = 0; i < 8; ++i) {
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
+  }
+  transpose_vecs_256(&out[0]);
+  transpose_vecs_256(&out[8]);
+}
+
+INLINE void load_counters8(u64 counter, char increment_counter,
+                           __m256i *out_lo, __m256i *out_hi) {
+  u64 mask = (increment_counter ? ~0 : 0);
+  __m512i mask_vec = _mm512_set1_epi64(mask);
+  __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  deltas = _mm512_and_si512(mask_vec, deltas);
+  __m512i counters =
+      _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
+  *out_lo = _mm512_cvtepi64_epi32(counters);
+  *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
+}
+
+static
+void blake3_hash8_avx512(const u8 *const *inputs, size_t blocks,
+                         const u32 key[8], u64 counter,
+                         char increment_counter, u8 flags,
+                         u8 flags_start, u8 flags_end, u8 *out) {
+  __m256i h_vecs[8] = {
+      set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
+      set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
+  };
+  __m256i counter_low_vec, counter_high_vec;
+  load_counters8(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
+  u8 block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
+    __m256i block_flags_vec = set1_256(block_flags);
+    __m256i msg_vecs[16];
+    transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m256i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+    };
+    round_fn8(v, msg_vecs, 0);
+    round_fn8(v, msg_vecs, 1);
+    round_fn8(v, msg_vecs, 2);
+    round_fn8(v, msg_vecs, 3);
+    round_fn8(v, msg_vecs, 4);
+    round_fn8(v, msg_vecs, 5);
+    round_fn8(v, msg_vecs, 6);
+    h_vecs[0] = xor_256(v[0], v[8]);
+    h_vecs[1] = xor_256(v[1], v[9]);
+    h_vecs[2] = xor_256(v[2], v[10]);
+    h_vecs[3] = xor_256(v[3], v[11]);
+    h_vecs[4] = xor_256(v[4], v[12]);
+    h_vecs[5] = xor_256(v[5], v[13]);
+    h_vecs[6] = xor_256(v[6], v[14]);
+    h_vecs[7] = xor_256(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs_256(h_vecs);
+  storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
+  storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
+  storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
+  storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
+  storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
+  storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
+  storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
+  storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash16_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) {
+  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_512(v[0], v[4]);
+  v[1] = add_512(v[1], v[5]);
+  v[2] = add_512(v[2], v[6]);
+  v[3] = add_512(v[3], v[7]);
+  v[12] = xor_512(v[12], v[0]);
+  v[13] = xor_512(v[13], v[1]);
+  v[14] = xor_512(v[14], v[2]);
+  v[15] = xor_512(v[15], v[3]);
+  v[12] = rot16_512(v[12]);
+  v[13] = rot16_512(v[13]);
+  v[14] = rot16_512(v[14]);
+  v[15] = rot16_512(v[15]);
+  v[8] = add_512(v[8], v[12]);
+  v[9] = add_512(v[9], v[13]);
+  v[10] = add_512(v[10], v[14]);
+  v[11] = add_512(v[11], v[15]);
+  v[4] = xor_512(v[4], v[8]);
+  v[5] = xor_512(v[5], v[9]);
+  v[6] = xor_512(v[6], v[10]);
+  v[7] = xor_512(v[7], v[11]);
+  v[4] = rot12_512(v[4]);
+  v[5] = rot12_512(v[5]);
+  v[6] = rot12_512(v[6]);
+  v[7] = rot12_512(v[7]);
+  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_512(v[0], v[4]);
+  v[1] = add_512(v[1], v[5]);
+  v[2] = add_512(v[2], v[6]);
+  v[3] = add_512(v[3], v[7]);
+  v[12] = xor_512(v[12], v[0]);
+  v[13] = xor_512(v[13], v[1]);
+  v[14] = xor_512(v[14], v[2]);
+  v[15] = xor_512(v[15], v[3]);
+  v[12] = rot8_512(v[12]);
+  v[13] = rot8_512(v[13]);
+  v[14] = rot8_512(v[14]);
+  v[15] = rot8_512(v[15]);
+  v[8] = add_512(v[8], v[12]);
+  v[9] = add_512(v[9], v[13]);
+  v[10] = add_512(v[10], v[14]);
+  v[11] = add_512(v[11], v[15]);
+  v[4] = xor_512(v[4], v[8]);
+  v[5] = xor_512(v[5], v[9]);
+  v[6] = xor_512(v[6], v[10]);
+  v[7] = xor_512(v[7], v[11]);
+  v[4] = rot7_512(v[4]);
+  v[5] = rot7_512(v[5]);
+  v[6] = rot7_512(v[6]);
+  v[7] = rot7_512(v[7]);
+
+  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_512(v[0], v[5]);
+  v[1] = add_512(v[1], v[6]);
+  v[2] = add_512(v[2], v[7]);
+  v[3] = add_512(v[3], v[4]);
+  v[15] = xor_512(v[15], v[0]);
+  v[12] = xor_512(v[12], v[1]);
+  v[13] = xor_512(v[13], v[2]);
+  v[14] = xor_512(v[14], v[3]);
+  v[15] = rot16_512(v[15]);
+  v[12] = rot16_512(v[12]);
+  v[13] = rot16_512(v[13]);
+  v[14] = rot16_512(v[14]);
+  v[10] = add_512(v[10], v[15]);
+  v[11] = add_512(v[11], v[12]);
+  v[8] = add_512(v[8], v[13]);
+  v[9] = add_512(v[9], v[14]);
+  v[5] = xor_512(v[5], v[10]);
+  v[6] = xor_512(v[6], v[11]);
+  v[7] = xor_512(v[7], v[8]);
+  v[4] = xor_512(v[4], v[9]);
+  v[5] = rot12_512(v[5]);
+  v[6] = rot12_512(v[6]);
+  v[7] = rot12_512(v[7]);
+  v[4] = rot12_512(v[4]);
+  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_512(v[0], v[5]);
+  v[1] = add_512(v[1], v[6]);
+  v[2] = add_512(v[2], v[7]);
+  v[3] = add_512(v[3], v[4]);
+  v[15] = xor_512(v[15], v[0]);
+  v[12] = xor_512(v[12], v[1]);
+  v[13] = xor_512(v[13], v[2]);
+  v[14] = xor_512(v[14], v[3]);
+  v[15] = rot8_512(v[15]);
+  v[12] = rot8_512(v[12]);
+  v[13] = rot8_512(v[13]);
+  v[14] = rot8_512(v[14]);
+  v[10] = add_512(v[10], v[15]);
+  v[11] = add_512(v[11], v[12]);
+  v[8] = add_512(v[8], v[13]);
+  v[9] = add_512(v[9], v[14]);
+  v[5] = xor_512(v[5], v[10]);
+  v[6] = xor_512(v[6], v[11]);
+  v[7] = xor_512(v[7], v[8]);
+  v[4] = xor_512(v[4], v[9]);
+  v[5] = rot7_512(v[5]);
+  v[6] = rot7_512(v[6]);
+  v[7] = rot7_512(v[7]);
+  v[4] = rot7_512(v[4]);
+}
+
+// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order
+#define LO_IMM8 0x88
+
+INLINE __m512i unpack_lo_128(__m512i a, __m512i b) {
+  return _mm512_shuffle_i32x4(a, b, LO_IMM8);
+}
+
+// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order
+#define HI_IMM8 0xdd
+
+INLINE __m512i unpack_hi_128(__m512i a, __m512i b) {
+  return _mm512_shuffle_i32x4(a, b, HI_IMM8);
+}
+
+INLINE void transpose_vecs_512(__m512i vecs[16]) {
+  // Interleave 32-bit lanes. The _0 unpack is lanes
+  // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
+  // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
+  __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
+  __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
+  __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
+  __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
+  __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
+  __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
+  __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
+  __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
+  __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
+  __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
+  __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
+  __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
+  __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
+  __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
+  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
+  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
+
+  // Interleave 64-bit lanes. The _0 unpack is lanes
+  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
+  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
+  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
+  // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
+  __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
+  __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
+  __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
+  __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
+  __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
+  __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
+  __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
+  __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
+  __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
+  __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
+  __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
+  __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
+  __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
+  __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
+  __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
+  __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
+
+  // Interleave 128-bit lanes. The _0 unpack is
+  // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
+  // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
+  __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
+  __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
+  __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
+  __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
+  __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
+  __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
+  __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
+  __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
+  __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
+  __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
+  __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
+  __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
+  __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
+  __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
+  __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
+  __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);
+
+  // Interleave 128-bit lanes again for the final outputs.
+  vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
+  vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
+  vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
+  vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
+  vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
+  vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
+  vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
+  vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
+  vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
+  vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
+  vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
+  vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
+  vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
+  vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
+  vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
+  vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
+}
+
+INLINE void transpose_msg_vecs16(const u8 *const *inputs,
+                                 size_t block_offset, __m512i out[16]) {
+  out[0] = loadu_512(&inputs[0][block_offset]);
+  out[1] = loadu_512(&inputs[1][block_offset]);
+  out[2] = loadu_512(&inputs[2][block_offset]);
+  out[3] = loadu_512(&inputs[3][block_offset]);
+  out[4] = loadu_512(&inputs[4][block_offset]);
+  out[5] = loadu_512(&inputs[5][block_offset]);
+  out[6] = loadu_512(&inputs[6][block_offset]);
+  out[7] = loadu_512(&inputs[7][block_offset]);
+  out[8] = loadu_512(&inputs[8][block_offset]);
+  out[9] = loadu_512(&inputs[9][block_offset]);
+  out[10] = loadu_512(&inputs[10][block_offset]);
+  out[11] = loadu_512(&inputs[11][block_offset]);
+  out[12] = loadu_512(&inputs[12][block_offset]);
+  out[13] = loadu_512(&inputs[13][block_offset]);
+  out[14] = loadu_512(&inputs[14][block_offset]);
+  out[15] = loadu_512(&inputs[15][block_offset]);
+  for (size_t i = 0; i < 16; ++i) {
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
+  }
+  transpose_vecs_512(out);
+}
+
+INLINE void load_counters16(u64 counter, char increment_counter,
+                            __m512i *out_lo, __m512i *out_hi) {
+  const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
+  const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
+  const __m512i low_words = _mm512_add_epi32(
+    _mm512_set1_epi32((int32_t)counter),
+    masked_deltas);
+  // The carry bit is 1 if the high bit of the word was 1 before the addition
+  // and 0 after it.
+  // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
+  // compute the carry bits here, and originally we did, but that intrinsic is
+  // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
+  const __m512i carries = _mm512_srli_epi32(
+    _mm512_andnot_si512(
+        low_words, // 0 after (gets inverted by andnot)
+        _mm512_set1_epi32((int32_t)counter)), // and 1 before
+    31);
+  const __m512i high_words = _mm512_add_epi32(
+    _mm512_set1_epi32((int32_t)(counter >> 32)),
+    carries);
+  *out_lo = low_words;
+  *out_hi = high_words;
+}
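+
+/*
+ * A quick sanity check of the carry computation: with counter =
+ * 0x00000000FFFFFFFF and delta 1, the low word wraps from 0xFFFFFFFF to 0,
+ * so andnot(low_after, low_before) is 0xFFFFFFFF, and shifting its top bit
+ * down by 31 yields a carry of 1 into the high word, as expected.
+ */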
+
+static
+void blake3_hash16_avx512(const u8 *const *inputs, size_t blocks,
+                          const u32 key[8], u64 counter,
+                          char increment_counter, u8 flags,
+                          u8 flags_start, u8 flags_end,
+                          u8 *out) {
+  __m512i h_vecs[8] = {
+      set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
+      set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
+  };
+  __m512i counter_low_vec, counter_high_vec;
+  load_counters16(counter, increment_counter, &counter_low_vec,
+                  &counter_high_vec);
+  u8 block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN);
+    __m512i block_flags_vec = set1_512(block_flags);
+    __m512i msg_vecs[16];
+    transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m512i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_512(IV[0]), set1_512(IV[1]),  set1_512(IV[2]), set1_512(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
+    };
+    round_fn16(v, msg_vecs, 0);
+    round_fn16(v, msg_vecs, 1);
+    round_fn16(v, msg_vecs, 2);
+    round_fn16(v, msg_vecs, 3);
+    round_fn16(v, msg_vecs, 4);
+    round_fn16(v, msg_vecs, 5);
+    round_fn16(v, msg_vecs, 6);
+    h_vecs[0] = xor_512(v[0], v[8]);
+    h_vecs[1] = xor_512(v[1], v[9]);
+    h_vecs[2] = xor_512(v[2], v[10]);
+    h_vecs[3] = xor_512(v[3], v[11]);
+    h_vecs[4] = xor_512(v[4], v[12]);
+    h_vecs[5] = xor_512(v[5], v[13]);
+    h_vecs[6] = xor_512(v[6], v[14]);
+    h_vecs[7] = xor_512(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
+  // state vectors. Pad the matrix with zeros. After transposition, store the
+  // lower half of each vector.
+  __m512i padded[16] = {
+      h_vecs[0],   h_vecs[1],   h_vecs[2],   h_vecs[3],
+      h_vecs[4],   h_vecs[5],   h_vecs[6],   h_vecs[7],
+      set1_512(0), set1_512(0), set1_512(0), set1_512(0),
+      set1_512(0), set1_512(0), set1_512(0), set1_512(0),
+  };
+  transpose_vecs_512(padded);
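+  // With an all-ones mask, each masked store below is simply an unaligned
+  // 32-byte store of the low 256 bits of one transposed row, i.e. one
+  // 8-word output.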
+  _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
+  _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
+  _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
+  _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
+  _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
+  _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
+  _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
+  _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
+  _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
+  _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
+  _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
+  _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
+  _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
+  _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
+  _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
+  _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash_many_avx512
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void hash_one_avx512(const u8 *input, size_t blocks,
+                            const u32 key[8], u64 counter,
+                            u8 flags, u8 flags_start,
+                            u8 flags_end, u8 out[BLAKE3_OUT_LEN]) {
+  u32 cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  u8 block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                    block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
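+
+/*
+ * blake3_hash_many_avx512 peels inputs off in batches of 16, 8 and 4,
+ * finishing any leftovers one at a time with the serial loop above. The
+ * counter advances by the batch width between batches only when
+ * increment_counter is set, matching the per-lane masking in the
+ * load_counters* helpers.
+ */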
+
+void blake3_hash_many_avx512(const u8 *const *inputs, size_t num_inputs,
+                             size_t blocks, const u32 key[8],
+                             u64 counter, char increment_counter,
+                             u8 flags, u8 flags_start,
+                             u8 flags_end, u8 *out) {
+  while (num_inputs >= 16) {
+    blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
+                         flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 16;
+    }
+    inputs += 16;
+    num_inputs -= 16;
+    out = &out[16 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs >= 8) {
+    blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
+                        flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 8;
+    }
+    inputs += 8;
+    num_inputs -= 8;
+    out = &out[8 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs >= 4) {
+    blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
+                        flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 4;
+    }
+    inputs += 4;
+    num_inputs -= 4;
+    out = &out[4 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
+                    flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/src/blake3/blake3_avx512_x86-64_unix.S b/src/blake3/blake3_avx512_x86-64_unix.S
new file mode 100644
index 0000000..a06aede
--- /dev/null
+++ b/src/blake3/blake3_avx512_x86-64_unix.S
@@ -0,0 +1,2585 @@
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global _blake3_hash_many_avx512
+.global blake3_hash_many_avx512
+.global blake3_compress_in_place_avx512
+.global _blake3_compress_in_place_avx512
+.global blake3_compress_xof_avx512
+.global _blake3_compress_xof_avx512
+
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
+.p2align  6
+_blake3_hash_many_avx512:
+blake3_hash_many_avx512:
+        _CET_ENDBR
+        push    r15
+        push    r14
+        push    r13
+        push    r12
+        push    rbx
+        push    rbp
+        mov     rbp, rsp
+        sub     rsp, 144
+        and     rsp, 0xFFFFFFFFFFFFFFC0
+        neg     r9
+        kmovw   k1, r9d
+        vmovd   xmm0, r8d
+        vpbroadcastd ymm0, xmm0
+        shr     r8, 32
+        vmovd   xmm1, r8d
+        vpbroadcastd ymm1, xmm1
+        vmovdqa ymm4, ymm1
+        vmovdqa ymm5, ymm1
+        vpaddd  ymm2, ymm0, ymmword ptr [ADD0+rip]
+        vpaddd  ymm3, ymm0, ymmword ptr [ADD0+32+rip]
+        vpcmpltud k2, ymm2, ymm0
+        vpcmpltud k3, ymm3, ymm0
+        vpaddd  ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
+        vpaddd  ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
+        knotw   k2, k1
+        vmovdqa32 ymm2 {k2}, ymm0
+        vmovdqa32 ymm3 {k2}, ymm0
+        vmovdqa32 ymm4 {k2}, ymm1
+        vmovdqa32 ymm5 {k2}, ymm1
+        vmovdqa ymmword ptr [rsp], ymm2
+        vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3
+        vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4
+        vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5
+        shl     rdx, 6
+        mov     qword ptr [rsp+0x80], rdx
+        cmp     rsi, 16
+        jc      3f
+2:
+        vpbroadcastd zmm0, dword ptr [rcx]
+        vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
+        vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
+        vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4]
+        vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4]
+        vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4]
+        vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4]
+        vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4]
+        movzx   eax, byte ptr [rbp+0x38]
+        movzx   ebx, byte ptr [rbp+0x40]
+        or      eax, ebx
+        xor     edx, edx
+.p2align 5
+9:
+        movzx   ebx, byte ptr [rbp+0x48]
+        or      ebx, eax
+        add     rdx, 64
+        cmp     rdx, qword ptr [rsp+0x80]
+        cmove   eax, ebx
+        mov     dword ptr [rsp+0x88], eax
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        mov     r10, qword ptr [rdi+0x10]
+        mov     r11, qword ptr [rdi+0x18]
+        mov     r12, qword ptr [rdi+0x40]
+        mov     r13, qword ptr [rdi+0x48]
+        mov     r14, qword ptr [rdi+0x50]
+        mov     r15, qword ptr [rdi+0x58]
+        vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+        vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+        vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+        vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+        vpunpcklqdq zmm8, zmm16, zmm17
+        vpunpckhqdq zmm9, zmm16, zmm17
+        vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+        vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+        vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+        vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+        vpunpcklqdq zmm10, zmm18, zmm19
+        vpunpckhqdq zmm11, zmm18, zmm19
+        mov     r8, qword ptr [rdi+0x20]
+        mov     r9, qword ptr [rdi+0x28]
+        mov     r10, qword ptr [rdi+0x30]
+        mov     r11, qword ptr [rdi+0x38]
+        mov     r12, qword ptr [rdi+0x60]
+        mov     r13, qword ptr [rdi+0x68]
+        mov     r14, qword ptr [rdi+0x70]
+        mov     r15, qword ptr [rdi+0x78]
+        vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+        vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+        vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+        vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+        vpunpcklqdq zmm12, zmm16, zmm17
+        vpunpckhqdq zmm13, zmm16, zmm17
+        vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+        vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+        vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+        vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+        vpunpcklqdq zmm14, zmm18, zmm19
+        vpunpckhqdq zmm15, zmm18, zmm19
+        vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
+        vmovdqa32 zmm31, zmmword ptr [INDEX1+rip]
+        vshufps zmm16, zmm8, zmm10, 136
+        vshufps zmm17, zmm12, zmm14, 136
+        vmovdqa32 zmm20, zmm16
+        vpermt2d zmm16, zmm27, zmm17
+        vpermt2d zmm20, zmm31, zmm17
+        vshufps zmm17, zmm8, zmm10, 221
+        vshufps zmm30, zmm12, zmm14, 221
+        vmovdqa32 zmm21, zmm17
+        vpermt2d zmm17, zmm27, zmm30
+        vpermt2d zmm21, zmm31, zmm30
+        vshufps zmm18, zmm9, zmm11, 136
+        vshufps zmm8, zmm13, zmm15, 136
+        vmovdqa32 zmm22, zmm18
+        vpermt2d zmm18, zmm27, zmm8
+        vpermt2d zmm22, zmm31, zmm8
+        vshufps zmm19, zmm9, zmm11, 221
+        vshufps zmm8, zmm13, zmm15, 221
+        vmovdqa32 zmm23, zmm19
+        vpermt2d zmm19, zmm27, zmm8
+        vpermt2d zmm23, zmm31, zmm8
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        mov     r10, qword ptr [rdi+0x10]
+        mov     r11, qword ptr [rdi+0x18]
+        mov     r12, qword ptr [rdi+0x40]
+        mov     r13, qword ptr [rdi+0x48]
+        mov     r14, qword ptr [rdi+0x50]
+        mov     r15, qword ptr [rdi+0x58]
+        vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+        vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+        vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+        vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+        vpunpcklqdq zmm8, zmm24, zmm25
+        vpunpckhqdq zmm9, zmm24, zmm25
+        vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+        vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+        vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+        vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+        vpunpcklqdq zmm10, zmm24, zmm25
+        vpunpckhqdq zmm11, zmm24, zmm25
+        prefetcht0 [r8+rdx+0x80]
+        prefetcht0 [r12+rdx+0x80]
+        prefetcht0 [r9+rdx+0x80]
+        prefetcht0 [r13+rdx+0x80]
+        prefetcht0 [r10+rdx+0x80]
+        prefetcht0 [r14+rdx+0x80]
+        prefetcht0 [r11+rdx+0x80]
+        prefetcht0 [r15+rdx+0x80]
+        mov     r8, qword ptr [rdi+0x20]
+        mov     r9, qword ptr [rdi+0x28]
+        mov     r10, qword ptr [rdi+0x30]
+        mov     r11, qword ptr [rdi+0x38]
+        mov     r12, qword ptr [rdi+0x60]
+        mov     r13, qword ptr [rdi+0x68]
+        mov     r14, qword ptr [rdi+0x70]
+        mov     r15, qword ptr [rdi+0x78]
+        vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+        vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+        vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+        vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+        vpunpcklqdq zmm12, zmm24, zmm25
+        vpunpckhqdq zmm13, zmm24, zmm25
+        vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+        vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+        vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+        vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+        vpunpcklqdq zmm14, zmm24, zmm25
+        vpunpckhqdq zmm15, zmm24, zmm25
+        prefetcht0 [r8+rdx+0x80]
+        prefetcht0 [r12+rdx+0x80]
+        prefetcht0 [r9+rdx+0x80]
+        prefetcht0 [r13+rdx+0x80]
+        prefetcht0 [r10+rdx+0x80]
+        prefetcht0 [r14+rdx+0x80]
+        prefetcht0 [r11+rdx+0x80]
+        prefetcht0 [r15+rdx+0x80]
+        vshufps zmm24, zmm8, zmm10, 136
+        vshufps zmm30, zmm12, zmm14, 136
+        vmovdqa32 zmm28, zmm24
+        vpermt2d zmm24, zmm27, zmm30
+        vpermt2d zmm28, zmm31, zmm30
+        vshufps zmm25, zmm8, zmm10, 221
+        vshufps zmm30, zmm12, zmm14, 221
+        vmovdqa32 zmm29, zmm25
+        vpermt2d zmm25, zmm27, zmm30
+        vpermt2d zmm29, zmm31, zmm30
+        vshufps zmm26, zmm9, zmm11, 136
+        vshufps zmm8, zmm13, zmm15, 136
+        vmovdqa32 zmm30, zmm26
+        vpermt2d zmm26, zmm27, zmm8
+        vpermt2d zmm30, zmm31, zmm8
+        vshufps zmm8, zmm9, zmm11, 221
+        vshufps zmm10, zmm13, zmm15, 221
+        vpermi2d zmm27, zmm8, zmm10
+        vpermi2d zmm31, zmm8, zmm10
+        vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip]
+        vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip]
+        vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip]
+        vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip]
+        vmovdqa32 zmm12, zmmword ptr [rsp]
+        vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40]
+        vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+        vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4]
+        vpaddd  zmm0, zmm0, zmm16
+        vpaddd  zmm1, zmm1, zmm18
+        vpaddd  zmm2, zmm2, zmm20
+        vpaddd  zmm3, zmm3, zmm22
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vprord  zmm15, zmm15, 16
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 12
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vpaddd  zmm0, zmm0, zmm17
+        vpaddd  zmm1, zmm1, zmm19
+        vpaddd  zmm2, zmm2, zmm21
+        vpaddd  zmm3, zmm3, zmm23
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vprord  zmm15, zmm15, 8
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 7
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vpaddd  zmm0, zmm0, zmm24
+        vpaddd  zmm1, zmm1, zmm26
+        vpaddd  zmm2, zmm2, zmm28
+        vpaddd  zmm3, zmm3, zmm30
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 16
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vprord  zmm4, zmm4, 12
+        vpaddd  zmm0, zmm0, zmm25
+        vpaddd  zmm1, zmm1, zmm27
+        vpaddd  zmm2, zmm2, zmm29
+        vpaddd  zmm3, zmm3, zmm31
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 8
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vprord  zmm4, zmm4, 7
+        vpaddd  zmm0, zmm0, zmm18
+        vpaddd  zmm1, zmm1, zmm19
+        vpaddd  zmm2, zmm2, zmm23
+        vpaddd  zmm3, zmm3, zmm20
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vprord  zmm15, zmm15, 16
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 12
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vpaddd  zmm0, zmm0, zmm22
+        vpaddd  zmm1, zmm1, zmm26
+        vpaddd  zmm2, zmm2, zmm16
+        vpaddd  zmm3, zmm3, zmm29
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vprord  zmm15, zmm15, 8
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 7
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vpaddd  zmm0, zmm0, zmm17
+        vpaddd  zmm1, zmm1, zmm28
+        vpaddd  zmm2, zmm2, zmm25
+        vpaddd  zmm3, zmm3, zmm31
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 16
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vprord  zmm4, zmm4, 12
+        vpaddd  zmm0, zmm0, zmm27
+        vpaddd  zmm1, zmm1, zmm21
+        vpaddd  zmm2, zmm2, zmm30
+        vpaddd  zmm3, zmm3, zmm24
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 8
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vprord  zmm4, zmm4, 7
+        vpaddd  zmm0, zmm0, zmm19
+        vpaddd  zmm1, zmm1, zmm26
+        vpaddd  zmm2, zmm2, zmm29
+        vpaddd  zmm3, zmm3, zmm23
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vprord  zmm15, zmm15, 16
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 12
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vpaddd  zmm0, zmm0, zmm20
+        vpaddd  zmm1, zmm1, zmm28
+        vpaddd  zmm2, zmm2, zmm18
+        vpaddd  zmm3, zmm3, zmm30
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vprord  zmm15, zmm15, 8
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 7
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vpaddd  zmm0, zmm0, zmm22
+        vpaddd  zmm1, zmm1, zmm25
+        vpaddd  zmm2, zmm2, zmm27
+        vpaddd  zmm3, zmm3, zmm24
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 16
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vprord  zmm4, zmm4, 12
+        vpaddd  zmm0, zmm0, zmm21
+        vpaddd  zmm1, zmm1, zmm16
+        vpaddd  zmm2, zmm2, zmm31
+        vpaddd  zmm3, zmm3, zmm17
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 8
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vprord  zmm4, zmm4, 7
+        vpaddd  zmm0, zmm0, zmm26
+        vpaddd  zmm1, zmm1, zmm28
+        vpaddd  zmm2, zmm2, zmm30
+        vpaddd  zmm3, zmm3, zmm29
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vprord  zmm15, zmm15, 16
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 12
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vpaddd  zmm0, zmm0, zmm23
+        vpaddd  zmm1, zmm1, zmm25
+        vpaddd  zmm2, zmm2, zmm19
+        vpaddd  zmm3, zmm3, zmm31
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vprord  zmm15, zmm15, 8
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 7
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vpaddd  zmm0, zmm0, zmm20
+        vpaddd  zmm1, zmm1, zmm27
+        vpaddd  zmm2, zmm2, zmm21
+        vpaddd  zmm3, zmm3, zmm17
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 16
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vprord  zmm4, zmm4, 12
+        vpaddd  zmm0, zmm0, zmm16
+        vpaddd  zmm1, zmm1, zmm18
+        vpaddd  zmm2, zmm2, zmm24
+        vpaddd  zmm3, zmm3, zmm22
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 8
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vprord  zmm4, zmm4, 7
+        vpaddd  zmm0, zmm0, zmm28
+        vpaddd  zmm1, zmm1, zmm25
+        vpaddd  zmm2, zmm2, zmm31
+        vpaddd  zmm3, zmm3, zmm30
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vprord  zmm15, zmm15, 16
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 12
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vpaddd  zmm0, zmm0, zmm29
+        vpaddd  zmm1, zmm1, zmm27
+        vpaddd  zmm2, zmm2, zmm26
+        vpaddd  zmm3, zmm3, zmm24
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vprord  zmm15, zmm15, 8
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 7
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vpaddd  zmm0, zmm0, zmm23
+        vpaddd  zmm1, zmm1, zmm21
+        vpaddd  zmm2, zmm2, zmm16
+        vpaddd  zmm3, zmm3, zmm22
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 16
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vprord  zmm4, zmm4, 12
+        vpaddd  zmm0, zmm0, zmm18
+        vpaddd  zmm1, zmm1, zmm19
+        vpaddd  zmm2, zmm2, zmm17
+        vpaddd  zmm3, zmm3, zmm20
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 8
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vprord  zmm4, zmm4, 7
+        vpaddd  zmm0, zmm0, zmm25
+        vpaddd  zmm1, zmm1, zmm27
+        vpaddd  zmm2, zmm2, zmm24
+        vpaddd  zmm3, zmm3, zmm31
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vprord  zmm15, zmm15, 16
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 12
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vpaddd  zmm0, zmm0, zmm30
+        vpaddd  zmm1, zmm1, zmm21
+        vpaddd  zmm2, zmm2, zmm28
+        vpaddd  zmm3, zmm3, zmm17
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vprord  zmm15, zmm15, 8
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 7
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vpaddd  zmm0, zmm0, zmm29
+        vpaddd  zmm1, zmm1, zmm16
+        vpaddd  zmm2, zmm2, zmm18
+        vpaddd  zmm3, zmm3, zmm20
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 16
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vprord  zmm4, zmm4, 12
+        vpaddd  zmm0, zmm0, zmm19
+        vpaddd  zmm1, zmm1, zmm26
+        vpaddd  zmm2, zmm2, zmm22
+        vpaddd  zmm3, zmm3, zmm23
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 8
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vprord  zmm4, zmm4, 7
+        vpaddd  zmm0, zmm0, zmm27
+        vpaddd  zmm1, zmm1, zmm21
+        vpaddd  zmm2, zmm2, zmm17
+        vpaddd  zmm3, zmm3, zmm24
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vprord  zmm15, zmm15, 16
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 12
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vpaddd  zmm0, zmm0, zmm31
+        vpaddd  zmm1, zmm1, zmm16
+        vpaddd  zmm2, zmm2, zmm25
+        vpaddd  zmm3, zmm3, zmm22
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm1, zmm1, zmm5
+        vpaddd  zmm2, zmm2, zmm6
+        vpaddd  zmm3, zmm3, zmm7
+        vpxord  zmm12, zmm12, zmm0
+        vpxord  zmm13, zmm13, zmm1
+        vpxord  zmm14, zmm14, zmm2
+        vpxord  zmm15, zmm15, zmm3
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vprord  zmm15, zmm15, 8
+        vpaddd  zmm8, zmm8, zmm12
+        vpaddd  zmm9, zmm9, zmm13
+        vpaddd  zmm10, zmm10, zmm14
+        vpaddd  zmm11, zmm11, zmm15
+        vpxord  zmm4, zmm4, zmm8
+        vpxord  zmm5, zmm5, zmm9
+        vpxord  zmm6, zmm6, zmm10
+        vpxord  zmm7, zmm7, zmm11
+        vprord  zmm4, zmm4, 7
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vpaddd  zmm0, zmm0, zmm30
+        vpaddd  zmm1, zmm1, zmm18
+        vpaddd  zmm2, zmm2, zmm19
+        vpaddd  zmm3, zmm3, zmm23
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 16
+        vprord  zmm12, zmm12, 16
+        vprord  zmm13, zmm13, 16
+        vprord  zmm14, zmm14, 16
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 12
+        vprord  zmm6, zmm6, 12
+        vprord  zmm7, zmm7, 12
+        vprord  zmm4, zmm4, 12
+        vpaddd  zmm0, zmm0, zmm26
+        vpaddd  zmm1, zmm1, zmm28
+        vpaddd  zmm2, zmm2, zmm20
+        vpaddd  zmm3, zmm3, zmm29
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm1, zmm1, zmm6
+        vpaddd  zmm2, zmm2, zmm7
+        vpaddd  zmm3, zmm3, zmm4
+        vpxord  zmm15, zmm15, zmm0
+        vpxord  zmm12, zmm12, zmm1
+        vpxord  zmm13, zmm13, zmm2
+        vpxord  zmm14, zmm14, zmm3
+        vprord  zmm15, zmm15, 8
+        vprord  zmm12, zmm12, 8
+        vprord  zmm13, zmm13, 8
+        vprord  zmm14, zmm14, 8
+        vpaddd  zmm10, zmm10, zmm15
+        vpaddd  zmm11, zmm11, zmm12
+        vpaddd  zmm8, zmm8, zmm13
+        vpaddd  zmm9, zmm9, zmm14
+        vpxord  zmm5, zmm5, zmm10
+        vpxord  zmm6, zmm6, zmm11
+        vpxord  zmm7, zmm7, zmm8
+        vpxord  zmm4, zmm4, zmm9
+        vprord  zmm5, zmm5, 7
+        vprord  zmm6, zmm6, 7
+        vprord  zmm7, zmm7, 7
+        vprord  zmm4, zmm4, 7
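+        # end of the unrolled rounds: feed forward by XORing the two state
+        # halves, leaving each lane's new 8-word chaining value in zmm0-zmm7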
+        vpxord  zmm0, zmm0, zmm8
+        vpxord  zmm1, zmm1, zmm9
+        vpxord  zmm2, zmm2, zmm10
+        vpxord  zmm3, zmm3, zmm11
+        vpxord  zmm4, zmm4, zmm12
+        vpxord  zmm5, zmm5, zmm13
+        vpxord  zmm6, zmm6, zmm14
+        vpxord  zmm7, zmm7, zmm15
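+        # none of the vector ops above touch EFLAGS, so the block-count
+        # compare at the top of the loop still decides this branch: keep
+        # looping while blocks remain, reloading the base flags byte into eax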
+        movzx   eax, byte ptr [rbp+0x38]
+        jne     9b
+        mov     rbx, qword ptr [rbp+0x50]
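+        # all blocks consumed: transpose from one-word-per-lane layout so
+        # each register holds whole 32-byte chaining values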
+        vpunpckldq zmm16, zmm0, zmm1
+        vpunpckhdq zmm17, zmm0, zmm1
+        vpunpckldq zmm18, zmm2, zmm3
+        vpunpckhdq zmm19, zmm2, zmm3
+        vpunpckldq zmm20, zmm4, zmm5
+        vpunpckhdq zmm21, zmm4, zmm5
+        vpunpckldq zmm22, zmm6, zmm7
+        vpunpckhdq zmm23, zmm6, zmm7
+        vpunpcklqdq zmm0, zmm16, zmm18
+        vpunpckhqdq zmm1, zmm16, zmm18
+        vpunpcklqdq zmm2, zmm17, zmm19
+        vpunpckhqdq zmm3, zmm17, zmm19
+        vpunpcklqdq zmm4, zmm20, zmm22
+        vpunpckhqdq zmm5, zmm20, zmm22
+        vpunpcklqdq zmm6, zmm21, zmm23
+        vpunpckhqdq zmm7, zmm21, zmm23
+        vshufi32x4 zmm16, zmm0, zmm4, 0x88
+        vshufi32x4 zmm17, zmm1, zmm5, 0x88
+        vshufi32x4 zmm18, zmm2, zmm6, 0x88
+        vshufi32x4 zmm19, zmm3, zmm7, 0x88
+        vshufi32x4 zmm20, zmm0, zmm4, 0xDD
+        vshufi32x4 zmm21, zmm1, zmm5, 0xDD
+        vshufi32x4 zmm22, zmm2, zmm6, 0xDD
+        vshufi32x4 zmm23, zmm3, zmm7, 0xDD
+        vshufi32x4 zmm0, zmm16, zmm17, 0x88
+        vshufi32x4 zmm1, zmm18, zmm19, 0x88
+        vshufi32x4 zmm2, zmm20, zmm21, 0x88
+        vshufi32x4 zmm3, zmm22, zmm23, 0x88
+        vshufi32x4 zmm4, zmm16, zmm17, 0xDD
+        vshufi32x4 zmm5, zmm18, zmm19, 0xDD
+        vshufi32x4 zmm6, zmm20, zmm21, 0xDD
+        vshufi32x4 zmm7, zmm22, zmm23, 0xDD
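+        # store the sixteen 32-byte chaining values (512 bytes)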
+        vmovdqu32 zmmword ptr [rbx], zmm0
+        vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1
+        vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2
+        vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3
+        vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4
+        vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5
+        vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6
+        vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7
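+        # advance the sixteen 64-bit counters by 16: masked add on the low
+        # words (k1 reflects the increment_counter argument), then carry into
+        # the high words wherever the unsigned compare detects wraparound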
+        vmovdqa32 zmm0, zmmword ptr [rsp]
+        vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40]
+        vmovdqa32 zmm2, zmm0
+        vpaddd  zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16}
+        vpcmpltud k2, zmm2, zmm0
+        vpaddd  zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
+        vmovdqa32 zmmword ptr [rsp], zmm2
+        vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
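+        # consume 16 inputs: step the pointer array and output cursor, loop
+        # while at least 16 remain, else fall through to the remainder paths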
+        add     rdi, 128
+        add     rbx, 512
+        mov     qword ptr [rbp+0x50], rbx
+        sub     rsi, 16
+        cmp     rsi, 16
+        jnc     2b
+        test    rsi, rsi
+        jnz     3f
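+        # common exit, also reached from the narrower paths below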
+4:
+        vzeroupper
+        mov     rsp, rbp
+        pop     rbp
+        pop     rbx
+        pop     r12
+        pop     r13
+        pop     r14
+        pop     r15
+        ret
+.p2align 6
+3:
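+        # remainder: hash 8 inputs at a time with 256-bit registers,
+        # broadcasting the 8 key words as the initial chaining value rows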
+        test    esi, 0x8
+        je      3f
+        vpbroadcastd ymm0, dword ptr [rcx]
+        vpbroadcastd ymm1, dword ptr [rcx+0x4]
+        vpbroadcastd ymm2, dword ptr [rcx+0x8]
+        vpbroadcastd ymm3, dword ptr [rcx+0xC]
+        vpbroadcastd ymm4, dword ptr [rcx+0x10]
+        vpbroadcastd ymm5, dword ptr [rcx+0x14]
+        vpbroadcastd ymm6, dword ptr [rcx+0x18]
+        vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        mov     r10, qword ptr [rdi+0x10]
+        mov     r11, qword ptr [rdi+0x18]
+        mov     r12, qword ptr [rdi+0x20]
+        mov     r13, qword ptr [rdi+0x28]
+        mov     r14, qword ptr [rdi+0x30]
+        mov     r15, qword ptr [rdi+0x38]
+        movzx   eax, byte ptr [rbp+0x38]
+        movzx   ebx, byte ptr [rbp+0x40]
+        or      eax, ebx
+        xor     edx, edx
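+        # per-block loop: OR in the end flags only on the final block of the
+        # chunk, then gather and transpose eight 64-byte message blocks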
+2:
+        movzx   ebx, byte ptr [rbp+0x48]
+        or      ebx, eax
+        add     rdx, 64
+        cmp     rdx, qword ptr [rsp+0x80]
+        cmove   eax, ebx
+        mov     dword ptr [rsp+0x88], eax
+        vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+        vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+        vunpcklpd ymm12, ymm8, ymm9
+        vunpckhpd ymm13, ymm8, ymm9
+        vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+        vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+        vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+        vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+        vunpcklpd ymm14, ymm10, ymm11
+        vunpckhpd ymm15, ymm10, ymm11
+        vshufps ymm16, ymm12, ymm14, 136
+        vshufps ymm17, ymm12, ymm14, 221
+        vshufps ymm18, ymm13, ymm15, 136
+        vshufps ymm19, ymm13, ymm15, 221
+        vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+        vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+        vunpcklpd ymm12, ymm8, ymm9
+        vunpckhpd ymm13, ymm8, ymm9
+        vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+        vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+        vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+        vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+        vunpcklpd ymm14, ymm10, ymm11
+        vunpckhpd ymm15, ymm10, ymm11
+        vshufps ymm20, ymm12, ymm14, 136
+        vshufps ymm21, ymm12, ymm14, 221
+        vshufps ymm22, ymm13, ymm15, 136
+        vshufps ymm23, ymm13, ymm15, 221
+        vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+        vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+        vunpcklpd ymm12, ymm8, ymm9
+        vunpckhpd ymm13, ymm8, ymm9
+        vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+        vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+        vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+        vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+        vunpcklpd ymm14, ymm10, ymm11
+        vunpckhpd ymm15, ymm10, ymm11
+        vshufps ymm24, ymm12, ymm14, 136
+        vshufps ymm25, ymm12, ymm14, 221
+        vshufps ymm26, ymm13, ymm15, 136
+        vshufps ymm27, ymm13, ymm15, 221
+        vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+        vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+        vunpcklpd ymm12, ymm8, ymm9
+        vunpckhpd ymm13, ymm8, ymm9
+        vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+        vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+        vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+        vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+        vunpcklpd ymm14, ymm10, ymm11
+        vunpckhpd ymm15, ymm10, ymm11
+        vshufps ymm28, ymm12, ymm14, 136
+        vshufps ymm29, ymm12, ymm14, 221
+        vshufps ymm30, ymm13, ymm15, 136
+        vshufps ymm31, ymm13, ymm15, 221
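+        # rows 2-3 of the state: IV constants, the per-lane counter halves,
+        # the block length, and the per-block flags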
+        vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip]
+        vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip]
+        vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip]
+        vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip]
+        vmovdqa ymm12, ymmword ptr [rsp]
+        vmovdqa ymm13, ymmword ptr [rsp+0x40]
+        vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+        vpbroadcastd ymm15, dword ptr [rsp+0x88]
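+        # seven unrolled rounds; every G step is add-message, add, xor,
+        # then rotate right by 16, 12, 8 and finally 7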
+        vpaddd  ymm0, ymm0, ymm16
+        vpaddd  ymm1, ymm1, ymm18
+        vpaddd  ymm2, ymm2, ymm20
+        vpaddd  ymm3, ymm3, ymm22
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vprord  ymm15, ymm15, 16
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 12
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vpaddd  ymm0, ymm0, ymm17
+        vpaddd  ymm1, ymm1, ymm19
+        vpaddd  ymm2, ymm2, ymm21
+        vpaddd  ymm3, ymm3, ymm23
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vprord  ymm15, ymm15, 8
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 7
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vpaddd  ymm0, ymm0, ymm24
+        vpaddd  ymm1, ymm1, ymm26
+        vpaddd  ymm2, ymm2, ymm28
+        vpaddd  ymm3, ymm3, ymm30
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 16
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vprord  ymm4, ymm4, 12
+        vpaddd  ymm0, ymm0, ymm25
+        vpaddd  ymm1, ymm1, ymm27
+        vpaddd  ymm2, ymm2, ymm29
+        vpaddd  ymm3, ymm3, ymm31
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 8
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vprord  ymm4, ymm4, 7
+        vpaddd  ymm0, ymm0, ymm18
+        vpaddd  ymm1, ymm1, ymm19
+        vpaddd  ymm2, ymm2, ymm23
+        vpaddd  ymm3, ymm3, ymm20
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vprord  ymm15, ymm15, 16
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 12
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vpaddd  ymm0, ymm0, ymm22
+        vpaddd  ymm1, ymm1, ymm26
+        vpaddd  ymm2, ymm2, ymm16
+        vpaddd  ymm3, ymm3, ymm29
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vprord  ymm15, ymm15, 8
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 7
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vpaddd  ymm0, ymm0, ymm17
+        vpaddd  ymm1, ymm1, ymm28
+        vpaddd  ymm2, ymm2, ymm25
+        vpaddd  ymm3, ymm3, ymm31
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 16
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vprord  ymm4, ymm4, 12
+        vpaddd  ymm0, ymm0, ymm27
+        vpaddd  ymm1, ymm1, ymm21
+        vpaddd  ymm2, ymm2, ymm30
+        vpaddd  ymm3, ymm3, ymm24
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 8
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vprord  ymm4, ymm4, 7
+        vpaddd  ymm0, ymm0, ymm19
+        vpaddd  ymm1, ymm1, ymm26
+        vpaddd  ymm2, ymm2, ymm29
+        vpaddd  ymm3, ymm3, ymm23
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vprord  ymm15, ymm15, 16
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 12
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vpaddd  ymm0, ymm0, ymm20
+        vpaddd  ymm1, ymm1, ymm28
+        vpaddd  ymm2, ymm2, ymm18
+        vpaddd  ymm3, ymm3, ymm30
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vprord  ymm15, ymm15, 8
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 7
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vpaddd  ymm0, ymm0, ymm22
+        vpaddd  ymm1, ymm1, ymm25
+        vpaddd  ymm2, ymm2, ymm27
+        vpaddd  ymm3, ymm3, ymm24
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 16
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vprord  ymm4, ymm4, 12
+        vpaddd  ymm0, ymm0, ymm21
+        vpaddd  ymm1, ymm1, ymm16
+        vpaddd  ymm2, ymm2, ymm31
+        vpaddd  ymm3, ymm3, ymm17
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 8
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vprord  ymm4, ymm4, 7
+        vpaddd  ymm0, ymm0, ymm26
+        vpaddd  ymm1, ymm1, ymm28
+        vpaddd  ymm2, ymm2, ymm30
+        vpaddd  ymm3, ymm3, ymm29
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vprord  ymm15, ymm15, 16
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 12
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vpaddd  ymm0, ymm0, ymm23
+        vpaddd  ymm1, ymm1, ymm25
+        vpaddd  ymm2, ymm2, ymm19
+        vpaddd  ymm3, ymm3, ymm31
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vprord  ymm15, ymm15, 8
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 7
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vpaddd  ymm0, ymm0, ymm20
+        vpaddd  ymm1, ymm1, ymm27
+        vpaddd  ymm2, ymm2, ymm21
+        vpaddd  ymm3, ymm3, ymm17
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 16
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vprord  ymm4, ymm4, 12
+        vpaddd  ymm0, ymm0, ymm16
+        vpaddd  ymm1, ymm1, ymm18
+        vpaddd  ymm2, ymm2, ymm24
+        vpaddd  ymm3, ymm3, ymm22
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 8
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vprord  ymm4, ymm4, 7
+        vpaddd  ymm0, ymm0, ymm28
+        vpaddd  ymm1, ymm1, ymm25
+        vpaddd  ymm2, ymm2, ymm31
+        vpaddd  ymm3, ymm3, ymm30
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vprord  ymm15, ymm15, 16
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 12
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vpaddd  ymm0, ymm0, ymm29
+        vpaddd  ymm1, ymm1, ymm27
+        vpaddd  ymm2, ymm2, ymm26
+        vpaddd  ymm3, ymm3, ymm24
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vprord  ymm15, ymm15, 8
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 7
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vpaddd  ymm0, ymm0, ymm23
+        vpaddd  ymm1, ymm1, ymm21
+        vpaddd  ymm2, ymm2, ymm16
+        vpaddd  ymm3, ymm3, ymm22
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 16
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vprord  ymm4, ymm4, 12
+        vpaddd  ymm0, ymm0, ymm18
+        vpaddd  ymm1, ymm1, ymm19
+        vpaddd  ymm2, ymm2, ymm17
+        vpaddd  ymm3, ymm3, ymm20
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 8
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vprord  ymm4, ymm4, 7
+        vpaddd  ymm0, ymm0, ymm25
+        vpaddd  ymm1, ymm1, ymm27
+        vpaddd  ymm2, ymm2, ymm24
+        vpaddd  ymm3, ymm3, ymm31
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vprord  ymm15, ymm15, 16
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 12
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vpaddd  ymm0, ymm0, ymm30
+        vpaddd  ymm1, ymm1, ymm21
+        vpaddd  ymm2, ymm2, ymm28
+        vpaddd  ymm3, ymm3, ymm17
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vprord  ymm15, ymm15, 8
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 7
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vpaddd  ymm0, ymm0, ymm29
+        vpaddd  ymm1, ymm1, ymm16
+        vpaddd  ymm2, ymm2, ymm18
+        vpaddd  ymm3, ymm3, ymm20
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 16
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vprord  ymm4, ymm4, 12
+        vpaddd  ymm0, ymm0, ymm19
+        vpaddd  ymm1, ymm1, ymm26
+        vpaddd  ymm2, ymm2, ymm22
+        vpaddd  ymm3, ymm3, ymm23
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 8
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vprord  ymm4, ymm4, 7
+        vpaddd  ymm0, ymm0, ymm27
+        vpaddd  ymm1, ymm1, ymm21
+        vpaddd  ymm2, ymm2, ymm17
+        vpaddd  ymm3, ymm3, ymm24
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vprord  ymm15, ymm15, 16
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 12
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vpaddd  ymm0, ymm0, ymm31
+        vpaddd  ymm1, ymm1, ymm16
+        vpaddd  ymm2, ymm2, ymm25
+        vpaddd  ymm3, ymm3, ymm22
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm1, ymm1, ymm5
+        vpaddd  ymm2, ymm2, ymm6
+        vpaddd  ymm3, ymm3, ymm7
+        vpxord  ymm12, ymm12, ymm0
+        vpxord  ymm13, ymm13, ymm1
+        vpxord  ymm14, ymm14, ymm2
+        vpxord  ymm15, ymm15, ymm3
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vprord  ymm15, ymm15, 8
+        vpaddd  ymm8, ymm8, ymm12
+        vpaddd  ymm9, ymm9, ymm13
+        vpaddd  ymm10, ymm10, ymm14
+        vpaddd  ymm11, ymm11, ymm15
+        vpxord  ymm4, ymm4, ymm8
+        vpxord  ymm5, ymm5, ymm9
+        vpxord  ymm6, ymm6, ymm10
+        vpxord  ymm7, ymm7, ymm11
+        vprord  ymm4, ymm4, 7
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vpaddd  ymm0, ymm0, ymm30
+        vpaddd  ymm1, ymm1, ymm18
+        vpaddd  ymm2, ymm2, ymm19
+        vpaddd  ymm3, ymm3, ymm23
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 16
+        vprord  ymm12, ymm12, 16
+        vprord  ymm13, ymm13, 16
+        vprord  ymm14, ymm14, 16
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 12
+        vprord  ymm6, ymm6, 12
+        vprord  ymm7, ymm7, 12
+        vprord  ymm4, ymm4, 12
+        vpaddd  ymm0, ymm0, ymm26
+        vpaddd  ymm1, ymm1, ymm28
+        vpaddd  ymm2, ymm2, ymm20
+        vpaddd  ymm3, ymm3, ymm29
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm1, ymm1, ymm6
+        vpaddd  ymm2, ymm2, ymm7
+        vpaddd  ymm3, ymm3, ymm4
+        vpxord  ymm15, ymm15, ymm0
+        vpxord  ymm12, ymm12, ymm1
+        vpxord  ymm13, ymm13, ymm2
+        vpxord  ymm14, ymm14, ymm3
+        vprord  ymm15, ymm15, 8
+        vprord  ymm12, ymm12, 8
+        vprord  ymm13, ymm13, 8
+        vprord  ymm14, ymm14, 8
+        vpaddd  ymm10, ymm10, ymm15
+        vpaddd  ymm11, ymm11, ymm12
+        vpaddd  ymm8, ymm8, ymm13
+        vpaddd  ymm9, ymm9, ymm14
+        vpxord  ymm5, ymm5, ymm10
+        vpxord  ymm6, ymm6, ymm11
+        vpxord  ymm7, ymm7, ymm8
+        vpxord  ymm4, ymm4, ymm9
+        vprord  ymm5, ymm5, 7
+        vprord  ymm6, ymm6, 7
+        vprord  ymm7, ymm7, 7
+        vprord  ymm4, ymm4, 7
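+        # feed forward: low half XOR high half yields the new chaining values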
+        vpxor   ymm0, ymm0, ymm8
+        vpxor   ymm1, ymm1, ymm9
+        vpxor   ymm2, ymm2, ymm10
+        vpxor   ymm3, ymm3, ymm11
+        vpxor   ymm4, ymm4, ymm12
+        vpxor   ymm5, ymm5, ymm13
+        vpxor   ymm6, ymm6, ymm14
+        vpxor   ymm7, ymm7, ymm15
+        movzx   eax, byte ptr [rbp+0x38]
+        jne     2b
+        mov     rbx, qword ptr [rbp+0x50]
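+        # transpose the eight 8-word results into contiguous 32-byte
+        # chaining values and write them out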
+        vunpcklps ymm8, ymm0, ymm1
+        vunpcklps ymm9, ymm2, ymm3
+        vunpckhps ymm10, ymm0, ymm1
+        vunpcklps ymm11, ymm4, ymm5
+        vunpcklps ymm0, ymm6, ymm7
+        vshufps ymm12, ymm8, ymm9, 78
+        vblendps ymm1, ymm8, ymm12, 0xCC
+        vshufps ymm8, ymm11, ymm0, 78
+        vunpckhps ymm13, ymm2, ymm3
+        vblendps ymm2, ymm11, ymm8, 0xCC
+        vblendps ymm3, ymm12, ymm9, 0xCC
+        vperm2f128 ymm12, ymm1, ymm2, 0x20
+        vmovups ymmword ptr [rbx], ymm12
+        vunpckhps ymm14, ymm4, ymm5
+        vblendps ymm4, ymm8, ymm0, 0xCC
+        vunpckhps ymm15, ymm6, ymm7
+        vperm2f128 ymm7, ymm3, ymm4, 0x20
+        vmovups ymmword ptr [rbx+0x20], ymm7
+        vshufps ymm5, ymm10, ymm13, 78
+        vblendps ymm6, ymm5, ymm13, 0xCC
+        vshufps ymm13, ymm14, ymm15, 78
+        vblendps ymm10, ymm10, ymm5, 0xCC
+        vblendps ymm14, ymm14, ymm13, 0xCC
+        vperm2f128 ymm8, ymm10, ymm14, 0x20
+        vmovups ymmword ptr [rbx+0x40], ymm8
+        vblendps ymm15, ymm13, ymm15, 0xCC
+        vperm2f128 ymm13, ymm6, ymm15, 0x20
+        vmovups ymmword ptr [rbx+0x60], ymm13
+        vperm2f128 ymm9, ymm1, ymm2, 0x31
+        vperm2f128 ymm11, ymm3, ymm4, 0x31
+        vmovups ymmword ptr [rbx+0x80], ymm9
+        vperm2f128 ymm14, ymm10, ymm14, 0x31
+        vperm2f128 ymm15, ymm6, ymm15, 0x31
+        vmovups ymmword ptr [rbx+0xA0], ymm11
+        vmovups ymmword ptr [rbx+0xC0], ymm14
+        vmovups ymmword ptr [rbx+0xE0], ymm15
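+        # shift the saved counter vectors past the eight consumed lanes
+        # (the masked moves are no-ops when the counter does not increment)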
+        vmovdqa ymm0, ymmword ptr [rsp]
+        vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20]
+        vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20]
+        vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20]
+        vmovdqa ymmword ptr [rsp], ymm0
+        vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
+        add     rbx, 256
+        mov     qword ptr [rbp+0x50], rbx
+        add     rdi, 64
+        sub     rsi, 8
+3:
+        mov     rbx, qword ptr [rbp+0x50]
+        mov     r15, qword ptr [rsp+0x80]
+        movzx   r13, byte ptr [rbp+0x38]
+        movzx   r12, byte ptr [rbp+0x48]
+        test    esi, 0x4
+        je      3f
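+        # 4-way: pack four 4-word state rows, one per 128-bit lane of a zmm,
+        # and compress all four inputs together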
+        vbroadcasti32x4 zmm0, xmmword ptr [rcx]
+        vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10]
+        vmovdqa xmm12, xmmword ptr [rsp]
+        vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10]
+        vpunpckldq xmm14, xmm12, xmm13
+        vpunpckhdq xmm15, xmm12, xmm13
+        vpermq  ymm14, ymm14, 0xDC
+        vpermq  ymm15, ymm15, 0xDC
+        vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
+        vinserti64x4 zmm13, zmm14, ymm15, 0x01
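+        # k2 = 0x4444: selects element 2 of each 128-bit lane, where the
+        # block length lives in row 3 of the state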
+        mov     eax, 17476
+        kmovw   k2, eax
+        vpblendmd zmm13 {k2}, zmm13, zmm12
+        vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        mov     r10, qword ptr [rdi+0x10]
+        mov     r11, qword ptr [rdi+0x18]
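+        # k3 = 0xAAAA and k4 = 0x8888: odd-element and element-3 masks used
+        # for the flag blend and the message permutation below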
+        mov     eax, 43690
+        kmovw   k3, eax
+        mov     eax, 34952
+        kmovw   k4, eax
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
+.p2align 5
+2:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        mov     dword ptr [rsp+0x88], eax
+        vmovdqa32 zmm2, zmm15
+        vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4]
+        vpblendmd zmm3 {k4}, zmm13, zmm8
+        vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40]
+        vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01
+        vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02
+        vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03
+        vmovups zmm9, zmmword ptr [r8+rdx-0x30]
+        vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01
+        vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02
+        vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03
+        vshufps zmm4, zmm8, zmm9, 136
+        vshufps zmm5, zmm8, zmm9, 221
+        vmovups zmm8, zmmword ptr [r8+rdx-0x20]
+        vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01
+        vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02
+        vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03
+        vmovups zmm9, zmmword ptr [r8+rdx-0x10]
+        vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01
+        vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02
+        vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03
+        vshufps zmm6, zmm8, zmm9, 136
+        vshufps zmm7, zmm8, zmm9, 221
+        vpshufd zmm6, zmm6, 0x93
+        vpshufd zmm7, zmm7, 0x93
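+        # rolled compression: al counts 7 rounds; each pass runs the column
+        # and diagonal halves, rotating the state with vpshufd in between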
+        mov     al, 7
+9:
+        vpaddd  zmm0, zmm0, zmm4
+        vpaddd  zmm0, zmm0, zmm1
+        vpxord  zmm3, zmm3, zmm0
+        vprord  zmm3, zmm3, 16
+        vpaddd  zmm2, zmm2, zmm3
+        vpxord  zmm1, zmm1, zmm2
+        vprord  zmm1, zmm1, 12
+        vpaddd  zmm0, zmm0, zmm5
+        vpaddd  zmm0, zmm0, zmm1
+        vpxord  zmm3, zmm3, zmm0
+        vprord  zmm3, zmm3, 8
+        vpaddd  zmm2, zmm2, zmm3
+        vpxord  zmm1, zmm1, zmm2
+        vprord  zmm1, zmm1, 7
+        vpshufd zmm0, zmm0, 0x93
+        vpshufd zmm3, zmm3, 0x4E
+        vpshufd zmm2, zmm2, 0x39
+        vpaddd  zmm0, zmm0, zmm6
+        vpaddd  zmm0, zmm0, zmm1
+        vpxord  zmm3, zmm3, zmm0
+        vprord  zmm3, zmm3, 16
+        vpaddd  zmm2, zmm2, zmm3
+        vpxord  zmm1, zmm1, zmm2
+        vprord  zmm1, zmm1, 12
+        vpaddd  zmm0, zmm0, zmm7
+        vpaddd  zmm0, zmm0, zmm1
+        vpxord  zmm3, zmm3, zmm0
+        vprord  zmm3, zmm3, 8
+        vpaddd  zmm2, zmm2, zmm3
+        vpxord  zmm1, zmm1, zmm2
+        vprord  zmm1, zmm1, 7
+        vpshufd zmm0, zmm0, 0x39
+        vpshufd zmm3, zmm3, 0x4E
+        vpshufd zmm2, zmm2, 0x93
+        dec     al
+        jz      9f
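+        # not the last round: shuffle the four message vectors into the
+        # order BLAKE3's fixed permutation prescribes for the next round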
+        vshufps zmm8, zmm4, zmm5, 214
+        vpshufd zmm9, zmm4, 0x0F
+        vpshufd zmm4, zmm8, 0x39
+        vshufps zmm8, zmm6, zmm7, 250
+        vpblendmd zmm9 {k3}, zmm9, zmm8
+        vpunpcklqdq zmm8, zmm7, zmm5
+        vpblendmd zmm8 {k4}, zmm8, zmm6
+        vpshufd zmm8, zmm8, 0x78
+        vpunpckhdq zmm5, zmm5, zmm7
+        vpunpckldq zmm6, zmm6, zmm5
+        vpshufd zmm7, zmm6, 0x1E
+        vmovdqa32 zmm5, zmm9
+        vmovdqa32 zmm6, zmm8
+        jmp     9b
+9:
+        vpxord  zmm0, zmm0, zmm2
+        vpxord  zmm1, zmm1, zmm3
+        mov     eax, r13d
+        cmp     rdx, r15
+        jne     2b
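+        # write the four chaining values, then shift the saved counters
+        # past the consumed lanes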
+        vmovdqu xmmword ptr [rbx], xmm0
+        vmovdqu xmmword ptr [rbx+0x10], xmm1
+        vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+        vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+        vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02
+        vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02
+        vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03
+        vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03
+        vmovdqa xmm0, xmmword ptr [rsp]
+        vmovdqa xmm2, xmmword ptr [rsp+0x40]
+        vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10]
+        vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10]
+        vmovdqa xmmword ptr [rsp], xmm0
+        vmovdqa xmmword ptr [rsp+0x40], xmm2
+        add     rbx, 128
+        add     rdi, 32
+        sub     rsi, 4
+3:
+        test    esi, 0x2
+        je      3f
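+        # 2-way: two states side by side in the halves of each ymm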
+        vbroadcasti128 ymm0, xmmword ptr [rcx]
+        vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+        vmovd   xmm13, dword ptr [rsp]
+        vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1
+        vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        vmovd   xmm14, dword ptr [rsp+0x4]
+        vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
+        vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        vinserti128 ymm13, ymm13, xmm14, 0x01
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
+.p2align 5
+2:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        mov     dword ptr [rsp+0x88], eax
+        vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+        vpbroadcastd ymm8, dword ptr [rsp+0x88]
+        vpblendd ymm3, ymm13, ymm8, 0x88
+        vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+        vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+        vshufps ymm4, ymm8, ymm9, 136
+        vshufps ymm5, ymm8, ymm9, 221
+        vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+        vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+        vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+        vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+        vshufps ymm6, ymm8, ymm9, 136
+        vshufps ymm7, ymm8, ymm9, 221
+        vpshufd ymm6, ymm6, 0x93
+        vpshufd ymm7, ymm7, 0x93
+        mov     al, 7
+9:
+        vpaddd  ymm0, ymm0, ymm4
+        vpaddd  ymm0, ymm0, ymm1
+        vpxord  ymm3, ymm3, ymm0
+        vprord  ymm3, ymm3, 16
+        vpaddd  ymm2, ymm2, ymm3
+        vpxord  ymm1, ymm1, ymm2
+        vprord  ymm1, ymm1, 12
+        vpaddd  ymm0, ymm0, ymm5
+        vpaddd  ymm0, ymm0, ymm1
+        vpxord  ymm3, ymm3, ymm0
+        vprord  ymm3, ymm3, 8
+        vpaddd  ymm2, ymm2, ymm3
+        vpxord  ymm1, ymm1, ymm2
+        vprord  ymm1, ymm1, 7
+        vpshufd ymm0, ymm0, 0x93
+        vpshufd ymm3, ymm3, 0x4E
+        vpshufd ymm2, ymm2, 0x39
+        vpaddd  ymm0, ymm0, ymm6
+        vpaddd  ymm0, ymm0, ymm1
+        vpxord  ymm3, ymm3, ymm0
+        vprord  ymm3, ymm3, 16
+        vpaddd  ymm2, ymm2, ymm3
+        vpxord  ymm1, ymm1, ymm2
+        vprord  ymm1, ymm1, 12
+        vpaddd  ymm0, ymm0, ymm7
+        vpaddd  ymm0, ymm0, ymm1
+        vpxord  ymm3, ymm3, ymm0
+        vprord  ymm3, ymm3, 8
+        vpaddd  ymm2, ymm2, ymm3
+        vpxord  ymm1, ymm1, ymm2
+        vprord  ymm1, ymm1, 7
+        vpshufd ymm0, ymm0, 0x39
+        vpshufd ymm3, ymm3, 0x4E
+        vpshufd ymm2, ymm2, 0x93
+        dec     al
+        jz      9f
+        vshufps ymm8, ymm4, ymm5, 214
+        vpshufd ymm9, ymm4, 0x0F
+        vpshufd ymm4, ymm8, 0x39
+        vshufps ymm8, ymm6, ymm7, 250
+        vpblendd ymm9, ymm9, ymm8, 0xAA
+        vpunpcklqdq ymm8, ymm7, ymm5
+        vpblendd ymm8, ymm8, ymm6, 0x88
+        vpshufd ymm8, ymm8, 0x78
+        vpunpckhdq ymm5, ymm5, ymm7
+        vpunpckldq ymm6, ymm6, ymm5
+        vpshufd ymm7, ymm6, 0x1E
+        vmovdqa ymm5, ymm9
+        vmovdqa ymm6, ymm8
+        jmp     9b
+9:
+        vpxor   ymm0, ymm0, ymm2
+        vpxor   ymm1, ymm1, ymm3
+        mov     eax, r13d
+        cmp     rdx, r15
+        jne     2b
+        vmovdqu xmmword ptr [rbx], xmm0
+        vmovdqu xmmword ptr [rbx+0x10], xmm1
+        vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+        vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+        vmovdqa xmm0, xmmword ptr [rsp]
+        vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10]
+        vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8]
+        vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48]
+        vmovdqa xmmword ptr [rsp], xmm0
+        vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
+        add     rbx, 64
+        add     rdi, 16
+        sub     rsi, 2
+3:
+        test    esi, 0x1
+        je      4b
+        vmovdqu xmm0, xmmword ptr [rcx]
+        vmovdqu xmm1, xmmword ptr [rcx+0x10]
+        vmovd   xmm14, dword ptr [rsp]
+        vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
+        vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
+        mov     r8, qword ptr [rdi]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
+.p2align 5
+2:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        vpinsrd xmm3, xmm14, eax, 3
+        vmovdqa xmm2, xmm15
+        vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+        vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+        vshufps xmm4, xmm8, xmm9, 136
+        vshufps xmm5, xmm8, xmm9, 221
+        vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+        vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+        vshufps xmm6, xmm8, xmm9, 136
+        vshufps xmm7, xmm8, xmm9, 221
+        vpshufd xmm6, xmm6, 0x93
+        vpshufd xmm7, xmm7, 0x93
+        mov     al, 7
+9:
+        vpaddd  xmm0, xmm0, xmm4
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 16
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 12
+        vpaddd  xmm0, xmm0, xmm5
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 8
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd  xmm0, xmm0, xmm6
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 16
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 12
+        vpaddd  xmm0, xmm0, xmm7
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 8
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        dec     al
+        jz      9f
+        vshufps xmm8, xmm4, xmm5, 214
+        vpshufd xmm9, xmm4, 0x0F
+        vpshufd xmm4, xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        jmp     9b
+9:
+        vpxor   xmm0, xmm0, xmm2
+        vpxor   xmm1, xmm1, xmm3
+        mov     eax, r13d
+        cmp     rdx, r15
+        jne     2b
+        vmovdqu xmmword ptr [rbx], xmm0
+        vmovdqu xmmword ptr [rbx+0x10], xmm1
+        jmp     4b
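+
+# Single-block compression (SysV ABI): rdi = cv, rsi = block, edx = block_len,
+# rcx = counter, r8d = flags. xmm0/xmm1 hold the cv, xmm2 the IV, and xmm3 the
+# packed counter/block_len/flags; the 9: loop below runs the 7 rounds.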
+.p2align 6
+_blake3_compress_in_place_avx512:
+blake3_compress_in_place_avx512:
+        _CET_ENDBR
+        vmovdqu xmm0, xmmword ptr [rdi]
+        vmovdqu xmm1, xmmword ptr [rdi+0x10]
+        movzx   eax, r8b
+        movzx   edx, dl
+        shl     rax, 32
+        add     rdx, rax
+        vmovq   xmm3, rcx
+        vmovq   xmm4, rdx
+        vpunpcklqdq xmm3, xmm3, xmm4
+        vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+        vmovups xmm8, xmmword ptr [rsi]
+        vmovups xmm9, xmmword ptr [rsi+0x10]
+        vshufps xmm4, xmm8, xmm9, 136
+        vshufps xmm5, xmm8, xmm9, 221
+        vmovups xmm8, xmmword ptr [rsi+0x20]
+        vmovups xmm9, xmmword ptr [rsi+0x30]
+        vshufps xmm6, xmm8, xmm9, 136
+        vshufps xmm7, xmm8, xmm9, 221
+        vpshufd xmm6, xmm6, 0x93
+        vpshufd xmm7, xmm7, 0x93
+        mov     al, 7
+9:
+        vpaddd  xmm0, xmm0, xmm4
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 16
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 12
+        vpaddd  xmm0, xmm0, xmm5
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 8
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd  xmm0, xmm0, xmm6
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 16
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 12
+        vpaddd  xmm0, xmm0, xmm7
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 8
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        dec     al
+        jz      9f
+        vshufps xmm8, xmm4, xmm5, 214
+        vpshufd xmm9, xmm4, 0x0F
+        vpshufd xmm4, xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        jmp     9b
+9:
+        vpxor   xmm0, xmm0, xmm2
+        vpxor   xmm1, xmm1, xmm3
+        vmovdqu xmmword ptr [rdi], xmm0
+        vmovdqu xmmword ptr [rdi+0x10], xmm1
+        ret
+
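+# XOF variant of the compression above: writes the full 64-byte output to r9,
+# xoring the second 32 bytes with the input chaining value before the stores.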
+.p2align 6
+_blake3_compress_xof_avx512:
+blake3_compress_xof_avx512:
+        _CET_ENDBR
+        vmovdqu xmm0, xmmword ptr [rdi]
+        vmovdqu xmm1, xmmword ptr [rdi+0x10]
+        movzx   eax, r8b
+        movzx   edx, dl
+        shl     rax, 32
+        add     rdx, rax
+        vmovq   xmm3, rcx
+        vmovq   xmm4, rdx
+        vpunpcklqdq xmm3, xmm3, xmm4
+        vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+        vmovups xmm8, xmmword ptr [rsi]
+        vmovups xmm9, xmmword ptr [rsi+0x10]
+        vshufps xmm4, xmm8, xmm9, 136
+        vshufps xmm5, xmm8, xmm9, 221
+        vmovups xmm8, xmmword ptr [rsi+0x20]
+        vmovups xmm9, xmmword ptr [rsi+0x30]
+        vshufps xmm6, xmm8, xmm9, 136
+        vshufps xmm7, xmm8, xmm9, 221
+        vpshufd xmm6, xmm6, 0x93
+        vpshufd xmm7, xmm7, 0x93
+        mov     al, 7
+9:
+        vpaddd  xmm0, xmm0, xmm4
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 16
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 12
+        vpaddd  xmm0, xmm0, xmm5
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 8
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd  xmm0, xmm0, xmm6
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 16
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 12
+        vpaddd  xmm0, xmm0, xmm7
+        vpaddd  xmm0, xmm0, xmm1
+        vpxord  xmm3, xmm3, xmm0
+        vprord  xmm3, xmm3, 8
+        vpaddd  xmm2, xmm2, xmm3
+        vpxord  xmm1, xmm1, xmm2
+        vprord  xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        dec     al
+        jz      9f
+        vshufps xmm8, xmm4, xmm5, 214
+        vpshufd xmm9, xmm4, 0x0F
+        vpshufd xmm4, xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        jmp     9b
+9:
+        vpxor   xmm0, xmm0, xmm2
+        vpxor   xmm1, xmm1, xmm3
+        vpxor   xmm2, xmm2, [rdi]
+        vpxor   xmm3, xmm3, [rdi+0x10]
+        vmovdqu xmmword ptr [r9], xmm0
+        vmovdqu xmmword ptr [r9+0x10], xmm1
+        vmovdqu xmmword ptr [r9+0x20], xmm2
+        vmovdqu xmmword ptr [r9+0x30], xmm3
+        ret
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align  6
+INDEX0:
+        .long    0,  1,  2,  3, 16, 17, 18, 19
+        .long    8,  9, 10, 11, 24, 25, 26, 27
+INDEX1:
+        .long    4,  5,  6,  7, 20, 21, 22, 23
+        .long   12, 13, 14, 15, 28, 29, 30, 31
+ADD0:
+        .long    0,  1,  2,  3,  4,  5,  6,  7
+        .long    8,  9, 10, 11, 12, 13, 14, 15
+ADD1:   .long    1
+
+ADD16:  .long   16
+BLAKE3_BLOCK_LEN:
+        .long   64
+.p2align 6
+BLAKE3_IV:
+BLAKE3_IV_0:
+        .long   0x6A09E667
+BLAKE3_IV_1:
+        .long   0xBB67AE85
+BLAKE3_IV_2:
+        .long   0x3C6EF372
+BLAKE3_IV_3:
+        .long   0xA54FF53A
diff --git a/src/blake3/blake3_dispatch.c b/src/blake3/blake3_dispatch.c
new file mode 100644
index 0000000..c571d1e
--- /dev/null
+++ b/src/blake3/blake3_dispatch.c
@@ -0,0 +1,269 @@
+#include "blake3.h"
+
+#if defined(IS_X86)
+#if defined(__GNUC__)
+#include <immintrin.h>
+#else
+#undef IS_X86 /* Unimplemented! */
+#endif
+#endif
+
+#define MAYBE_UNUSED(x) (void)((x))
+
+#if defined(IS_X86)
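+/* xgetbv with ecx = 0 reads XCR0, which reports which register states the OS
+ * saves and restores; SIMD dispatch must check this as well as the CPUID
+ * feature bits. */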
+static u64
+xgetbv(void)
+{
+    u32 eax = 0, edx = 0;
+    __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
+    return ((u64) edx << 32) | eax;
+}
+
+static void
+cpuid(u32 out[4], u32 id)
+{
+#if defined(__i386__) || defined(_M_IX86)
+    __asm__ __volatile__("movl %%ebx, %1\n"
+                         "cpuid\n"
+                         "xchgl %1, %%ebx\n"
+                         : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+                         : "a"(id));
+#else
+    __asm__ __volatile__("cpuid\n"
+                         : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+                         : "a"(id));
+#endif
+}
+
+static void
+cpuidex(u32 out[4], u32 id, u32 sid)
+{
+#if defined(__i386__) || defined(_M_IX86)
+    __asm__ __volatile__("movl %%ebx, %1\n"
+                         "cpuid\n"
+                         "xchgl %1, %%ebx\n"
+                         : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+                         : "a"(id), "c"(sid));
+#else
+    __asm__ __volatile__("cpuid\n"
+                         : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+                         : "a"(id), "c"(sid));
+#endif
+}
+
+#endif /* IS_X86 */
+
+enum cpu_feature {
+  SSE2      = 1 << 0,
+  SSSE3     = 1 << 1,
+  SSE41     = 1 << 2,
+  AVX       = 1 << 3,
+  AVX2      = 1 << 4,
+  AVX512F   = 1 << 5,
+  AVX512VL  = 1 << 6,
+  /* ... */
+  UNDEFINED = 1 << 30
+};
+
+#if !defined(BLAKE3_TESTING)
+static /* Allow the variable to be controlled manually for testing */
+#endif
+enum cpu_feature g_cpu_features = UNDEFINED;
+
+#if !defined(BLAKE3_TESTING)
+static
+#endif
+enum cpu_feature
+get_cpu_features(void)
+{
+    if (g_cpu_features != UNDEFINED)
+        return g_cpu_features;
+
+#if defined(IS_X86)
+    u32 regs[4] = {0};
+    u32 *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
+    (void) edx;
+    enum cpu_feature features = 0;
+    cpuid(regs, 0);
+    const int max_id = *eax;
+    cpuid(regs, 1);
+#if defined(__amd64__) || defined(_M_X64)
+    features |= SSE2;
+#else
+    if (*edx & (1UL << 26))
+        features |= SSE2;
+#endif
+    if (*ecx & (1UL << 0))
+        features |= SSSE3;
+    if (*ecx & (1UL << 19))
+        features |= SSE41;
+
+    if (*ecx & (1UL << 27)) { /* OSXSAVE */
+        const u64 mask = xgetbv();
+        if ((mask & 6) == 6) { /* SSE and AVX states */
+            if (*ecx & (1UL << 28))
+                features |= AVX;
+            if (max_id >= 7) {
+                cpuidex(regs, 7, 0);
+                if (*ebx & (1UL << 5))
+                    features |= AVX2;
+                if ((mask & 224) == 224) { /* Opmask, ZMM_Hi256, Hi16_Zmm */
+                    if (*ebx & (1UL << 31))
+                        features |= AVX512VL;
+                    if (*ebx & (1UL << 16))
+                        features |= AVX512F;
+                }
+            }
+        }
+    }
+    g_cpu_features = features;
+    return features;
+#else
+    /* How to detect NEON? */
+    return 0;
+#endif
+}
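+
+/* Each dispatch wrapper below tries the best implementation the CPU supports
+ * and falls back to the portable code; a BLAKE3_NO_* macro compiles the
+ * corresponding tier out entirely. */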
+
+void blake3_compress_in_place(u32 cv[8],
+                              const u8 block[BLAKE3_BLOCK_LEN],
+                              u8 block_len, u64 counter,
+                              u8 flags)
+{
+#if defined(IS_X86)
+    const enum cpu_feature features = get_cpu_features();
+    MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+    if (features & AVX512VL) {
+        blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
+        return;
+    }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+    if (features & SSE41) {
+        blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
+        return;
+    }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+    if (features & SSE2) {
+        blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
+        return;
+    }
+#endif
+#endif
+    blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
+}
+
+void blake3_compress_xof(const u32 cv[8],
+                         const u8 block[BLAKE3_BLOCK_LEN],
+                         u8 block_len, u64 counter, u8 flags,
+                         u8 out[64])
+{
+#if defined(IS_X86)
+    const enum cpu_feature features = get_cpu_features();
+    MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+    if (features & AVX512VL) {
+        blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
+        return;
+    }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+    if (features & SSE41) {
+        blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
+        return;
+    }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+    if (features & SSE2) {
+        blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
+        return;
+    }
+#endif
+#endif
+    blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
+}
+
+void blake3_hash_many(const u8 *const *inputs, size_t num_inputs,
+                      size_t blocks, const u32 key[8], u64 counter,
+                      char increment_counter, u8 flags,
+                      u8 flags_start, u8 flags_end, u8 *out)
+{
+#if defined(IS_X86)
+    const enum cpu_feature features = get_cpu_features();
+    MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+    if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
+        blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
+                                increment_counter, flags, flags_start, flags_end,
+                                out);
+        return;
+    }
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+    if (features & AVX2) {
+        blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
+                              increment_counter, flags, flags_start, flags_end,
+                              out);
+        return;
+    }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+    if (features & SSE41) {
+        blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+                               increment_counter, flags, flags_start, flags_end,
+                               out);
+        return;
+    }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+    if (features & SSE2) {
+        blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
+                              increment_counter, flags, flags_start, flags_end,
+                              out);
+        return;
+    }
+#endif
+#endif
+
+#if BLAKE3_USE_NEON == 1
+    blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
+                          increment_counter, flags, flags_start, flags_end, out);
+    return;
+#endif
+
+    blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+                              increment_counter, flags, flags_start, flags_end,
+                              out);
+}
+
+/* The dynamically detected SIMD degree of the current platform. */
+size_t
+blake3_simd_degree(void)
+{
+#if defined(IS_X86)
+    const enum cpu_feature features = get_cpu_features();
+    MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+    if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL))
+        return 16;
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+    if (features & AVX2)
+        return 8;
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+    if (features & SSE41)
+        return 4;
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+    if (features & SSE2)
+        return 4;
+#endif
+#endif
+#if BLAKE3_USE_NEON == 1
+    return 4;
+#endif
+    return 1;
+}
diff --git a/src/blake3/blake3_impl.c b/src/blake3/blake3_impl.c
new file mode 100644
index 0000000..4af38de
--- /dev/null
+++ b/src/blake3/blake3_impl.c
@@ -0,0 +1,594 @@
+#include <assert.h>
+#include "blake3.h"
+
+INLINE void
+chunk_state_init(blake3_chunk_state *self, const u32 key[8], u8 flags)
+{
+    memcpy(self->cv, key, BLAKE3_KEY_LEN);
+    self->chunk_counter = 0;
+    memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+    self->buf_len = 0;
+    self->blocks_compressed = 0;
+    self->flags = flags;
+}
+
+INLINE void
+chunk_state_reset(blake3_chunk_state *self, const u32 key[8], u64 chunk_counter)
+{
+    memcpy(self->cv, key, BLAKE3_KEY_LEN);
+    self->chunk_counter = chunk_counter;
+    self->blocks_compressed = 0;
+    memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+    self->buf_len = 0;
+}
+
+INLINE size_t
+chunk_state_len(const blake3_chunk_state *self)
+{
+    return (BLAKE3_BLOCK_LEN * (size_t) self->blocks_compressed) + ((size_t) self->buf_len);
+}
+
+INLINE size_t
+chunk_state_fill_buf(blake3_chunk_state *self, const u8 *input, size_t input_len)
+{
+    size_t take = BLAKE3_BLOCK_LEN - ((size_t) self->buf_len);
+    if (take > input_len)
+        take = input_len;
+    u8 *dest = self->buf + ((size_t) self->buf_len);
+    memcpy(dest, input, take);
+    self->buf_len += (u8) take;
+    return take;
+}
+
+INLINE u8
+chunk_state_maybe_start_flag(const blake3_chunk_state *self)
+{
+    if (self->blocks_compressed == 0)
+        return CHUNK_START;
+    else
+        return 0;
+}
+
+typedef struct {
+  u32 input_cv[8];
+  u64 counter;
+  u8 block[BLAKE3_BLOCK_LEN];
+  u8 block_len;
+  u8 flags;
+} output_t;
+
+INLINE output_t
+make_output(const u32 input_cv[8], const u8 block[BLAKE3_BLOCK_LEN],
+            u8 block_len, u64 counter, u8 flags)
+{
+    output_t ret;
+    memcpy(ret.input_cv, input_cv, 32);
+    memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
+    ret.block_len = block_len;
+    ret.counter = counter;
+    ret.flags = flags;
+    return ret;
+}
+
+/* Chaining values within a given chunk (specifically the compress_in_place
+ * interface) are represented as words. This avoids unnecessary bytes<->words
+ * conversion overhead in the portable implementation. However, the hash_many
+ * interface handles both user input and parent node blocks, so it accepts
+ * bytes. For that reason, chaining values in the CV stack are represented as
+ * bytes.
+ */
+INLINE void
+output_chaining_value(const output_t *self, u8 cv[32])
+{
+    u32 cv_words[8];
+    memcpy(cv_words, self->input_cv, 32);
+    blake3_compress_in_place(cv_words, self->block, self->block_len,
+                             self->counter, self->flags);
+    store_cv_words(cv, cv_words);
+}
+
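+/* Root output: every 64-byte block re-runs the compression with ROOT set and
+ * an incremented output_block_counter, which is what makes the output
+ * extendable beyond 32 bytes. */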
+INLINE void
+output_root_bytes(const output_t *self, u8 *out, size_t out_len)
+{
+    u64 output_block_counter = 0;
+    u8 wide_buf[64];
+    while (out_len > 0) {
+        blake3_compress_xof(self->input_cv, self->block, self->block_len,
+                            output_block_counter, self->flags | ROOT, wide_buf);
+        size_t available_bytes = 64;
+        size_t memcpy_len;
+        if (out_len > available_bytes)
+            memcpy_len = available_bytes;
+        else
+            memcpy_len = out_len;
+        memcpy(out, wide_buf, memcpy_len);
+        out += memcpy_len;
+        out_len -= memcpy_len;
+        ++output_block_counter;
+    }
+}
+
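+/* Buffered single-chunk update: top up and, if more input remains, compress
+ * the buffered block, then compress whole blocks straight from the input and
+ * buffer the tail. The final block is never compressed here; it waits for
+ * chunk_state_output(), which must set CHUNK_END. */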
+INLINE void
+chunk_state_update(blake3_chunk_state *self, const u8 *input, size_t input_len)
+{
+    if (self->buf_len > 0) {
+        size_t take = chunk_state_fill_buf(self, input, input_len);
+        input += take;
+        input_len -= take;
+        if (input_len > 0) {
+            blake3_compress_in_place(
+                self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter,
+                self->flags | chunk_state_maybe_start_flag(self));
+            ++self->blocks_compressed;
+            self->buf_len = 0;
+            memset(self->buf, 0, BLAKE3_BLOCK_LEN);
+        }
+    }
+
+    while (input_len > BLAKE3_BLOCK_LEN) {
+        blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN,
+                                 self->chunk_counter,
+                                 self->flags | chunk_state_maybe_start_flag(self));
+        ++self->blocks_compressed;
+        input += BLAKE3_BLOCK_LEN;
+        input_len -= BLAKE3_BLOCK_LEN;
+    }
+
+    size_t take = chunk_state_fill_buf(self, input, input_len);
+    input += take;
+    input_len -= take;
+}
+
+INLINE output_t
+chunk_state_output(const blake3_chunk_state *self)
+{
+    u8 block_flags = self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
+    return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags);
+}
+
+INLINE output_t
+parent_output(const u8 block[BLAKE3_BLOCK_LEN], const u32 key[8], u8 flags)
+{
+    return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
+}
+
+/* Given some input larger than one chunk, return the number of bytes that
+ * should go in the left subtree. This is the largest power-of-2 number of
+ * chunks that leaves at least 1 byte for the right subtree.
+ */
+INLINE size_t
+left_len(size_t content_len)
+{
+    /* Subtract 1 to reserve at least one byte for the right side. content_len
+     * should always be greater than BLAKE3_CHUNK_LEN. */
+    size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
+    return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
+}
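+
+/* For example, with BLAKE3_CHUNK_LEN = 1024: content_len = 4609 gives
+ * full_chunks = 4608 / 1024 = 4, so 4096 bytes go left and 513 go right;
+ * content_len = 2048 gives full_chunks = 2047 / 1024 = 1, so the split is
+ * 1024/1024 and the right subtree keeps at least one byte. */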
+
+/* Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
+ * on a single thread. Write out the chunk chaining values and return the
+ * number of chunks hashed. These chunks are never the root and never empty;
+ * those cases use a different codepath.
+ */
+INLINE size_t
+compress_chunks_parallel(const u8 *input, size_t input_len, const u32 key[8],
+                         u64 chunk_counter, u8 flags, u8 *out)
+{
+#if defined(BLAKE3_TESTING)
+    assert(0 < input_len);
+    assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
+#endif
+
+    const u8 *chunks_array[MAX_SIMD_DEGREE];
+    size_t input_position = 0;
+    size_t chunks_array_len = 0;
+    while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
+        chunks_array[chunks_array_len] = &input[input_position];
+        input_position += BLAKE3_CHUNK_LEN;
+        ++chunks_array_len;
+    }
+
+    blake3_hash_many(chunks_array, chunks_array_len,
+                     BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter,
+                     1, flags, CHUNK_START, CHUNK_END, out);
+
+    /* Hash the remaining partial chunk, if there is one. Note that the empty
+     * chunk (meaning the empty message) is a different codepath. */
+    if (input_len > input_position) {
+        u64 counter = chunk_counter + (u64) chunks_array_len;
+        blake3_chunk_state chunk_state;
+        chunk_state_init(&chunk_state, key, flags);
+        chunk_state.chunk_counter = counter;
+        chunk_state_update(&chunk_state, &input[input_position],
+                           input_len - input_position);
+        output_t output = chunk_state_output(&chunk_state);
+        output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
+        return chunks_array_len + 1;
+    } else {
+        return chunks_array_len;
+    }
+}
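+
+/* For example, at SIMD degree 4 an input of 3584 bytes (3.5 chunks) hashes 3
+ * whole chunks in a single blake3_hash_many() call and runs the 512-byte
+ * remainder through a temporary chunk_state, returning 4 chaining values. */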
+
+/* Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
+ * on a single thread. Write out the parent chaining values and return the
+ * number of parents hashed. (If there's an odd input chaining value left over,
+ * return it as an additional output.) These parents are never the root and
+ * never empty; those cases use a different codepath.
+ */
+INLINE size_t
+compress_parents_parallel(const u8 *child_chaining_values, size_t num_chaining_values,
+                          const u32 key[8], u8 flags, u8 *out)
+{
+#if defined(BLAKE3_TESTING)
+    assert(2 <= num_chaining_values);
+    assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
+#endif
+
+    const u8 *parents_array[MAX_SIMD_DEGREE_OR_2];
+    size_t parents_array_len = 0;
+    while (num_chaining_values - (2 * parents_array_len) >= 2) {
+        parents_array[parents_array_len] =
+            &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
+        ++parents_array_len;
+    }
+
+    blake3_hash_many(parents_array, parents_array_len, 1, key,
+                     0, /* Parents always use counter 0. */
+                     0, flags | PARENT,
+                     0, /* Parents have no start flags. */
+                     0, /* Parents have no end flags. */
+                     out);
+
+    /* If there's an odd child left over, it becomes an output. */
+    if (num_chaining_values > 2 * parents_array_len) {
+        memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
+               &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
+               BLAKE3_OUT_LEN);
+        return parents_array_len + 1;
+    } else {
+        return parents_array_len;
+    }
+}
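+
+/* For example, 5 child CVs compress into 2 parent CVs, with the odd child
+ * carried through unchanged, for a return value of 3. */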
+
+/* The wide helper function writes out an array of chaining values and returns
+ * the length of that array. The number of chaining values returned is the
+ * dynamically detected SIMD degree, at most MAX_SIMD_DEGREE, or fewer if the
+ * input is shorter than that many chunks. The reason for maintaining a wide
+ * array of chaining values going back up the tree is to allow the
+ * implementation to hash as many parents in parallel as possible.
+ *
+ * As a special case when the SIMD degree is 1, this function will still return
+ * at least 2 outputs. This guarantees that this function doesn't perform the
+ * root compression. (If it did, it would use the wrong flags, and also we
+ * wouldn't be able to implement extendable output.) Note that this function is
+ * not used when the whole input is only 1 chunk long; that's a different
+ * codepath.
+ *
+ * Why not just have the caller split the input on the first update(), instead
+ * of implementing this special rule? Because we don't want to limit SIMD or
+ * multi-threading parallelism for that update().
+ */
+static size_t
+blake3_compress_subtree_wide(const u8 *input, size_t input_len,
+                             const u32 key[8], u64 chunk_counter, u8 flags, u8 *out)
+{
+    /* Note that the single chunk case does *not* bump the SIMD degree up to 2
+     * when it is 1. If this implementation adds multi-threading in the future,
+     * this gives us the option of multi-threading even the 2-chunk case, which
+     * can help performance on smaller platforms.
+     */
+    if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN)
+        return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out);
+
+    /* With more than simd_degree chunks, we need to recurse. Start by dividing
+     * the input into left and right subtrees. (Note that this is only optimal
+     * as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
+     * of 3 or something, we'll need a more complicated strategy.)
+     */
+    size_t left_input_len = left_len(input_len);
+    size_t right_input_len = input_len - left_input_len;
+    const u8 *right_input = &input[left_input_len];
+    u64 right_chunk_counter = chunk_counter + (u64) (left_input_len / BLAKE3_CHUNK_LEN);
+
+    /* Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
+     * account for the special case of returning 2 outputs when the SIMD degree
+     * is 1.
+     */
+    u8 cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+    size_t degree = blake3_simd_degree();
+    if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
+        /* The special case: We always use a degree of at least two, to make
+         * sure there are two outputs. Except, as noted above, at the chunk
+         * level, where we allow degree=1. (Note that the 1-chunk-input case is
+         * a different codepath.)
+         */
+        degree = 2;
+    }
+    u8 *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
+
+    /* Recurse! If this implementation adds multi-threading support in the
+     * future, this is where it will go. */
+    size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
+                                                 chunk_counter, flags, cv_array);
+    size_t right_n = blake3_compress_subtree_wide(
+        right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
+
+    /* The special case again. If simd_degree=1, then we'll have left_n=1 and
+     * right_n=1. Rather than compressing them into a single output, return
+     * them directly, to make sure we always have at least two outputs.
+     */
+    if (left_n == 1) {
+        memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+        return 2;
+    }
+
+    /* Otherwise, do one layer of parent node compression. */
+    size_t num_chaining_values = left_n + right_n;
+    return compress_parents_parallel(cv_array, num_chaining_values, key, flags, out);
+}
+
+/* Hash a subtree with compress_subtree_wide(), and then condense the resulting
+ * list of chaining values down to a single parent node. Don't compress that
+ * last parent node, however. Instead, return its message bytes (the
+ * concatenated chaining values of its children). This is necessary when the
+ * first call to update() supplies a complete subtree, because the topmost
+ * parent node of that subtree could end up being the root. It's also necessary
+ * for extended output in the general case.
+ *
+ * As with compress_subtree_wide(), this function is not used on inputs of 1
+ * chunk or less. That's a different codepath.
+ */
+INLINE void
+compress_subtree_to_parent_node(const u8 *input, size_t input_len, const u32 key[8],
+                                u64 chunk_counter, u8 flags, u8 out[2 * BLAKE3_OUT_LEN])
+{
+#if defined(BLAKE3_TESTING)
+    assert(input_len > BLAKE3_CHUNK_LEN);
+#endif
+
+    u8 cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+    size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
+                                                  chunk_counter, flags, cv_array);
+    assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
+
+    /* If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
+     * compress_subtree_wide() returns more than 2 chaining values. Condense
+     * them into 2 by forming parent nodes repeatedly.
+     */
+    u8 out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
+    /* The second half of this loop condition is always true, and we just
+     * asserted it above. But GCC can't tell that it's always true, and if NDEBUG
+     * is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
+     * warnings here. GCC 8.5 is particularly sensitive, so if you're changing
+     * this code, test it against that version.
+     */
+    while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
+        num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
+        memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
+    }
+    memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+}
+
+void
+blake3_init(blake3_ctx *ctx)
+{
+    memcpy(ctx->key, IV, BLAKE3_KEY_LEN);
+    chunk_state_init(&ctx->chunk, IV, 0);
+    ctx->cv_stack_len = 0;
+}
+
+/* As described in hasher_push_cv() below, we do "lazy merging", delaying
+ * merges until right before the next CV is about to be added. This is
+ * different from the reference implementation. Another difference is that we
+ * aren't always merging 1 chunk at a time. Instead, each CV might represent
+ * any power-of-two number of chunks, as long as the smaller-above-larger stack
+ * order is maintained. Instead of the "count the trailing 0-bits" algorithm
+ * described in the spec, we use a "count the total number of 1-bits" variant
+ * that doesn't require us to retain the subtree size of the CV on top of the
+ * stack. The principle is the same: each CV that should remain in the stack is
+ * represented by a 1-bit in the total number of chunks (or bytes) so far.
+ */
+INLINE void
+hasher_merge_cv_stack(blake3_ctx *ctx, u64 total_len)
+{
+    size_t post_merge_stack_len = (size_t) popcnt(total_len);
+    while (ctx->cv_stack_len > post_merge_stack_len) {
+        u8 *parent_node = &ctx->cv_stack[(ctx->cv_stack_len - 2) * BLAKE3_OUT_LEN];
+        output_t output = parent_output(parent_node, ctx->key, ctx->chunk.flags);
+        output_chaining_value(&output, parent_node);
+        --ctx->cv_stack_len;
+    }
+}
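+
+/* For example, after 6 chunks total (binary 110) the stack is merged down to
+ * popcnt(6) = 2 CVs: one for a complete 4-chunk subtree and one for a 2-chunk
+ * subtree, matching the two 1-bits of the chunk count. */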
+
+/* In reference_impl.rs, we merge the new CV with existing CVs from the stack
+ * before pushing it. We can do that because we know more input is coming, so
+ * we know none of the merges are root.
+ *
+ * This setting is different. We want to feed as much input as possible to
+ * compress_subtree_wide(), without setting aside anything for the chunk_state.
+ * If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
+ * as a single subtree, if at all possible.
+ *
+ * This leads to two problems:
+ * 1) This 64 KiB input might be the only call that ever gets made to update.
+ *    In this case, the root node of the 64 KiB subtree would be the root node
+ *    of the whole tree, and it would need to be ROOT finalized. We can't
+ *    compress it until we know.
+ * 2) This 64 KiB input might complete a larger tree, whose root node is
+ *    similarly going to be the root of the whole tree. For example, maybe
+ *    we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
+ *    node at the root of the 256 KiB subtree until we know how to finalize it.
+ *
+ * The second problem is solved with "lazy merging". That is, when we're about
+ * to add a CV to the stack, we don't merge it with anything first, as the
+ * reference impl does. Instead we do merges using the *previous* CV that was
+ * added, which is sitting on top of the stack, and we put the new CV
+ * (unmerged) on top of the stack afterwards. This guarantees that we never
+ * merge the root node until finalize().
+ *
+ * Solving the first problem requires an additional tool,
+ * compress_subtree_to_parent_node(). That function always returns the top
+ * *two* chaining values of the subtree it's compressing. We then do lazy
+ * merging with each of them separately, so that the second CV will always
+ * remain unmerged. (That also helps us support extendable output when we're
+ * hashing an input all-at-once.)
+ */
+INLINE void
+hasher_push_cv(blake3_ctx *ctx, u8 new_cv[BLAKE3_OUT_LEN], u64 chunk_counter)
+{
+    hasher_merge_cv_stack(ctx, chunk_counter);
+    memcpy(&ctx->cv_stack[ctx->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN);
+    ++ctx->cv_stack_len;
+}
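+
+/* Illustrative trace of the lazy merging above, pushing chunk CVs c0, c1, c2
+ * (chunk_counter is the number of chunks hashed before the new CV):
+ *   push(c0, 0): popcnt(0) = 0, stack empty -> [c0]
+ *   push(c1, 1): popcnt(1) = 1, no merge    -> [c0, c1]
+ *   push(c2, 2): popcnt(2) = 1, so c0 and c1 merge only now
+ *                                           -> [parent(c0,c1), c2]
+ * The newest CV always sits unmerged on top, so the root is never compressed
+ * before finalization. */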
+
+void
+blake3_update(blake3_ctx *ctx, const void *input, size_t input_len)
+{
+    /* Explicitly checking for zero avoids causing UB by passing a null pointer
+     * to memcpy. This comes up in practice with things like:
+     *   std::vector<u8> v;
+     *   blake3_update(&ctx, v.data(), v.size());
+     */
+    if (input_len == 0)
+        return;
+
+    const u8 *input_bytes = (const u8 *) input;
+
+    /* If we have some partial chunk bytes in the internal chunk_state, we need
+     * to finish that chunk first. */
+    if (chunk_state_len(&ctx->chunk) > 0) {
+        size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&ctx->chunk);
+        if (take > input_len)
+            take = input_len;
+        chunk_state_update(&ctx->chunk, input_bytes, take);
+        input_bytes += take;
+        input_len -= take;
+        /* If we've filled the current chunk and there's more coming, finalize
+         * this chunk and proceed. In this case we know it's not the root. */
+        if (input_len > 0) {
+            output_t output = chunk_state_output(&ctx->chunk);
+            u8 chunk_cv[32];
+            output_chaining_value(&output, chunk_cv);
+            hasher_push_cv(ctx, chunk_cv, ctx->chunk.chunk_counter);
+            chunk_state_reset(&ctx->chunk, ctx->key, ctx->chunk.chunk_counter + 1);
+        } else {
+            return;
+        }
+    }
+
+    /* Now the chunk_state is clear, and we have more input. If there's more than
+     * a single chunk (so, definitely not the root chunk), hash the largest whole
+     * subtree we can, with the full benefits of SIMD (and maybe in the future,
+     * multi-threading) parallelism. Two restrictions:
+     * - The subtree has to be a power-of-2 number of chunks. Only subtrees along
+     *   the right edge can be incomplete, and we don't know where the right edge
+     *   is going to be until we get to finalize().
+     * - The subtree must evenly divide the total number of chunks up until this
+     *   point (if total is not 0). If the current incomplete subtree is only
+     *   waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
+     *   to complete the current subtree first.
+     * Because we might need to break up the input to form powers of 2, or to
+     * evenly divide what we already have, this part runs in a loop.
+     */
+    while (input_len > BLAKE3_CHUNK_LEN) {
+        size_t subtree_len = round_down_to_power_of_2(input_len);
+        u64 count_so_far = ctx->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
+        /* Shrink the subtree_len until it evenly divides the count so far. We know
+         * that subtree_len itself is a power of 2, so we can use a bitmasking
+         * trick instead of an actual remainder operation. (Note that if the caller
+         * consistently passes power-of-2 inputs of the same size, as is hopefully
+         * typical, this loop condition will always fail, and subtree_len will
+         * always be the full length of the input.)
+         *
+         * An aside: We don't have to shrink subtree_len quite this much. For
+         * example, if count_so_far is 1, we could pass 2 chunks to
+         * compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
+         * get the right answer in the end, and we might get to use 2-way SIMD
+         * parallelism. The problem with this optimization is that it gets us
+         * stuck always hashing 2 chunks. The total number of chunks will remain
+         * odd, and we'll never graduate to higher degrees of parallelism. See
+         * https://github.com/BLAKE3-team/BLAKE3/issues/69.
+         */
+        while ((((u64)(subtree_len - 1)) & count_so_far) != 0)
+            subtree_len /= 2;
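+        /* For example, with count_so_far = 3 * 1024 and ample input,
+         * subtree_len shrinks 8192 -> 4096 -> 2048 -> 1024: after 3 chunks,
+         * only a single chunk may be hashed next, completing the pending
+         * 4-chunk subtree before larger subtrees are allowed again. */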
+        /* The shrunken subtree_len might now be 1 chunk long. If so, hash that
+         * one chunk by itself. Otherwise, compress the subtree into a pair of
+         * CVs.
+         */
+        u64 subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
+        if (subtree_len <= BLAKE3_CHUNK_LEN) {
+            blake3_chunk_state chunk_state;
+            chunk_state_init(&chunk_state, ctx->key, ctx->chunk.flags);
+            chunk_state.chunk_counter = ctx->chunk.chunk_counter;
+            chunk_state_update(&chunk_state, input_bytes, subtree_len);
+            output_t output = chunk_state_output(&chunk_state);
+            u8 cv[BLAKE3_OUT_LEN];
+            output_chaining_value(&output, cv);
+            hasher_push_cv(ctx, cv, chunk_state.chunk_counter);
+        } else {
+            /* This is the high-performance happy path, though getting here
+             * depends on the caller giving us a long enough input. */
+            u8 cv_pair[2 * BLAKE3_OUT_LEN];
+            compress_subtree_to_parent_node(input_bytes, subtree_len, ctx->key,
+                                            ctx->chunk.chunk_counter,
+                                            ctx->chunk.flags, cv_pair);
+            hasher_push_cv(ctx, cv_pair, ctx->chunk.chunk_counter);
+            hasher_push_cv(ctx, &cv_pair[BLAKE3_OUT_LEN],
+                           ctx->chunk.chunk_counter + (subtree_chunks / 2));
+        }
+        ctx->chunk.chunk_counter += subtree_chunks;
+        input_bytes += subtree_len;
+        input_len -= subtree_len;
+    }
+
+    /* If there's any remaining input less than a full chunk, add it to the chunk
+     * state. In that case, also do a final merge loop to make sure the subtree
+     * stack doesn't contain any unmerged pairs. The remaining input means we
+     * know these merges are non-root. This merge loop isn't strictly necessary
+     * here, because hasher_push_cv already does its own merge loop, but it
+     * simplifies blake3_final below.
+     */
+    if (input_len > 0) {
+        chunk_state_update(&ctx->chunk, input_bytes, input_len);
+        hasher_merge_cv_stack(ctx, ctx->chunk.chunk_counter);
+    }
+}
+
+void
+blake3_final(blake3_ctx *ctx, unsigned char * restrict out)
+{
+    /* If the subtree stack is empty, then the current chunk is the root. */
+    if (ctx->cv_stack_len == 0) {
+        output_t output = chunk_state_output(&ctx->chunk);
+        output_root_bytes(&output, out, 32);
+        return;
+    }
+
+    /* If there are any bytes in the chunk state, finalize that chunk and do a
+     * roll-up merge between that chunk hash and every subtree in the stack. In
+     * this case, the extra merge loop at the end of blake3_update
+     * guarantees that none of the subtrees in the stack need to be merged with
+     * each other first. Otherwise, if there are no bytes in the chunk state,
+     * then the top of the stack is a chunk hash, and we start the merge from
+     * that.
+     */
+    output_t output;
+    size_t cvs_remaining;
+    if (chunk_state_len(&ctx->chunk) > 0) {
+        cvs_remaining = ctx->cv_stack_len;
+        output = chunk_state_output(&ctx->chunk);
+    } else {
+        /* There are always at least 2 CVs in the stack in this case. */
+        cvs_remaining = ctx->cv_stack_len - 2;
+        output = parent_output(&ctx->cv_stack[cvs_remaining * 32], ctx->key,
+                               ctx->chunk.flags);
+    }
+    while (cvs_remaining > 0) {
+        --cvs_remaining;
+        u8 parent_block[BLAKE3_BLOCK_LEN];
+        memcpy(parent_block, &ctx->cv_stack[cvs_remaining * 32], 32);
+        output_chaining_value(&output, &parent_block[32]);
+        output = parent_output(parent_block, ctx->key, ctx->chunk.flags);
+    }
+    output_root_bytes(&output, out, 32);
+}
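+
+/* A minimal usage sketch of the three calls above (md receives the 32-byte
+ * digest):
+ *
+ *     blake3_ctx ctx;
+ *     unsigned char md[32];
+ *     blake3_init(&ctx);
+ *     blake3_update(&ctx, "abc", 3);
+ *     blake3_final(&ctx, md);
+ */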
diff --git a/src/blake3/blake3_portable.c b/src/blake3/blake3_portable.c
new file mode 100644
index 0000000..480c582
--- /dev/null
+++ b/src/blake3/blake3_portable.c
@@ -0,0 +1,169 @@
+#include "blake3.h"
+
+INLINE u32
+rotr32(u32 w, u32 c)
+{
+    return (w >> c) | (w << (32 - c));
+}
+
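+/* The BLAKE3 g function: one column/diagonal mixing step, two add-xor-rotate
+ * halves using the rotation constants 16, 12, 8, 7 inherited from BLAKE2s. */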
+INLINE void
+g(u32 *state, size_t a, size_t b, size_t c, size_t d, u32 x, u32 y)
+{
+    state[a] = state[a] + state[b] + x;
+    state[d] = rotr32(state[d] ^ state[a], 16);
+    state[c] = state[c] + state[d];
+    state[b] = rotr32(state[b] ^ state[c], 12);
+    state[a] = state[a] + state[b] + y;
+    state[d] = rotr32(state[d] ^ state[a], 8);
+    state[c] = state[c] + state[d];
+    state[b] = rotr32(state[b] ^ state[c], 7);
+}
+
+INLINE void
+round_fn(u32 state[16], const u32 *msg, size_t round)
+{
+    /* Select the message schedule based on the round. */
+    const u8 *schedule = MSG_SCHEDULE[round];
+
+    /* Mix the columns. */
+    g(state,  0,  4,  8, 12, msg[schedule[ 0]], msg[schedule[ 1]]);
+    g(state,  1,  5,  9, 13, msg[schedule[ 2]], msg[schedule[ 3]]);
+    g(state,  2,  6, 10, 14, msg[schedule[ 4]], msg[schedule[ 5]]);
+    g(state,  3,  7, 11, 15, msg[schedule[ 6]], msg[schedule[ 7]]);
+
+    /* Mix the rows. */
+    g(state,  0,  5, 10, 15, msg[schedule[ 8]], msg[schedule[ 9]]);
+    g(state,  1,  6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
+    g(state,  2,  7,  8, 13, msg[schedule[12]], msg[schedule[13]]);
+    g(state,  3,  4,  9, 14, msg[schedule[14]], msg[schedule[15]]);
+}
+
+INLINE void
+compress_pre(u32 state[16], const u32 cv[8], const u8 block[BLAKE3_BLOCK_LEN],
+             u8 block_len, u64 counter, u8 flags)
+{
+    u32 block_words[16];
+    block_words[ 0] = load32(block + 4 *  0);
+    block_words[ 1] = load32(block + 4 *  1);
+    block_words[ 2] = load32(block + 4 *  2);
+    block_words[ 3] = load32(block + 4 *  3);
+    block_words[ 4] = load32(block + 4 *  4);
+    block_words[ 5] = load32(block + 4 *  5);
+    block_words[ 6] = load32(block + 4 *  6);
+    block_words[ 7] = load32(block + 4 *  7);
+    block_words[ 8] = load32(block + 4 *  8);
+    block_words[ 9] = load32(block + 4 *  9);
+    block_words[10] = load32(block + 4 * 10);
+    block_words[11] = load32(block + 4 * 11);
+    block_words[12] = load32(block + 4 * 12);
+    block_words[13] = load32(block + 4 * 13);
+    block_words[14] = load32(block + 4 * 14);
+    block_words[15] = load32(block + 4 * 15);
+
+    state[ 0] = cv[0];
+    state[ 1] = cv[1];
+    state[ 2] = cv[2];
+    state[ 3] = cv[3];
+    state[ 4] = cv[4];
+    state[ 5] = cv[5];
+    state[ 6] = cv[6];
+    state[ 7] = cv[7];
+    state[ 8] = IV[0];
+    state[ 9] = IV[1];
+    state[10] = IV[2];
+    state[11] = IV[3];
+    state[12] = counter_low(counter);
+    state[13] = counter_high(counter);
+    state[14] = (u32) block_len;
+    state[15] = (u32) flags;
+
+    round_fn(state, &block_words[0], 0);
+    round_fn(state, &block_words[0], 1);
+    round_fn(state, &block_words[0], 2);
+    round_fn(state, &block_words[0], 3);
+    round_fn(state, &block_words[0], 4);
+    round_fn(state, &block_words[0], 5);
+    round_fn(state, &block_words[0], 6);
+}
+
+void
+blake3_compress_in_place_portable(u32 cv[8],
+                                  const u8 block[BLAKE3_BLOCK_LEN],
+                                  u8 block_len, u64 counter, u8 flags)
+{
+    u32 state[16];
+    compress_pre(state, cv, block, block_len, counter, flags);
+    cv[0] = state[0] ^ state[ 8];
+    cv[1] = state[1] ^ state[ 9];
+    cv[2] = state[2] ^ state[10];
+    cv[3] = state[3] ^ state[11];
+    cv[4] = state[4] ^ state[12];
+    cv[5] = state[5] ^ state[13];
+    cv[6] = state[6] ^ state[14];
+    cv[7] = state[7] ^ state[15];
+}
+
+void
+blake3_compress_xof_portable(const u32 cv[8],
+                             const u8 block[BLAKE3_BLOCK_LEN],
+                             u8 block_len, u64 counter,
+                             u8 flags, u8 out[64])
+{
+    u32 state[16];
+    compress_pre(state, cv, block, block_len, counter, flags);
+
+    store32(&out[ 0 * 4], state[ 0] ^ state[ 8]);
+    store32(&out[ 1 * 4], state[ 1] ^ state[ 9]);
+    store32(&out[ 2 * 4], state[ 2] ^ state[10]);
+    store32(&out[ 3 * 4], state[ 3] ^ state[11]);
+    store32(&out[ 4 * 4], state[ 4] ^ state[12]);
+    store32(&out[ 5 * 4], state[ 5] ^ state[13]);
+    store32(&out[ 6 * 4], state[ 6] ^ state[14]);
+    store32(&out[ 7 * 4], state[ 7] ^ state[15]);
+    store32(&out[ 8 * 4], state[ 8] ^ cv[0]);
+    store32(&out[ 9 * 4], state[ 9] ^ cv[1]);
+    store32(&out[10 * 4], state[10] ^ cv[2]);
+    store32(&out[11 * 4], state[11] ^ cv[3]);
+    store32(&out[12 * 4], state[12] ^ cv[4]);
+    store32(&out[13 * 4], state[13] ^ cv[5]);
+    store32(&out[14 * 4], state[14] ^ cv[6]);
+    store32(&out[15 * 4], state[15] ^ cv[7]);
+}
+
+INLINE void
+hash_one_portable(const u8 *input, size_t blocks,
+                  const u32 key[8], u64 counter,
+                  u8 flags, u8 flags_start,
+                  u8 flags_end, u8 out[BLAKE3_OUT_LEN])
+{
+    u32 cv[8];
+    memcpy(cv, key, BLAKE3_KEY_LEN);
+    u8 block_flags = flags | flags_start;
+    while (blocks > 0) {
+        if (blocks == 1)
+            block_flags |= flags_end;
+        blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                          block_flags);
+        input = &input[BLAKE3_BLOCK_LEN];
+        blocks -= 1;
+        block_flags = flags;
+    }
+    store_cv_words(out, cv);
+}
+
+void blake3_hash_many_portable(const u8 *const *inputs, size_t num_inputs,
+                               size_t blocks, const u32 key[8],
+                               u64 counter, char increment_counter,
+                               u8 flags, u8 flags_start,
+                               u8 flags_end, u8 *out)
+{
+    while (num_inputs > 0) {
+        hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
+                          flags_end, out);
+        if (increment_counter)
+            ++counter;
+        ++inputs;
+        --num_inputs;
+        out = &out[BLAKE3_OUT_LEN];
+    }
+}
diff --git a/src/blake3/blake3_sse2.c b/src/blake3/blake3_sse2.c
new file mode 100644
index 0000000..75cd5a3
--- /dev/null
+++ b/src/blake3/blake3_sse2.c
@@ -0,0 +1,566 @@
+#include "blake3.h"
+
+#include <immintrin.h>
+
+#define DEGREE 4
+
+#define _mm_shuffle_ps2(a, b, c)                                               \
+  (_mm_castps_si128(                                                           \
+      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
+
+INLINE __m128i loadu(const u8 src[16]) {
+  return _mm_loadu_si128((const __m128i *)src);
+}
+
+INLINE void storeu(__m128i src, u8 dest[16]) {
+  _mm_storeu_si128((__m128i *)dest, src);
+}
+
+INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
+
+// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
+
+INLINE __m128i set1(u32 x) { return _mm_set1_epi32((int32_t)x); }
+
+INLINE __m128i set4(u32 a, u32 b, u32 c, u32 d) {
+  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
+}
+
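+// Rotating each 32-bit lane by 16 is the same as swapping its 16-bit halves,
+// which SSE2 can do with word shuffles instead of two shifts and an OR.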
+INLINE __m128i rot16(__m128i x) {
+  return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
+}
+
+INLINE __m128i rot12(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
+}
+
+INLINE __m128i rot8(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
+}
+
+INLINE __m128i rot7(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
+}
+
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = xorv(*row3, *row0);
+  *row3 = rot16(*row3);
+  *row2 = addv(*row2, *row3);
+  *row1 = xorv(*row1, *row2);
+  *row1 = rot12(*row1);
+}
+
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = xorv(*row3, *row0);
+  *row3 = rot8(*row3);
+  *row2 = addv(*row2, *row3);
+  *row1 = xorv(*row1, *row2);
+  *row1 = rot7(*row1);
+}
+
+// Note the optimization here of leaving row1 as the unrotated row, rather than
+// row0. All the message loads below are adjusted to compensate for this. See
+// discussion at https://github.com/sneves/blake2-avx2/pull/4
+INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
+}
+
+INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
+}
+
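+// SSE2 stand-in for the SSE4.1 _mm_blend_epi16: expand the imm8 bits into a
+// per-16-bit-lane mask, then pick b where the mask is set and a elsewhere.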
+INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
+  const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+  __m128i mask = _mm_set1_epi16(imm8);
+  mask = _mm_and_si128(mask, bits);
+  mask = _mm_cmpeq_epi16(mask, bits);
+  return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
+}
+
+INLINE void compress_pre(__m128i rows[4], const u32 cv[8],
+                         const u8 block[BLAKE3_BLOCK_LEN],
+                         u8 block_len, u64 counter, u8 flags) {
+  rows[0] = loadu((u8 *)&cv[0]);
+  rows[1] = loadu((u8 *)&cv[4]);
+  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
+  rows[3] = set4(counter_low(counter), counter_high(counter),
+                 (u32)block_len, (u32)flags);
+
+  __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
+  __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
+  __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
+  __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
+
+  __m128i t0, t1, t2, t3, tt;
+
+  // Round 1. The first round permutes the message words from the original
+  // input order, into the groups that get mixed in parallel.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
+  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
+  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 2. This round and all following rounds apply a fixed permutation
+  // to the message words from the round before.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 3
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 4
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 5
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 6
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 7
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+}
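
> Editor's note: as the comments above say, round 1 reads the message words in input order and rounds 2-7 each apply one fixed permutation to the previous round's words; the shuffle/blend sequences compute that permutation in-register. A scalar sketch (not part of this commit) using the permutation table from the BLAKE3 reference implementation:

```c
#include <stdint.h>
#include <string.h>

static const uint8_t MSG_PERMUTATION[16] = {
  2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8,
};

/* Derive the next round's message words from the current ones. */
static void permute_msg(uint32_t m[16]) {
  uint32_t next[16];
  for (int i = 0; i < 16; ++i)
    next[i] = m[MSG_PERMUTATION[i]];
  memcpy(m, next, sizeof next);
}
```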
+
+void blake3_compress_in_place_sse2(u32 cv[8],
+                                   const u8 block[BLAKE3_BLOCK_LEN],
+                                   u8 block_len, u64 counter,
+                                   u8 flags) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu(xorv(rows[0], rows[2]), (u8 *)&cv[0]);
+  storeu(xorv(rows[1], rows[3]), (u8 *)&cv[4]);
+}
+
+void blake3_compress_xof_sse2(const u32 cv[8],
+                              const u8 block[BLAKE3_BLOCK_LEN],
+                              u8 block_len, u64 counter,
+                              u8 flags, u8 out[64]) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu(xorv(rows[0], rows[2]), &out[0]);
+  storeu(xorv(rows[1], rows[3]), &out[16]);
+  storeu(xorv(rows[2], loadu((u8 *)&cv[0])), &out[32]);
+  storeu(xorv(rows[3], loadu((u8 *)&cv[4])), &out[48]);
+}
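
> Editor's note: both entry points run the same seven rounds via compress_pre() and differ only in the feed-forward. The in-place variant keeps the truncated 8-word chaining value (rows 0/1 XOR rows 2/3); the XOF variant additionally emits a second 32 bytes by XORing rows 2/3 with the input chaining value. A scalar sketch (not part of this commit), with the state as sixteen words v[]:

```c
#include <stdint.h>

static void feed_forward(const uint32_t v[16], const uint32_t cv[8],
                         uint32_t out_cv[8], uint32_t out_xof[16]) {
  for (int i = 0; i < 8; ++i) {
    out_cv[i] = v[i] ^ v[i + 8];          /* blake3_compress_in_place_*() result */
    if (out_xof) {
      out_xof[i]     = v[i] ^ v[i + 8];   /* first 32 bytes of the XOF block */
      out_xof[i + 8] = v[i + 8] ^ cv[i];  /* second 32 bytes */
    }
  }
}
```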
+
+INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[15] = rot16(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot12(v[4]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[15] = rot8(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot7(v[4]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot16(v[15]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[4] = rot12(v[4]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot8(v[15]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+  v[4] = rot7(v[4]);
+}
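
> Editor's note: round_fn() is the ordinary BLAKE3 quarter-round applied four-wide, with each __m128i holding the same state word from four independent inputs, so the rotation amounts 16/12/8/7 are the usual scalar ones. A scalar reference sketch (not part of this commit) of the quarter-round being vectorized:

```c
#include <stdint.h>

static uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

/* One BLAKE3 quarter-round ("G") on state words a,b,c,d with two
 * message words mx, my. */
static void g(uint32_t v[16], int a, int b, int c, int d,
              uint32_t mx, uint32_t my) {
  v[a] = v[a] + v[b] + mx;  v[d] = rotr32(v[d] ^ v[a], 16);
  v[c] = v[c] + v[d];       v[b] = rotr32(v[b] ^ v[c], 12);
  v[a] = v[a] + v[b] + my;  v[d] = rotr32(v[d] ^ v[a], 8);
  v[c] = v[c] + v[d];       v[b] = rotr32(v[b] ^ v[c], 7);
}
```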
+
+INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
+  // 22/33. Note that this doesn't split the vector into two lanes, as the
+  // AVX2 counterparts do.
+  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+
+  // Interleave 64-bit lanes.
+  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
+  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
+  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
+  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
+
+  vecs[0] = abcd_0;
+  vecs[1] = abcd_1;
+  vecs[2] = abcd_2;
+  vecs[3] = abcd_3;
+}
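
> Editor's note: the 32-bit then 64-bit unpack pair above turns four row vectors into four column vectors. Spelled out on a plain 4x4 matrix (sketch, not part of this commit), the effect is simply element [r][c] moving to [c][r]:

```c
#include <stdint.h>

static void transpose4x4(uint32_t m[4][4]) {
  for (int r = 0; r < 4; ++r)
    for (int c = r + 1; c < 4; ++c) {
      uint32_t t = m[r][c];
      m[r][c] = m[c][r];
      m[c][r] = t;
    }
}
```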
+
+INLINE void transpose_msg_vecs(const u8 *const *inputs,
+                               size_t block_offset, __m128i out[16]) {
+  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
+  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
+  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
+  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
+  out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
+  out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
+  out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
+  out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
+  out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
+  out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
+  out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
+  out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
+  out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
+  out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
+  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
+  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
+  for (size_t i = 0; i < 4; ++i) {
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
+  }
+  transpose_vecs(&out[0]);
+  transpose_vecs(&out[4]);
+  transpose_vecs(&out[8]);
+  transpose_vecs(&out[12]);
+}
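
> Editor's note: after the transposes, lane j of out[i] holds message word i of inputs[j], i.e. the block is converted to structure-of-arrays so one __m128i operation advances all four hashes. A scalar model of that layout (sketch, not part of this commit; assumes a little-endian target, matching BLAKE3's word order):

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Word i of input j lands in out[i][j]. */
static void transpose_msg_words(const uint8_t *const inputs[4],
                                size_t block_offset, uint32_t out[16][4]) {
  for (int i = 0; i < 16; ++i)
    for (int j = 0; j < 4; ++j)
      memcpy(&out[i][j], &inputs[j][block_offset + 4 * (size_t)i], 4);
}
```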
+
+INLINE void load_counters(u64 counter, char increment_counter,
+                          __m128i *out_lo, __m128i *out_hi) {
+  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
+  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
+  const __m128i add1 = _mm_and_si128(mask, add0);
+  __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
+  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), 
+                                  _mm_xor_si128(   l, _mm_set1_epi32(0x80000000)));
+  __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
+  *out_lo = l;
+  *out_hi = h;
+}
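
> Editor's note: SSE2 has no unsigned 32-bit compare, so load_counters() detects the carry out of the low-word addition by flipping the sign bit of both operands and using the signed _mm_cmpgt_epi32; the all-ones compare result (-1) is then subtracted from the high words. A scalar model of the trick (sketch, not part of this commit):

```c
#include <stdint.h>

/* Biasing both operands by 0x80000000 turns an unsigned > into a signed >. */
static int unsigned_gt_via_signed(uint32_t a, uint32_t b) {
  return (int32_t)(a ^ 0x80000000u) > (int32_t)(b ^ 0x80000000u);
}

/* lo = counter_lo + add wrapped around iff add > lo (unsigned). */
static uint32_t counter_hi_with_carry(uint32_t hi, uint32_t lo, uint32_t add) {
  return hi + (uint32_t)unsigned_gt_via_signed(add, lo);
}
```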
+
+static
+void blake3_hash4_sse2(const u8 *const *inputs, size_t blocks,
+                       const u32 key[8], u64 counter,
+                       char increment_counter, u8 flags,
+                       u8 flags_start, u8 flags_end, u8 *out) {
+  __m128i h_vecs[8] = {
+      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
+      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
+  };
+  __m128i counter_low_vec, counter_high_vec;
+  load_counters(counter, increment_counter, &counter_low_vec,
+                &counter_high_vec);
+  u8 block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
+    __m128i block_flags_vec = set1(block_flags);
+    __m128i msg_vecs[16];
+    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m128i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
+        set1(IV[0]),     set1(IV[1]),      set1(IV[2]),   set1(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+    };
+    round_fn(v, msg_vecs, 0);
+    round_fn(v, msg_vecs, 1);
+    round_fn(v, msg_vecs, 2);
+    round_fn(v, msg_vecs, 3);
+    round_fn(v, msg_vecs, 4);
+    round_fn(v, msg_vecs, 5);
+    round_fn(v, msg_vecs, 6);
+    h_vecs[0] = xorv(v[0], v[8]);
+    h_vecs[1] = xorv(v[1], v[9]);
+    h_vecs[2] = xorv(v[2], v[10]);
+    h_vecs[3] = xorv(v[3], v[11]);
+    h_vecs[4] = xorv(v[4], v[12]);
+    h_vecs[5] = xorv(v[5], v[13]);
+    h_vecs[6] = xorv(v[6], v[14]);
+    h_vecs[7] = xorv(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs(&h_vecs[0]);
+  transpose_vecs(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
+  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
+  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
+  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
+  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
+  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
+  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
+  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
+}
+
+INLINE void hash_one_sse2(const u8 *input, size_t blocks,
+                          const u32 key[8], u64 counter,
+                          u8 flags, u8 flags_start,
+                          u8 flags_end, u8 out[BLAKE3_OUT_LEN]) {
+  u32 cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  u8 block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                  block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
+void blake3_hash_many_sse2(const u8 *const *inputs, size_t num_inputs,
+                           size_t blocks, const u32 key[8],
+                           u64 counter, char increment_counter,
+                           u8 flags, u8 flags_start,
+                           u8 flags_end, u8 *out) {
+  while (num_inputs >= DEGREE) {
+    blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += DEGREE;
+    }
+    inputs += DEGREE;
+    num_inputs -= DEGREE;
+    out = &out[DEGREE * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start,
+                  flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
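
> Editor's note: blake3_hash_many_sse2() feeds inputs through the transposed 4-way kernel in groups of DEGREE and finishes any remainder one input at a time; none of this is visible to callers, who only see the functions this commit documents in doc/blake3.3.md. A caller-side sketch (hypothetical program, not part of this commit) using the one-shot helper:

```c
#include <stdio.h>
#include <limb/blake3.h>

int main(void) {
  static const char msg[] = "hello, limb";
  unsigned char md[32];
  blake3(msg, sizeof msg - 1, md);  /* init + update + final in one call */
  for (int i = 0; i < 32; ++i)
    printf("%02x", md[i]);
  putchar('\n');
  return 0;
}
```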
diff --git a/src/blake3/blake3_sse2_x86-64_unix.S b/src/blake3/blake3_sse2_x86-64_unix.S
new file mode 100644
index 0000000..99f033f
--- /dev/null
+++ b/src/blake3/blake3_sse2_x86-64_unix.S
@@ -0,0 +1,2291 @@
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global blake3_hash_many_sse2
+.global _blake3_hash_many_sse2
+.global blake3_compress_in_place_sse2
+.global _blake3_compress_in_place_sse2
+.global blake3_compress_xof_sse2
+.global _blake3_compress_xof_sse2
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
+        .p2align  6
+_blake3_hash_many_sse2:
+blake3_hash_many_sse2:
+        _CET_ENDBR
+        push    r15
+        push    r14
+        push    r13
+        push    r12
+        push    rbx
+        push    rbp
+        mov     rbp, rsp
+        sub     rsp, 360
+        and     rsp, 0xFFFFFFFFFFFFFFC0
+        neg     r9d
+        movd    xmm0, r9d
+        pshufd  xmm0, xmm0, 0x00
+        movdqa  xmmword ptr [rsp+0x130], xmm0
+        movdqa  xmm1, xmm0
+        pand    xmm1, xmmword ptr [ADD0+rip]
+        pand    xmm0, xmmword ptr [ADD1+rip]
+        movdqa  xmmword ptr [rsp+0x150], xmm0
+        movd    xmm0, r8d
+        pshufd  xmm0, xmm0, 0x00
+        paddd   xmm0, xmm1
+        movdqa  xmmword ptr [rsp+0x110], xmm0
+        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+        pcmpgtd xmm1, xmm0
+        shr     r8, 32
+        movd    xmm2, r8d
+        pshufd  xmm2, xmm2, 0x00
+        psubd   xmm2, xmm1
+        movdqa  xmmword ptr [rsp+0x120], xmm2
+        mov     rbx, qword ptr [rbp+0x50]
+        mov     r15, rdx
+        shl     r15, 6
+        movzx   r13d, byte ptr [rbp+0x38]
+        movzx   r12d, byte ptr [rbp+0x48]
+        cmp     rsi, 4
+        jc      3f
+2:
+        movdqu  xmm3, xmmword ptr [rcx]
+        pshufd  xmm0, xmm3, 0x00
+        pshufd  xmm1, xmm3, 0x55
+        pshufd  xmm2, xmm3, 0xAA
+        pshufd  xmm3, xmm3, 0xFF
+        movdqu  xmm7, xmmword ptr [rcx+0x10]
+        pshufd  xmm4, xmm7, 0x00
+        pshufd  xmm5, xmm7, 0x55
+        pshufd  xmm6, xmm7, 0xAA
+        pshufd  xmm7, xmm7, 0xFF
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        mov     r10, qword ptr [rdi+0x10]
+        mov     r11, qword ptr [rdi+0x18]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
+9:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
+        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
+        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
+        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
+        movdqa  xmm12, xmm8
+        punpckldq xmm8, xmm9
+        punpckhdq xmm12, xmm9
+        movdqa  xmm14, xmm10
+        punpckldq xmm10, xmm11
+        punpckhdq xmm14, xmm11
+        movdqa  xmm9, xmm8
+        punpcklqdq xmm8, xmm10
+        punpckhqdq xmm9, xmm10
+        movdqa  xmm13, xmm12
+        punpcklqdq xmm12, xmm14
+        punpckhqdq xmm13, xmm14
+        movdqa  xmmword ptr [rsp], xmm8
+        movdqa  xmmword ptr [rsp+0x10], xmm9
+        movdqa  xmmword ptr [rsp+0x20], xmm12
+        movdqa  xmmword ptr [rsp+0x30], xmm13
+        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
+        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
+        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
+        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
+        movdqa  xmm12, xmm8
+        punpckldq xmm8, xmm9
+        punpckhdq xmm12, xmm9
+        movdqa  xmm14, xmm10
+        punpckldq xmm10, xmm11
+        punpckhdq xmm14, xmm11
+        movdqa  xmm9, xmm8
+        punpcklqdq xmm8, xmm10
+        punpckhqdq xmm9, xmm10
+        movdqa  xmm13, xmm12
+        punpcklqdq xmm12, xmm14
+        punpckhqdq xmm13, xmm14
+        movdqa  xmmword ptr [rsp+0x40], xmm8
+        movdqa  xmmword ptr [rsp+0x50], xmm9
+        movdqa  xmmword ptr [rsp+0x60], xmm12
+        movdqa  xmmword ptr [rsp+0x70], xmm13
+        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
+        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
+        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
+        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
+        movdqa  xmm12, xmm8
+        punpckldq xmm8, xmm9
+        punpckhdq xmm12, xmm9
+        movdqa  xmm14, xmm10
+        punpckldq xmm10, xmm11
+        punpckhdq xmm14, xmm11
+        movdqa  xmm9, xmm8
+        punpcklqdq xmm8, xmm10
+        punpckhqdq xmm9, xmm10
+        movdqa  xmm13, xmm12
+        punpcklqdq xmm12, xmm14
+        punpckhqdq xmm13, xmm14
+        movdqa  xmmword ptr [rsp+0x80], xmm8
+        movdqa  xmmword ptr [rsp+0x90], xmm9
+        movdqa  xmmword ptr [rsp+0xA0], xmm12
+        movdqa  xmmword ptr [rsp+0xB0], xmm13
+        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
+        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
+        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
+        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
+        movdqa  xmm12, xmm8
+        punpckldq xmm8, xmm9
+        punpckhdq xmm12, xmm9
+        movdqa  xmm14, xmm10
+        punpckldq xmm10, xmm11
+        punpckhdq xmm14, xmm11
+        movdqa  xmm9, xmm8
+        punpcklqdq xmm8, xmm10
+        punpckhqdq xmm9, xmm10
+        movdqa  xmm13, xmm12
+        punpcklqdq xmm12, xmm14
+        punpckhqdq xmm13, xmm14
+        movdqa  xmmword ptr [rsp+0xC0], xmm8
+        movdqa  xmmword ptr [rsp+0xD0], xmm9
+        movdqa  xmmword ptr [rsp+0xE0], xmm12
+        movdqa  xmmword ptr [rsp+0xF0], xmm13
+        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+        movdqa  xmm12, xmmword ptr [rsp+0x110]
+        movdqa  xmm13, xmmword ptr [rsp+0x120]
+        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+        movd    xmm15, eax
+        pshufd  xmm15, xmm15, 0x00
+        prefetcht0 [r8+rdx+0x80]
+        prefetcht0 [r9+rdx+0x80]
+        prefetcht0 [r10+rdx+0x80]
+        prefetcht0 [r11+rdx+0x80]
+        paddd   xmm0, xmmword ptr [rsp]
+        paddd   xmm1, xmmword ptr [rsp+0x20]
+        paddd   xmm2, xmmword ptr [rsp+0x40]
+        paddd   xmm3, xmmword ptr [rsp+0x60]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x10]
+        paddd   xmm1, xmmword ptr [rsp+0x30]
+        paddd   xmm2, xmmword ptr [rsp+0x50]
+        paddd   xmm3, xmmword ptr [rsp+0x70]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x80]
+        paddd   xmm1, xmmword ptr [rsp+0xA0]
+        paddd   xmm2, xmmword ptr [rsp+0xC0]
+        paddd   xmm3, xmmword ptr [rsp+0xE0]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x90]
+        paddd   xmm1, xmmword ptr [rsp+0xB0]
+        paddd   xmm2, xmmword ptr [rsp+0xD0]
+        paddd   xmm3, xmmword ptr [rsp+0xF0]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x20]
+        paddd   xmm1, xmmword ptr [rsp+0x30]
+        paddd   xmm2, xmmword ptr [rsp+0x70]
+        paddd   xmm3, xmmword ptr [rsp+0x40]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x60]
+        paddd   xmm1, xmmword ptr [rsp+0xA0]
+        paddd   xmm2, xmmword ptr [rsp]
+        paddd   xmm3, xmmword ptr [rsp+0xD0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x10]
+        paddd   xmm1, xmmword ptr [rsp+0xC0]
+        paddd   xmm2, xmmword ptr [rsp+0x90]
+        paddd   xmm3, xmmword ptr [rsp+0xF0]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xB0]
+        paddd   xmm1, xmmword ptr [rsp+0x50]
+        paddd   xmm2, xmmword ptr [rsp+0xE0]
+        paddd   xmm3, xmmword ptr [rsp+0x80]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x30]
+        paddd   xmm1, xmmword ptr [rsp+0xA0]
+        paddd   xmm2, xmmword ptr [rsp+0xD0]
+        paddd   xmm3, xmmword ptr [rsp+0x70]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x40]
+        paddd   xmm1, xmmword ptr [rsp+0xC0]
+        paddd   xmm2, xmmword ptr [rsp+0x20]
+        paddd   xmm3, xmmword ptr [rsp+0xE0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x60]
+        paddd   xmm1, xmmword ptr [rsp+0x90]
+        paddd   xmm2, xmmword ptr [rsp+0xB0]
+        paddd   xmm3, xmmword ptr [rsp+0x80]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x50]
+        paddd   xmm1, xmmword ptr [rsp]
+        paddd   xmm2, xmmword ptr [rsp+0xF0]
+        paddd   xmm3, xmmword ptr [rsp+0x10]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xA0]
+        paddd   xmm1, xmmword ptr [rsp+0xC0]
+        paddd   xmm2, xmmword ptr [rsp+0xE0]
+        paddd   xmm3, xmmword ptr [rsp+0xD0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x70]
+        paddd   xmm1, xmmword ptr [rsp+0x90]
+        paddd   xmm2, xmmword ptr [rsp+0x30]
+        paddd   xmm3, xmmword ptr [rsp+0xF0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x40]
+        paddd   xmm1, xmmword ptr [rsp+0xB0]
+        paddd   xmm2, xmmword ptr [rsp+0x50]
+        paddd   xmm3, xmmword ptr [rsp+0x10]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp]
+        paddd   xmm1, xmmword ptr [rsp+0x20]
+        paddd   xmm2, xmmword ptr [rsp+0x80]
+        paddd   xmm3, xmmword ptr [rsp+0x60]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xC0]
+        paddd   xmm1, xmmword ptr [rsp+0x90]
+        paddd   xmm2, xmmword ptr [rsp+0xF0]
+        paddd   xmm3, xmmword ptr [rsp+0xE0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xD0]
+        paddd   xmm1, xmmword ptr [rsp+0xB0]
+        paddd   xmm2, xmmword ptr [rsp+0xA0]
+        paddd   xmm3, xmmword ptr [rsp+0x80]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x70]
+        paddd   xmm1, xmmword ptr [rsp+0x50]
+        paddd   xmm2, xmmword ptr [rsp]
+        paddd   xmm3, xmmword ptr [rsp+0x60]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x20]
+        paddd   xmm1, xmmword ptr [rsp+0x30]
+        paddd   xmm2, xmmword ptr [rsp+0x10]
+        paddd   xmm3, xmmword ptr [rsp+0x40]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x90]
+        paddd   xmm1, xmmword ptr [rsp+0xB0]
+        paddd   xmm2, xmmword ptr [rsp+0x80]
+        paddd   xmm3, xmmword ptr [rsp+0xF0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xE0]
+        paddd   xmm1, xmmword ptr [rsp+0x50]
+        paddd   xmm2, xmmword ptr [rsp+0xC0]
+        paddd   xmm3, xmmword ptr [rsp+0x10]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xD0]
+        paddd   xmm1, xmmword ptr [rsp]
+        paddd   xmm2, xmmword ptr [rsp+0x20]
+        paddd   xmm3, xmmword ptr [rsp+0x40]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x30]
+        paddd   xmm1, xmmword ptr [rsp+0xA0]
+        paddd   xmm2, xmmword ptr [rsp+0x60]
+        paddd   xmm3, xmmword ptr [rsp+0x70]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xB0]
+        paddd   xmm1, xmmword ptr [rsp+0x50]
+        paddd   xmm2, xmmword ptr [rsp+0x10]
+        paddd   xmm3, xmmword ptr [rsp+0x80]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xF0]
+        paddd   xmm1, xmmword ptr [rsp]
+        paddd   xmm2, xmmword ptr [rsp+0x90]
+        paddd   xmm3, xmmword ptr [rsp+0x60]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xE0]
+        paddd   xmm1, xmmword ptr [rsp+0x20]
+        paddd   xmm2, xmmword ptr [rsp+0x30]
+        paddd   xmm3, xmmword ptr [rsp+0x70]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        pshuflw xmm15, xmm15, 0xB1
+        pshufhw xmm15, xmm15, 0xB1
+        pshuflw xmm12, xmm12, 0xB1
+        pshufhw xmm12, xmm12, 0xB1
+        pshuflw xmm13, xmm13, 0xB1
+        pshufhw xmm13, xmm13, 0xB1
+        pshuflw xmm14, xmm14, 0xB1
+        pshufhw xmm14, xmm14, 0xB1
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xA0]
+        paddd   xmm1, xmmword ptr [rsp+0xC0]
+        paddd   xmm2, xmmword ptr [rsp+0x40]
+        paddd   xmm3, xmmword ptr [rsp+0xD0]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmm15
+        psrld   xmm15, 8
+        pslld   xmm8, 24
+        pxor    xmm15, xmm8
+        movdqa  xmm8, xmm12
+        psrld   xmm12, 8
+        pslld   xmm8, 24
+        pxor    xmm12, xmm8
+        movdqa  xmm8, xmm13
+        psrld   xmm13, 8
+        pslld   xmm8, 24
+        pxor    xmm13, xmm8
+        movdqa  xmm8, xmm14
+        psrld   xmm14, 8
+        pslld   xmm8, 24
+        pxor    xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        pxor    xmm0, xmm8
+        pxor    xmm1, xmm9
+        pxor    xmm2, xmm10
+        pxor    xmm3, xmm11
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        pxor    xmm4, xmm12
+        pxor    xmm5, xmm13
+        pxor    xmm6, xmm14
+        pxor    xmm7, xmm15
+        mov     eax, r13d
+        jne     9b
+        movdqa  xmm9, xmm0
+        punpckldq xmm0, xmm1
+        punpckhdq xmm9, xmm1
+        movdqa  xmm11, xmm2
+        punpckldq xmm2, xmm3
+        punpckhdq xmm11, xmm3
+        movdqa  xmm1, xmm0
+        punpcklqdq xmm0, xmm2
+        punpckhqdq xmm1, xmm2
+        movdqa  xmm3, xmm9
+        punpcklqdq xmm9, xmm11
+        punpckhqdq xmm3, xmm11
+        movdqu  xmmword ptr [rbx], xmm0
+        movdqu  xmmword ptr [rbx+0x20], xmm1
+        movdqu  xmmword ptr [rbx+0x40], xmm9
+        movdqu  xmmword ptr [rbx+0x60], xmm3
+        movdqa  xmm9, xmm4
+        punpckldq xmm4, xmm5
+        punpckhdq xmm9, xmm5
+        movdqa  xmm11, xmm6
+        punpckldq xmm6, xmm7
+        punpckhdq xmm11, xmm7
+        movdqa  xmm5, xmm4
+        punpcklqdq xmm4, xmm6
+        punpckhqdq xmm5, xmm6
+        movdqa  xmm7, xmm9
+        punpcklqdq xmm9, xmm11
+        punpckhqdq xmm7, xmm11
+        movdqu  xmmword ptr [rbx+0x10], xmm4
+        movdqu  xmmword ptr [rbx+0x30], xmm5
+        movdqu  xmmword ptr [rbx+0x50], xmm9
+        movdqu  xmmword ptr [rbx+0x70], xmm7
+        movdqa  xmm1, xmmword ptr [rsp+0x110]
+        movdqa  xmm0, xmm1
+        paddd   xmm1, xmmword ptr [rsp+0x150]
+        movdqa  xmmword ptr [rsp+0x110], xmm1
+        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+        pcmpgtd xmm0, xmm1
+        movdqa  xmm1, xmmword ptr [rsp+0x120]
+        psubd   xmm1, xmm0
+        movdqa  xmmword ptr [rsp+0x120], xmm1
+        add     rbx, 128
+        add     rdi, 32
+        sub     rsi, 4
+        cmp     rsi, 4
+        jnc     2b
+        test    rsi, rsi
+        jnz     3f
+4:
+        mov     rsp, rbp
+        pop     rbp
+        pop     rbx
+        pop     r12
+        pop     r13
+        pop     r14
+        pop     r15
+        ret
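+# Fewer than four inputs remain: the block below hashes two inputs at a time,
+# then falls through to the single-input path.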
+.p2align 5
+3:
+        test    esi, 0x2
+        je      3f
+        movups  xmm0, xmmword ptr [rcx]
+        movups  xmm1, xmmword ptr [rcx+0x10]
+        movaps  xmm8, xmm0
+        movaps  xmm9, xmm1
+        movd    xmm13, dword ptr [rsp+0x110]
+        movd    xmm14, dword ptr [rsp+0x120]
+        punpckldq xmm13, xmm14
+        movaps  xmmword ptr [rsp], xmm13
+        movd    xmm14, dword ptr [rsp+0x114]
+        movd    xmm13, dword ptr [rsp+0x124]
+        punpckldq xmm14, xmm13
+        movaps  xmmword ptr [rsp+0x10], xmm14
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
+2:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
+        movaps  xmm10, xmm2
+        movups  xmm4, xmmword ptr [r8+rdx-0x40]
+        movups  xmm5, xmmword ptr [r8+rdx-0x30]
+        movaps  xmm3, xmm4
+        shufps  xmm4, xmm5, 136
+        shufps  xmm3, xmm5, 221
+        movaps  xmm5, xmm3
+        movups  xmm6, xmmword ptr [r8+rdx-0x20]
+        movups  xmm7, xmmword ptr [r8+rdx-0x10]
+        movaps  xmm3, xmm6
+        shufps  xmm6, xmm7, 136
+        pshufd  xmm6, xmm6, 0x93
+        shufps  xmm3, xmm7, 221
+        pshufd  xmm7, xmm3, 0x93
+        movups  xmm12, xmmword ptr [r9+rdx-0x40]
+        movups  xmm13, xmmword ptr [r9+rdx-0x30]
+        movaps  xmm11, xmm12
+        shufps  xmm12, xmm13, 136
+        shufps  xmm11, xmm13, 221
+        movaps  xmm13, xmm11
+        movups  xmm14, xmmword ptr [r9+rdx-0x20]
+        movups  xmm15, xmmword ptr [r9+rdx-0x10]
+        movaps  xmm11, xmm14
+        shufps  xmm14, xmm15, 136
+        pshufd  xmm14, xmm14, 0x93
+        shufps  xmm11, xmm15, 221
+        pshufd  xmm15, xmm11, 0x93
+        shl     rax, 0x20
+        or      rax, 0x40
+        movq    xmm3, rax
+        movdqa  xmmword ptr [rsp+0x20], xmm3
+        movaps  xmm3, xmmword ptr [rsp]
+        movaps  xmm11, xmmword ptr [rsp+0x10]
+        punpcklqdq xmm3, xmmword ptr [rsp+0x20]
+        punpcklqdq xmm11, xmmword ptr [rsp+0x20]
+        mov     al, 7
+9:
+        paddd   xmm0, xmm4
+        paddd   xmm8, xmm12
+        movaps  xmmword ptr [rsp+0x20], xmm4
+        movaps  xmmword ptr [rsp+0x30], xmm12
+        paddd   xmm0, xmm1
+        paddd   xmm8, xmm9
+        pxor    xmm3, xmm0
+        pxor    xmm11, xmm8
+        pshuflw xmm3, xmm3, 0xB1
+        pshufhw xmm3, xmm3, 0xB1
+        pshuflw xmm11, xmm11, 0xB1
+        pshufhw xmm11, xmm11, 0xB1
+        paddd   xmm2, xmm3
+        paddd   xmm10, xmm11
+        pxor    xmm1, xmm2
+        pxor    xmm9, xmm10
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 20
+        psrld   xmm4, 12
+        por     xmm1, xmm4
+        movdqa  xmm4, xmm9
+        pslld   xmm9, 20
+        psrld   xmm4, 12
+        por     xmm9, xmm4
+        paddd   xmm0, xmm5
+        paddd   xmm8, xmm13
+        movaps  xmmword ptr [rsp+0x40], xmm5
+        movaps  xmmword ptr [rsp+0x50], xmm13
+        paddd   xmm0, xmm1
+        paddd   xmm8, xmm9
+        pxor    xmm3, xmm0
+        pxor    xmm11, xmm8
+        movdqa  xmm13, xmm3
+        psrld   xmm3, 8
+        pslld   xmm13, 24
+        pxor    xmm3, xmm13
+        movdqa  xmm13, xmm11
+        psrld   xmm11, 8
+        pslld   xmm13, 24
+        pxor    xmm11, xmm13
+        paddd   xmm2, xmm3
+        paddd   xmm10, xmm11
+        pxor    xmm1, xmm2
+        pxor    xmm9, xmm10
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 25
+        psrld   xmm4, 7
+        por     xmm1, xmm4
+        movdqa  xmm4, xmm9
+        pslld   xmm9, 25
+        psrld   xmm4, 7
+        por     xmm9, xmm4
+        pshufd  xmm0, xmm0, 0x93
+        pshufd  xmm8, xmm8, 0x93
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm11, xmm11, 0x4E
+        pshufd  xmm2, xmm2, 0x39
+        pshufd  xmm10, xmm10, 0x39
+        paddd   xmm0, xmm6
+        paddd   xmm8, xmm14
+        paddd   xmm0, xmm1
+        paddd   xmm8, xmm9
+        pxor    xmm3, xmm0
+        pxor    xmm11, xmm8
+        pshuflw xmm3, xmm3, 0xB1
+        pshufhw xmm3, xmm3, 0xB1
+        pshuflw xmm11, xmm11, 0xB1
+        pshufhw xmm11, xmm11, 0xB1
+        paddd   xmm2, xmm3
+        paddd   xmm10, xmm11
+        pxor    xmm1, xmm2
+        pxor    xmm9, xmm10
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 20
+        psrld   xmm4, 12
+        por     xmm1, xmm4
+        movdqa  xmm4, xmm9
+        pslld   xmm9, 20
+        psrld   xmm4, 12
+        por     xmm9, xmm4
+        paddd   xmm0, xmm7
+        paddd   xmm8, xmm15
+        paddd   xmm0, xmm1
+        paddd   xmm8, xmm9
+        pxor    xmm3, xmm0
+        pxor    xmm11, xmm8
+        movdqa  xmm13, xmm3
+        psrld   xmm3, 8
+        pslld   xmm13, 24
+        pxor    xmm3, xmm13
+        movdqa  xmm13, xmm11
+        psrld   xmm11, 8
+        pslld   xmm13, 24
+        pxor    xmm11, xmm13
+        paddd   xmm2, xmm3
+        paddd   xmm10, xmm11
+        pxor    xmm1, xmm2
+        pxor    xmm9, xmm10
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 25
+        psrld   xmm4, 7
+        por     xmm1, xmm4
+        movdqa  xmm4, xmm9
+        pslld   xmm9, 25
+        psrld   xmm4, 7
+        por     xmm9, xmm4
+        pshufd  xmm0, xmm0, 0x39
+        pshufd  xmm8, xmm8, 0x39
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm11, xmm11, 0x4E
+        pshufd  xmm2, xmm2, 0x93
+        pshufd  xmm10, xmm10, 0x93
+        dec     al
+        je      9f
+        movdqa  xmm12, xmmword ptr [rsp+0x20]
+        movdqa  xmm5, xmmword ptr [rsp+0x40]
+        pshufd  xmm13, xmm12, 0x0F
+        shufps  xmm12, xmm5, 214
+        pshufd  xmm4, xmm12, 0x39
+        movdqa  xmm12, xmm6
+        shufps  xmm12, xmm7, 250
+        pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
+        pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+        por     xmm13, xmm12
+        movdqa  xmmword ptr [rsp+0x20], xmm13
+        movdqa  xmm12, xmm7
+        punpcklqdq xmm12, xmm5
+        movdqa  xmm13, xmm6
+        pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+        pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+        por     xmm12, xmm13
+        pshufd  xmm12, xmm12, 0x78
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd  xmm7, xmm6, 0x1E
+        movdqa  xmmword ptr [rsp+0x40], xmm12
+        movdqa  xmm5, xmmword ptr [rsp+0x30]
+        movdqa  xmm13, xmmword ptr [rsp+0x50]
+        pshufd  xmm6, xmm5, 0x0F
+        shufps  xmm5, xmm13, 214
+        pshufd  xmm12, xmm5, 0x39
+        movdqa  xmm5, xmm14
+        shufps  xmm5, xmm15, 250
+        pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
+        pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+        por     xmm6, xmm5
+        movdqa  xmm5, xmm15
+        punpcklqdq xmm5, xmm13
+        movdqa  xmmword ptr [rsp+0x30], xmm2
+        movdqa  xmm2, xmm14
+        pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+        pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+        por     xmm5, xmm2
+        movdqa  xmm2, xmmword ptr [rsp+0x30]
+        pshufd  xmm5, xmm5, 0x78
+        punpckhdq xmm13, xmm15
+        punpckldq xmm14, xmm13
+        pshufd  xmm15, xmm14, 0x1E
+        movdqa  xmm13, xmm6
+        movdqa  xmm14, xmm5
+        movdqa  xmm5, xmmword ptr [rsp+0x20]
+        movdqa  xmm6, xmmword ptr [rsp+0x40]
+        jmp     9b
+9:
+        pxor    xmm0, xmm2
+        pxor    xmm1, xmm3
+        pxor    xmm8, xmm10
+        pxor    xmm9, xmm11
+        mov     eax, r13d
+        cmp     rdx, r15
+        jne     2b
+        movups  xmmword ptr [rbx], xmm0
+        movups  xmmword ptr [rbx+0x10], xmm1
+        movups  xmmword ptr [rbx+0x20], xmm8
+        movups  xmmword ptr [rbx+0x30], xmm9
+        mov     eax, dword ptr [rsp+0x130]
+        neg     eax
+        mov     r10d, dword ptr [rsp+0x110+8*rax]
+        mov     r11d, dword ptr [rsp+0x120+8*rax]
+        mov     dword ptr [rsp+0x110], r10d
+        mov     dword ptr [rsp+0x120], r11d
+        add     rdi, 16
+        add     rbx, 64
+        sub     rsi, 2
+3:
+        test    esi, 0x1
+        je      4b
+        movups  xmm0, xmmword ptr [rcx]
+        movups  xmm1, xmmword ptr [rcx+0x10]
+        movd    xmm13, dword ptr [rsp+0x110]
+        movd    xmm14, dword ptr [rsp+0x120]
+        punpckldq xmm13, xmm14
+        mov     r8, qword ptr [rdi]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
+2:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
+        shl     rax, 32
+        or      rax, 64
+        movq    xmm12, rax
+        movdqa  xmm3, xmm13
+        punpcklqdq xmm3, xmm12
+        movups  xmm4, xmmword ptr [r8+rdx-0x40]
+        movups  xmm5, xmmword ptr [r8+rdx-0x30]
+        movaps  xmm8, xmm4
+        shufps  xmm4, xmm5, 136
+        shufps  xmm8, xmm5, 221
+        movaps  xmm5, xmm8
+        movups  xmm6, xmmword ptr [r8+rdx-0x20]
+        movups  xmm7, xmmword ptr [r8+rdx-0x10]
+        movaps  xmm8, xmm6
+        shufps  xmm6, xmm7, 136
+        pshufd  xmm6, xmm6, 0x93
+        shufps  xmm8, xmm7, 221
+        pshufd  xmm7, xmm8, 0x93
+        mov     al, 7
+9:
+        paddd   xmm0, xmm4
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshuflw xmm3, xmm3, 0xB1
+        pshufhw xmm3, xmm3, 0xB1
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm5
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        movdqa  xmm14, xmm3
+        psrld   xmm3, 8
+        pslld   xmm14, 24
+        pxor    xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x93
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x39
+        paddd   xmm0, xmm6
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshuflw xmm3, xmm3, 0xB1
+        pshufhw xmm3, xmm3, 0xB1
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm7
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        movdqa  xmm14, xmm3
+        psrld   xmm3, 8
+        pslld   xmm14, 24
+        pxor    xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x39
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x93
+        dec     al
+        jz      9f
+        movdqa  xmm8, xmm4
+        shufps  xmm8, xmm5, 214
+        pshufd  xmm9, xmm4, 0x0F
+        pshufd  xmm4, xmm8, 0x39
+        movdqa  xmm8, xmm6
+        shufps  xmm8, xmm7, 250
+        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+        por     xmm9, xmm8
+        movdqa  xmm8, xmm7
+        punpcklqdq xmm8, xmm5
+        movdqa  xmm10, xmm6
+        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+        por     xmm8, xmm10
+        pshufd  xmm8, xmm8, 0x78
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd  xmm7, xmm6, 0x1E
+        movdqa  xmm5, xmm9
+        movdqa  xmm6, xmm8
+        jmp     9b
+9:
+        pxor    xmm0, xmm2
+        pxor    xmm1, xmm3
+        mov     eax, r13d
+        cmp     rdx, r15
+        jne     2b
+        movups  xmmword ptr [rbx], xmm0
+        movups  xmmword ptr [rbx+0x10], xmm1
+        jmp     4b
+
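+# blake3_compress_in_place_sse2: compress one 64-byte block and write the
+# first half of the state back over the chaining value (SysV ABI: rdi=cv,
+# rsi=block, rdx=block_len, rcx=counter, r8=flags, as used below).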
+.p2align 6
+blake3_compress_in_place_sse2:
+_blake3_compress_in_place_sse2:
+        _CET_ENDBR
+        movups  xmm0, xmmword ptr [rdi]
+        movups  xmm1, xmmword ptr [rdi+0x10]
+        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
+        shl     r8, 32
+        add     rdx, r8
+        movq    xmm3, rcx
+        movq    xmm4, rdx
+        punpcklqdq xmm3, xmm4
+        movups  xmm4, xmmword ptr [rsi]
+        movups  xmm5, xmmword ptr [rsi+0x10]
+        movaps  xmm8, xmm4
+        shufps  xmm4, xmm5, 136
+        shufps  xmm8, xmm5, 221
+        movaps  xmm5, xmm8
+        movups  xmm6, xmmword ptr [rsi+0x20]
+        movups  xmm7, xmmword ptr [rsi+0x30]
+        movaps  xmm8, xmm6
+        shufps  xmm6, xmm7, 136
+        pshufd  xmm6, xmm6, 0x93
+        shufps  xmm8, xmm7, 221
+        pshufd  xmm7, xmm8, 0x93
+        mov     al, 7
+9:
+        paddd   xmm0, xmm4
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshuflw xmm3, xmm3, 0xB1
+        pshufhw xmm3, xmm3, 0xB1
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm5
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        movdqa  xmm14, xmm3
+        psrld   xmm3, 8
+        pslld   xmm14, 24
+        pxor    xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x93
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x39
+        paddd   xmm0, xmm6
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshuflw xmm3, xmm3, 0xB1
+        pshufhw xmm3, xmm3, 0xB1
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm7
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        movdqa  xmm14, xmm3
+        psrld   xmm3, 8
+        pslld   xmm14, 24
+        pxor    xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x39
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x93
+        dec     al
+        jz      9f
+        movdqa  xmm8, xmm4
+        shufps  xmm8, xmm5, 214
+        pshufd  xmm9, xmm4, 0x0F
+        pshufd  xmm4, xmm8, 0x39
+        movdqa  xmm8, xmm6
+        shufps  xmm8, xmm7, 250
+        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+        por     xmm9, xmm8
+        movdqa  xmm8, xmm7
+        punpcklqdq xmm8, xmm5
+        movdqa  xmm10, xmm6
+        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+        por     xmm8, xmm10
+        pshufd  xmm8, xmm8, 0x78
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd  xmm7, xmm6, 0x1E
+        movdqa  xmm5, xmm9
+        movdqa  xmm6, xmm8
+        jmp     9b
+9:
+        pxor    xmm0, xmm2
+        pxor    xmm1, xmm3
+        movups  xmmword ptr [rdi], xmm0
+        movups  xmmword ptr [rdi+0x10], xmm1
+        ret
+
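+# blake3_compress_xof_sse2: same compression as above, but the full 64-byte
+# extended output is produced by also xoring the second half of the state
+# with the input chaining value; all four vectors go to the out pointer in
+# r9 instead of back into cv.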
+.p2align 6
+blake3_compress_xof_sse2:
+_blake3_compress_xof_sse2:
+        _CET_ENDBR
+        movups  xmm0, xmmword ptr [rdi]
+        movups  xmm1, xmmword ptr [rdi+0x10]
+        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
+        movzx   eax, r8b
+        movzx   edx, dl
+        shl     rax, 32
+        add     rdx, rax
+        movq    xmm3, rcx
+        movq    xmm4, rdx
+        punpcklqdq xmm3, xmm4
+        movups  xmm4, xmmword ptr [rsi]
+        movups  xmm5, xmmword ptr [rsi+0x10]
+        movaps  xmm8, xmm4
+        shufps  xmm4, xmm5, 136
+        shufps  xmm8, xmm5, 221
+        movaps  xmm5, xmm8
+        movups  xmm6, xmmword ptr [rsi+0x20]
+        movups  xmm7, xmmword ptr [rsi+0x30]
+        movaps  xmm8, xmm6
+        shufps  xmm6, xmm7, 136
+        pshufd  xmm6, xmm6, 0x93
+        shufps  xmm8, xmm7, 221
+        pshufd  xmm7, xmm8, 0x93
+        mov     al, 7
+9:
+        paddd   xmm0, xmm4
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshuflw xmm3, xmm3, 0xB1
+        pshufhw xmm3, xmm3, 0xB1
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm5
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        movdqa  xmm14, xmm3
+        psrld   xmm3, 8
+        pslld   xmm14, 24
+        pxor    xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x93
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x39
+        paddd   xmm0, xmm6
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshuflw xmm3, xmm3, 0xB1
+        pshufhw xmm3, xmm3, 0xB1
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm7
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        movdqa  xmm14, xmm3
+        psrld   xmm3, 8
+        pslld   xmm14, 24
+        pxor    xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x39
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x93
+        dec     al
+        jz      9f
+        movdqa  xmm8, xmm4
+        shufps  xmm8, xmm5, 214
+        pshufd  xmm9, xmm4, 0x0F
+        pshufd  xmm4, xmm8, 0x39
+        movdqa  xmm8, xmm6
+        shufps  xmm8, xmm7, 250
+        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+        por     xmm9, xmm8
+        movdqa  xmm8, xmm7
+        punpcklqdq xmm8, xmm5
+        movdqa  xmm10, xmm6
+        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+        por     xmm8, xmm10
+        pshufd  xmm8, xmm8, 0x78
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd  xmm7, xmm6, 0x1E
+        movdqa  xmm5, xmm9
+        movdqa  xmm6, xmm8
+        jmp     9b
+9:
+        movdqu  xmm4, xmmword ptr [rdi]
+        movdqu  xmm5, xmmword ptr [rdi+0x10]
+        pxor    xmm0, xmm2
+        pxor    xmm1, xmm3
+        pxor    xmm2, xmm4
+        pxor    xmm3, xmm5
+        movups  xmmword ptr [r9], xmm0
+        movups  xmmword ptr [r9+0x10], xmm1
+        movups  xmmword ptr [r9+0x20], xmm2
+        movups  xmmword ptr [r9+0x30], xmm3
+        ret
+
+
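+# Read-only constants used above: IV words, per-lane counter increments, the
+# bias mask that turns signed pcmpgtd into an unsigned compare, and the
+# pand/por masks that stand in for the SSE4.1 pblendw instruction on SSE2.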
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align  6
+BLAKE3_IV:
+        .long  0x6A09E667, 0xBB67AE85
+        .long  0x3C6EF372, 0xA54FF53A
+ADD0:
+        .long  0, 1, 2, 3
+ADD1:
+        .long  4, 4, 4, 4
+BLAKE3_IV_0:
+        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:
+        .long  64, 64, 64, 64
+CMP_MSB_MASK:
+        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
+PBLENDW_0x33_MASK:
+        .long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xCC_MASK:
+        .long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0x3F_MASK:
+        .long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xC0_MASK:
+        .long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
diff --git a/src/blake3/blake3_sse41.c b/src/blake3/blake3_sse41.c
new file mode 100644
index 0000000..0be9528
--- /dev/null
+++ b/src/blake3/blake3_sse41.c
@@ -0,0 +1,560 @@
+#include "blake3.h"
+
+#include <immintrin.h>
+
+#define DEGREE 4
+
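+// Shuffle 32-bit words from the integer vectors a and b through the float
+// shuffle unit; c is an _MM_SHUFFLE() immediate picking two words from each
+// source.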
+#define _mm_shuffle_ps2(a, b, c)                                               \
+  (_mm_castps_si128(                                                           \
+      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
+
+INLINE __m128i loadu(const u8 src[16]) {
+  return _mm_loadu_si128((const __m128i *)src);
+}
+
+INLINE void storeu(__m128i src, u8 dest[16]) {
+  _mm_storeu_si128((__m128i *)dest, src);
+}
+
+INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
+
+// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
+
+INLINE __m128i set1(u32 x) { return _mm_set1_epi32((int32_t)x); }
+
+INLINE __m128i set4(u32 a, u32 b, u32 c, u32 d) {
+  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
+}
+
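+// The 16- and 8-bit rotations are byte-aligned, so each is a single pshufb;
+// rot12 and rot7 below fall back to shift-and-xor.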
+INLINE __m128i rot16(__m128i x) {
+  return _mm_shuffle_epi8(
+      x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
+}
+
+INLINE __m128i rot12(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
+}
+
+INLINE __m128i rot8(__m128i x) {
+  return _mm_shuffle_epi8(
+      x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
+}
+
+INLINE __m128i rot7(__m128i x) {
+  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
+}
+
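+// g1 and g2 are the two halves of the BLAKE3 G function: each feeds one
+// message word into the state and mixes with add/xor plus the rotation
+// schedule 16, 12, 8, 7.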
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = xorv(*row3, *row0);
+  *row3 = rot16(*row3);
+  *row2 = addv(*row2, *row3);
+  *row1 = xorv(*row1, *row2);
+  *row1 = rot12(*row1);
+}
+
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
+               __m128i m) {
+  *row0 = addv(addv(*row0, m), *row1);
+  *row3 = xorv(*row3, *row0);
+  *row3 = rot8(*row3);
+  *row2 = addv(*row2, *row3);
+  *row1 = xorv(*row1, *row2);
+  *row1 = rot7(*row1);
+}
+
+// Note the optimization here of leaving row1 as the unrotated row, rather than
+// row0. All the message loads below are adjusted to compensate for this. See
+// discussion at https://github.com/sneves/blake2-avx2/pull/4
+INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
+}
+
+INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
+  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
+}
+
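+// Load the 16-word state for one compression: rows[0] and rows[1] hold the
+// chaining value, rows[2] the first four IV words, and rows[3] the counter,
+// block length and flags. The seven rounds below then mix the message in.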
+INLINE void compress_pre(__m128i rows[4], const u32 cv[8],
+                         const u8 block[BLAKE3_BLOCK_LEN],
+                         u8 block_len, u64 counter, u8 flags) {
+  rows[0] = loadu((u8 *)&cv[0]);
+  rows[1] = loadu((u8 *)&cv[4]);
+  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
+  rows[3] = set4(counter_low(counter), counter_high(counter),
+                 (u32)block_len, (u32)flags);
+
+  __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
+  __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
+  __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
+  __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
+
+  __m128i t0, t1, t2, t3, tt;
+
+  // Round 1. The first round permutes the message words from the original
+  // input order, into the groups that get mixed in parallel.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
+  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
+  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 2. This round and all following rounds apply a fixed permutation
+  // to the message words from the round before.
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 3
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 4
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 5
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 6
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+  m0 = t0;
+  m1 = t1;
+  m2 = t2;
+  m3 = t3;
+
+  // Round 7
+  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+  t1 = _mm_blend_epi16(tt, t1, 0xCC);
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
+  diagonalize(&rows[0], &rows[2], &rows[3]);
+  t2 = _mm_unpacklo_epi64(m3, m1);
+  tt = _mm_blend_epi16(t2, m2, 0xC0);
+  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+  t3 = _mm_unpackhi_epi32(m1, m3);
+  tt = _mm_unpacklo_epi32(m2, t3);
+  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
+  undiagonalize(&rows[0], &rows[2], &rows[3]);
+}
+
+void blake3_compress_in_place_sse41(u32 cv[8],
+                                    const u8 block[BLAKE3_BLOCK_LEN],
+                                    u8 block_len, u64 counter,
+                                    u8 flags) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu(xorv(rows[0], rows[2]), (u8 *)&cv[0]);
+  storeu(xorv(rows[1], rows[3]), (u8 *)&cv[4]);
+}
+
+void blake3_compress_xof_sse41(const u32 cv[8],
+                               const u8 block[BLAKE3_BLOCK_LEN],
+                               u8 block_len, u64 counter,
+                               u8 flags, u8 out[64]) {
+  __m128i rows[4];
+  compress_pre(rows, cv, block, block_len, counter, flags);
+  storeu(xorv(rows[0], rows[2]), &out[0]);
+  storeu(xorv(rows[1], rows[3]), &out[16]);
+  storeu(xorv(rows[2], loadu((u8 *)&cv[0])), &out[32]);
+  storeu(xorv(rows[3], loadu((u8 *)&cv[4])), &out[48]);
+}
+
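+// One full round over four states at once, one hash per 32-bit lane: first
+// the column step (v[0..3] against v[4..7], v[8..11], v[12..15]), then the
+// diagonal step, with message words selected through MSG_SCHEDULE.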
+INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[15] = rot16(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot12(v[4]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[15] = rot8(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot7(v[4]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot16(v[15]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[4] = rot12(v[4]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot8(v[15]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+  v[4] = rot7(v[4]);
+}
+
+INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
+  // 22/33. Note that this doesn't split the vector into two lanes, as the
+  // AVX2 counterparts do.
+  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+
+  // Interleave 64-bit lanes.
+  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
+  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
+  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
+  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
+
+  vecs[0] = abcd_0;
+  vecs[1] = abcd_1;
+  vecs[2] = abcd_2;
+  vecs[3] = abcd_3;
+}
+
+INLINE void transpose_msg_vecs(const u8 *const *inputs,
+                               size_t block_offset, __m128i out[16]) {
+  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
+  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
+  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
+  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
+  out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
+  out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
+  out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
+  out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
+  out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
+  out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
+  out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
+  out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
+  out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
+  out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
+  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
+  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
+  for (size_t i = 0; i < 4; ++i) {
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
+  }
+  transpose_vecs(&out[0]);
+  transpose_vecs(&out[4]);
+  transpose_vecs(&out[8]);
+  transpose_vecs(&out[12]);
+}
+
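+// Build the per-lane 64-bit counters counter+0..counter+3 (or four copies of
+// counter when increment_counter is zero). The carry into the high words is
+// detected with a signed compare after biasing both sides by 0x80000000,
+// which makes pcmpgtd behave as an unsigned comparison.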
+INLINE void load_counters(u64 counter, char increment_counter,
+                          __m128i *out_lo, __m128i *out_hi) {
+  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
+  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
+  const __m128i add1 = _mm_and_si128(mask, add0);
+  __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
+  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), 
+                                  _mm_xor_si128(   l, _mm_set1_epi32(0x80000000)));
+  __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
+  *out_lo = l;
+  *out_hi = h;
+}
+
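+// Hash four equal-length inputs in parallel, one input per 32-bit lane of
+// the vector state, writing four 32-byte chaining values to out.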
+static
+void blake3_hash4_sse41(const u8 *const *inputs, size_t blocks,
+                        const u32 key[8], u64 counter,
+                        char increment_counter, u8 flags,
+                        u8 flags_start, u8 flags_end, u8 *out) {
+  __m128i h_vecs[8] = {
+      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
+      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
+  };
+  __m128i counter_low_vec, counter_high_vec;
+  load_counters(counter, increment_counter, &counter_low_vec,
+                &counter_high_vec);
+  u8 block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
+    __m128i block_flags_vec = set1(block_flags);
+    __m128i msg_vecs[16];
+    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    __m128i v[16] = {
+        h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
+        set1(IV[0]),     set1(IV[1]),      set1(IV[2]),   set1(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+    };
+    round_fn(v, msg_vecs, 0);
+    round_fn(v, msg_vecs, 1);
+    round_fn(v, msg_vecs, 2);
+    round_fn(v, msg_vecs, 3);
+    round_fn(v, msg_vecs, 4);
+    round_fn(v, msg_vecs, 5);
+    round_fn(v, msg_vecs, 6);
+    h_vecs[0] = xorv(v[0], v[8]);
+    h_vecs[1] = xorv(v[1], v[9]);
+    h_vecs[2] = xorv(v[2], v[10]);
+    h_vecs[3] = xorv(v[3], v[11]);
+    h_vecs[4] = xorv(v[4], v[12]);
+    h_vecs[5] = xorv(v[5], v[13]);
+    h_vecs[6] = xorv(v[6], v[14]);
+    h_vecs[7] = xorv(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs(&h_vecs[0]);
+  transpose_vecs(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
+  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
+  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
+  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
+  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
+  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
+  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
+  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
+}
+
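+// Scalar-per-input fallback: compress one input's blocks sequentially using
+// the in-place compression above.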
+INLINE void hash_one_sse41(const u8 *input, size_t blocks,
+                           const u32 key[8], u64 counter,
+                           u8 flags, u8 flags_start,
+                           u8 flags_end, u8 out[BLAKE3_OUT_LEN]) {
+  u32 cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  u8 block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                   block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
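+// Dispatch: process inputs four at a time while possible, then finish the
+// remainder one at a time.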
+void blake3_hash_many_sse41(const u8 *const *inputs, size_t num_inputs,
+                            size_t blocks, const u32 key[8],
+                            u64 counter, char increment_counter,
+                            u8 flags, u8 flags_start,
+                            u8 flags_end, u8 *out) {
+  while (num_inputs >= DEGREE) {
+    blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags,
+                       flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += DEGREE;
+    }
+    inputs += DEGREE;
+    num_inputs -= DEGREE;
+    out = &out[DEGREE * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start,
+                   flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
diff --git a/src/blake3/blake3_sse41_x86-64_unix.S b/src/blake3/blake3_sse41_x86-64_unix.S
new file mode 100644
index 0000000..a3ff642
--- /dev/null
+++ b/src/blake3/blake3_sse41_x86-64_unix.S
@@ -0,0 +1,2028 @@
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global blake3_hash_many_sse41
+.global _blake3_hash_many_sse41
+.global blake3_compress_in_place_sse41
+.global _blake3_compress_in_place_sse41
+.global blake3_compress_xof_sse41
+.global _blake3_compress_xof_sse41
+#ifdef __APPLE__
+.text
+#else
+.section .text
+#endif
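+# blake3_hash_many_sse41: per the SysV ABI, inputs/num_inputs/blocks/key/
+# counter/increment_counter arrive in rdi/rsi/rdx/rcx/r8/r9; flags,
+# flags_start, flags_end and the out pointer arrive on the stack and are
+# read through rbp below.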
+        .p2align  6
+_blake3_hash_many_sse41:
+blake3_hash_many_sse41:
+        _CET_ENDBR
+        push    r15
+        push    r14
+        push    r13
+        push    r12
+        push    rbx
+        push    rbp
+        mov     rbp, rsp
+        sub     rsp, 360
+        and     rsp, 0xFFFFFFFFFFFFFFC0
+        neg     r9d
+        movd    xmm0, r9d
+        pshufd  xmm0, xmm0, 0x00
+        movdqa  xmmword ptr [rsp+0x130], xmm0
+        movdqa  xmm1, xmm0
+        pand    xmm1, xmmword ptr [ADD0+rip]
+        pand    xmm0, xmmword ptr [ADD1+rip]
+        movdqa  xmmword ptr [rsp+0x150], xmm0
+        movd    xmm0, r8d
+        pshufd  xmm0, xmm0, 0x00
+        paddd   xmm0, xmm1
+        movdqa  xmmword ptr [rsp+0x110], xmm0
+        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+        pcmpgtd xmm1, xmm0
+        shr     r8, 32
+        movd    xmm2, r8d
+        pshufd  xmm2, xmm2, 0x00
+        psubd   xmm2, xmm1
+        movdqa  xmmword ptr [rsp+0x120], xmm2
+        mov     rbx, qword ptr [rbp+0x50]
+        mov     r15, rdx
+        shl     r15, 6
+        movzx   r13d, byte ptr [rbp+0x38]
+        movzx   r12d, byte ptr [rbp+0x48]
+        cmp     rsi, 4
+        jc      3f
+2:
+        movdqu  xmm3, xmmword ptr [rcx]
+        pshufd  xmm0, xmm3, 0x00
+        pshufd  xmm1, xmm3, 0x55
+        pshufd  xmm2, xmm3, 0xAA
+        pshufd  xmm3, xmm3, 0xFF
+        movdqu  xmm7, xmmword ptr [rcx+0x10]
+        pshufd  xmm4, xmm7, 0x00
+        pshufd  xmm5, xmm7, 0x55
+        pshufd  xmm6, xmm7, 0xAA
+        pshufd  xmm7, xmm7, 0xFF
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        mov     r10, qword ptr [rdi+0x10]
+        mov     r11, qword ptr [rdi+0x18]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
+9:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
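+        # Load the next 64-byte block from each of the four inputs, 16 bytes
+        # per register at a time, and transpose so that each stored xmm holds
+        # the same message word from all four inputs.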
+        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
+        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
+        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
+        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
+        movdqa  xmm12, xmm8
+        punpckldq xmm8, xmm9
+        punpckhdq xmm12, xmm9
+        movdqa  xmm14, xmm10
+        punpckldq xmm10, xmm11
+        punpckhdq xmm14, xmm11
+        movdqa  xmm9, xmm8
+        punpcklqdq xmm8, xmm10
+        punpckhqdq xmm9, xmm10
+        movdqa  xmm13, xmm12
+        punpcklqdq xmm12, xmm14
+        punpckhqdq xmm13, xmm14
+        movdqa  xmmword ptr [rsp], xmm8
+        movdqa  xmmword ptr [rsp+0x10], xmm9
+        movdqa  xmmword ptr [rsp+0x20], xmm12
+        movdqa  xmmword ptr [rsp+0x30], xmm13
+        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
+        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
+        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
+        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
+        movdqa  xmm12, xmm8
+        punpckldq xmm8, xmm9
+        punpckhdq xmm12, xmm9
+        movdqa  xmm14, xmm10
+        punpckldq xmm10, xmm11
+        punpckhdq xmm14, xmm11
+        movdqa  xmm9, xmm8
+        punpcklqdq xmm8, xmm10
+        punpckhqdq xmm9, xmm10
+        movdqa  xmm13, xmm12
+        punpcklqdq xmm12, xmm14
+        punpckhqdq xmm13, xmm14
+        movdqa  xmmword ptr [rsp+0x40], xmm8
+        movdqa  xmmword ptr [rsp+0x50], xmm9
+        movdqa  xmmword ptr [rsp+0x60], xmm12
+        movdqa  xmmword ptr [rsp+0x70], xmm13
+        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
+        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
+        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
+        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
+        movdqa  xmm12, xmm8
+        punpckldq xmm8, xmm9
+        punpckhdq xmm12, xmm9
+        movdqa  xmm14, xmm10
+        punpckldq xmm10, xmm11
+        punpckhdq xmm14, xmm11
+        movdqa  xmm9, xmm8
+        punpcklqdq xmm8, xmm10
+        punpckhqdq xmm9, xmm10
+        movdqa  xmm13, xmm12
+        punpcklqdq xmm12, xmm14
+        punpckhqdq xmm13, xmm14
+        movdqa  xmmword ptr [rsp+0x80], xmm8
+        movdqa  xmmword ptr [rsp+0x90], xmm9
+        movdqa  xmmword ptr [rsp+0xA0], xmm12
+        movdqa  xmmword ptr [rsp+0xB0], xmm13
+        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
+        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
+        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
+        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
+        movdqa  xmm12, xmm8
+        punpckldq xmm8, xmm9
+        punpckhdq xmm12, xmm9
+        movdqa  xmm14, xmm10
+        punpckldq xmm10, xmm11
+        punpckhdq xmm14, xmm11
+        movdqa  xmm9, xmm8
+        punpcklqdq xmm8, xmm10
+        punpckhqdq xmm9, xmm10
+        movdqa  xmm13, xmm12
+        punpcklqdq xmm12, xmm14
+        punpckhqdq xmm13, xmm14
+        movdqa  xmmword ptr [rsp+0xC0], xmm8
+        movdqa  xmmword ptr [rsp+0xD0], xmm9
+        movdqa  xmmword ptr [rsp+0xE0], xmm12
+        movdqa  xmmword ptr [rsp+0xF0], xmm13
+        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+        movdqa  xmm12, xmmword ptr [rsp+0x110]
+        movdqa  xmm13, xmmword ptr [rsp+0x120]
+        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+        movd    xmm15, eax
+        pshufd  xmm15, xmm15, 0x00
+        prefetcht0 [r8+rdx+0x80]
+        prefetcht0 [r9+rdx+0x80]
+        prefetcht0 [r10+rdx+0x80]
+        prefetcht0 [r11+rdx+0x80]
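+        /* Seven rounds, fully unrolled.  Each group below is one half
+         * of the BLAKE3 G function applied to all four lanes at once:
+         * a += m; a += b; d ^= a; d >>>= 16 (pshufb with ROT16);
+         * c += d; b ^= c; b >>>= 12 (pslld/psrld/por) -- then the same
+         * again with rotates of 8 and 7.  Message words are read from
+         * the transposed block at the offsets dictated by the BLAKE3
+         * message permutation. */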
+        paddd   xmm0, xmmword ptr [rsp]
+        paddd   xmm1, xmmword ptr [rsp+0x20]
+        paddd   xmm2, xmmword ptr [rsp+0x40]
+        paddd   xmm3, xmmword ptr [rsp+0x60]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x10]
+        paddd   xmm1, xmmword ptr [rsp+0x30]
+        paddd   xmm2, xmmword ptr [rsp+0x50]
+        paddd   xmm3, xmmword ptr [rsp+0x70]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x80]
+        paddd   xmm1, xmmword ptr [rsp+0xA0]
+        paddd   xmm2, xmmword ptr [rsp+0xC0]
+        paddd   xmm3, xmmword ptr [rsp+0xE0]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x90]
+        paddd   xmm1, xmmword ptr [rsp+0xB0]
+        paddd   xmm2, xmmword ptr [rsp+0xD0]
+        paddd   xmm3, xmmword ptr [rsp+0xF0]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x20]
+        paddd   xmm1, xmmword ptr [rsp+0x30]
+        paddd   xmm2, xmmword ptr [rsp+0x70]
+        paddd   xmm3, xmmword ptr [rsp+0x40]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x60]
+        paddd   xmm1, xmmword ptr [rsp+0xA0]
+        paddd   xmm2, xmmword ptr [rsp]
+        paddd   xmm3, xmmword ptr [rsp+0xD0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x10]
+        paddd   xmm1, xmmword ptr [rsp+0xC0]
+        paddd   xmm2, xmmword ptr [rsp+0x90]
+        paddd   xmm3, xmmword ptr [rsp+0xF0]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xB0]
+        paddd   xmm1, xmmword ptr [rsp+0x50]
+        paddd   xmm2, xmmword ptr [rsp+0xE0]
+        paddd   xmm3, xmmword ptr [rsp+0x80]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x30]
+        paddd   xmm1, xmmword ptr [rsp+0xA0]
+        paddd   xmm2, xmmword ptr [rsp+0xD0]
+        paddd   xmm3, xmmword ptr [rsp+0x70]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x40]
+        paddd   xmm1, xmmword ptr [rsp+0xC0]
+        paddd   xmm2, xmmword ptr [rsp+0x20]
+        paddd   xmm3, xmmword ptr [rsp+0xE0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x60]
+        paddd   xmm1, xmmword ptr [rsp+0x90]
+        paddd   xmm2, xmmword ptr [rsp+0xB0]
+        paddd   xmm3, xmmword ptr [rsp+0x80]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x50]
+        paddd   xmm1, xmmword ptr [rsp]
+        paddd   xmm2, xmmword ptr [rsp+0xF0]
+        paddd   xmm3, xmmword ptr [rsp+0x10]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xA0]
+        paddd   xmm1, xmmword ptr [rsp+0xC0]
+        paddd   xmm2, xmmword ptr [rsp+0xE0]
+        paddd   xmm3, xmmword ptr [rsp+0xD0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x70]
+        paddd   xmm1, xmmword ptr [rsp+0x90]
+        paddd   xmm2, xmmword ptr [rsp+0x30]
+        paddd   xmm3, xmmword ptr [rsp+0xF0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x40]
+        paddd   xmm1, xmmword ptr [rsp+0xB0]
+        paddd   xmm2, xmmword ptr [rsp+0x50]
+        paddd   xmm3, xmmword ptr [rsp+0x10]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp]
+        paddd   xmm1, xmmword ptr [rsp+0x20]
+        paddd   xmm2, xmmword ptr [rsp+0x80]
+        paddd   xmm3, xmmword ptr [rsp+0x60]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xC0]
+        paddd   xmm1, xmmword ptr [rsp+0x90]
+        paddd   xmm2, xmmword ptr [rsp+0xF0]
+        paddd   xmm3, xmmword ptr [rsp+0xE0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xD0]
+        paddd   xmm1, xmmword ptr [rsp+0xB0]
+        paddd   xmm2, xmmword ptr [rsp+0xA0]
+        paddd   xmm3, xmmword ptr [rsp+0x80]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x70]
+        paddd   xmm1, xmmword ptr [rsp+0x50]
+        paddd   xmm2, xmmword ptr [rsp]
+        paddd   xmm3, xmmword ptr [rsp+0x60]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x20]
+        paddd   xmm1, xmmword ptr [rsp+0x30]
+        paddd   xmm2, xmmword ptr [rsp+0x10]
+        paddd   xmm3, xmmword ptr [rsp+0x40]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x90]
+        paddd   xmm1, xmmword ptr [rsp+0xB0]
+        paddd   xmm2, xmmword ptr [rsp+0x80]
+        paddd   xmm3, xmmword ptr [rsp+0xF0]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xE0]
+        paddd   xmm1, xmmword ptr [rsp+0x50]
+        paddd   xmm2, xmmword ptr [rsp+0xC0]
+        paddd   xmm3, xmmword ptr [rsp+0x10]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xD0]
+        paddd   xmm1, xmmword ptr [rsp]
+        paddd   xmm2, xmmword ptr [rsp+0x20]
+        paddd   xmm3, xmmword ptr [rsp+0x40]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0x30]
+        paddd   xmm1, xmmword ptr [rsp+0xA0]
+        paddd   xmm2, xmmword ptr [rsp+0x60]
+        paddd   xmm3, xmmword ptr [rsp+0x70]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xB0]
+        paddd   xmm1, xmmword ptr [rsp+0x50]
+        paddd   xmm2, xmmword ptr [rsp+0x10]
+        paddd   xmm3, xmmword ptr [rsp+0x80]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xF0]
+        paddd   xmm1, xmmword ptr [rsp]
+        paddd   xmm2, xmmword ptr [rsp+0x90]
+        paddd   xmm3, xmmword ptr [rsp+0x60]
+        paddd   xmm0, xmm4
+        paddd   xmm1, xmm5
+        paddd   xmm2, xmm6
+        paddd   xmm3, xmm7
+        pxor    xmm12, xmm0
+        pxor    xmm13, xmm1
+        pxor    xmm14, xmm2
+        pxor    xmm15, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        pshufb  xmm15, xmm8
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm12
+        paddd   xmm9, xmm13
+        paddd   xmm10, xmm14
+        paddd   xmm11, xmm15
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm9
+        pxor    xmm6, xmm10
+        pxor    xmm7, xmm11
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xE0]
+        paddd   xmm1, xmmword ptr [rsp+0x20]
+        paddd   xmm2, xmmword ptr [rsp+0x30]
+        paddd   xmm3, xmmword ptr [rsp+0x70]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT16+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        movdqa  xmmword ptr [rsp+0x100], xmm8
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 12
+        pslld   xmm5, 20
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 12
+        pslld   xmm6, 20
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 12
+        pslld   xmm7, 20
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 12
+        pslld   xmm4, 20
+        por     xmm4, xmm8
+        paddd   xmm0, xmmword ptr [rsp+0xA0]
+        paddd   xmm1, xmmword ptr [rsp+0xC0]
+        paddd   xmm2, xmmword ptr [rsp+0x40]
+        paddd   xmm3, xmmword ptr [rsp+0xD0]
+        paddd   xmm0, xmm5
+        paddd   xmm1, xmm6
+        paddd   xmm2, xmm7
+        paddd   xmm3, xmm4
+        pxor    xmm15, xmm0
+        pxor    xmm12, xmm1
+        pxor    xmm13, xmm2
+        pxor    xmm14, xmm3
+        movdqa  xmm8, xmmword ptr [ROT8+rip]
+        pshufb  xmm15, xmm8
+        pshufb  xmm12, xmm8
+        pshufb  xmm13, xmm8
+        pshufb  xmm14, xmm8
+        paddd   xmm10, xmm15
+        paddd   xmm11, xmm12
+        movdqa  xmm8, xmmword ptr [rsp+0x100]
+        paddd   xmm8, xmm13
+        paddd   xmm9, xmm14
+        pxor    xmm5, xmm10
+        pxor    xmm6, xmm11
+        pxor    xmm7, xmm8
+        pxor    xmm4, xmm9
+        pxor    xmm0, xmm8
+        pxor    xmm1, xmm9
+        pxor    xmm2, xmm10
+        pxor    xmm3, xmm11
+        movdqa  xmm8, xmm5
+        psrld   xmm8, 7
+        pslld   xmm5, 25
+        por     xmm5, xmm8
+        movdqa  xmm8, xmm6
+        psrld   xmm8, 7
+        pslld   xmm6, 25
+        por     xmm6, xmm8
+        movdqa  xmm8, xmm7
+        psrld   xmm8, 7
+        pslld   xmm7, 25
+        por     xmm7, xmm8
+        movdqa  xmm8, xmm4
+        psrld   xmm8, 7
+        pslld   xmm4, 25
+        por     xmm4, xmm8
+        pxor    xmm4, xmm12
+        pxor    xmm5, xmm13
+        pxor    xmm6, xmm14
+        pxor    xmm7, xmm15
+        mov     eax, r13d
+        jne     9b
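+        /* All blocks consumed (the jne above loops while rdx != r15;
+         * the SSE ops in between never touch EFLAGS).  The feed-forward
+         * xors of rows 0-7 with rows 8-15 were folded into the last
+         * round above; transpose the results back to per-hash order and
+         * store the four 32-byte outputs at rbx. */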
+        movdqa  xmm9, xmm0
+        punpckldq xmm0, xmm1
+        punpckhdq xmm9, xmm1
+        movdqa  xmm11, xmm2
+        punpckldq xmm2, xmm3
+        punpckhdq xmm11, xmm3
+        movdqa  xmm1, xmm0
+        punpcklqdq xmm0, xmm2
+        punpckhqdq xmm1, xmm2
+        movdqa  xmm3, xmm9
+        punpcklqdq xmm9, xmm11
+        punpckhqdq xmm3, xmm11
+        movdqu  xmmword ptr [rbx], xmm0
+        movdqu  xmmword ptr [rbx+0x20], xmm1
+        movdqu  xmmword ptr [rbx+0x40], xmm9
+        movdqu  xmmword ptr [rbx+0x60], xmm3
+        movdqa  xmm9, xmm4
+        punpckldq xmm4, xmm5
+        punpckhdq xmm9, xmm5
+        movdqa  xmm11, xmm6
+        punpckldq xmm6, xmm7
+        punpckhdq xmm11, xmm7
+        movdqa  xmm5, xmm4
+        punpcklqdq xmm4, xmm6
+        punpckhqdq xmm5, xmm6
+        movdqa  xmm7, xmm9
+        punpcklqdq xmm9, xmm11
+        punpckhqdq xmm7, xmm11
+        movdqu  xmmword ptr [rbx+0x10], xmm4
+        movdqu  xmmword ptr [rbx+0x30], xmm5
+        movdqu  xmmword ptr [rbx+0x50], xmm9
+        movdqu  xmmword ptr [rbx+0x70], xmm7
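+        /* Advance the per-lane 64-bit counters: add the per-pass
+         * increment saved at [rsp+0x150] to the low dwords, detect
+         * wraparound with the signed-compare trick (xor both sides with
+         * CMP_MSB_MASK, then pcmpgtd), and subtract the resulting
+         * all-ones carry mask from the high dwords. */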
+        movdqa  xmm1, xmmword ptr [rsp+0x110]
+        movdqa  xmm0, xmm1
+        paddd   xmm1, xmmword ptr [rsp+0x150]
+        movdqa  xmmword ptr [rsp+0x110], xmm1
+        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+        pcmpgtd xmm0, xmm1
+        movdqa  xmm1, xmmword ptr [rsp+0x120]
+        psubd   xmm1, xmm0
+        movdqa  xmmword ptr [rsp+0x120], xmm1
+        add     rbx, 128
+        add     rdi, 32
+        sub     rsi, 4
+        cmp     rsi, 4
+        jnc     2b
+        test    rsi, rsi
+        jnz     3f
+4:
+        mov     rsp, rbp
+        pop     rbp
+        pop     rbx
+        pop     r12
+        pop     r13
+        pop     r14
+        pop     r15
+        ret
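+/* Fewer than four inputs remain: the code below hashes two inputs at a
+ * time with the same round structure, then falls through to a
+ * single-input tail. */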
+.p2align 5
+3:
+        test    esi, 0x2
+        je      3f
+        movups  xmm0, xmmword ptr [rcx]
+        movups  xmm1, xmmword ptr [rcx+0x10]
+        movaps  xmm8, xmm0
+        movaps  xmm9, xmm1
+        movd    xmm13, dword ptr [rsp+0x110]
+        pinsrd  xmm13, dword ptr [rsp+0x120], 1
+        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        movaps  xmmword ptr [rsp], xmm13
+        movd    xmm14, dword ptr [rsp+0x114]
+        pinsrd  xmm14, dword ptr [rsp+0x124], 1
+        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        movaps  xmmword ptr [rsp+0x10], xmm14
+        mov     r8, qword ptr [rdi]
+        mov     r9, qword ptr [rdi+0x8]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
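+        /* Two-way block loop: input A's state lives in xmm0-xmm3 and
+         * input B's in xmm8-xmm11 (cv words 0-3, cv words 4-7, IV row,
+         * counter/block-len/flags row). */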
+2:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
+        movaps  xmm10, xmm2
+        movups  xmm4, xmmword ptr [r8+rdx-0x40]
+        movups  xmm5, xmmword ptr [r8+rdx-0x30]
+        movaps  xmm3, xmm4
+        shufps  xmm4, xmm5, 136
+        shufps  xmm3, xmm5, 221
+        movaps  xmm5, xmm3
+        movups  xmm6, xmmword ptr [r8+rdx-0x20]
+        movups  xmm7, xmmword ptr [r8+rdx-0x10]
+        movaps  xmm3, xmm6
+        shufps  xmm6, xmm7, 136
+        pshufd  xmm6, xmm6, 0x93
+        shufps  xmm3, xmm7, 221
+        pshufd  xmm7, xmm3, 0x93
+        movups  xmm12, xmmword ptr [r9+rdx-0x40]
+        movups  xmm13, xmmword ptr [r9+rdx-0x30]
+        movaps  xmm11, xmm12
+        shufps  xmm12, xmm13, 136
+        shufps  xmm11, xmm13, 221
+        movaps  xmm13, xmm11
+        movups  xmm14, xmmword ptr [r9+rdx-0x20]
+        movups  xmm15, xmmword ptr [r9+rdx-0x10]
+        movaps  xmm11, xmm14
+        shufps  xmm14, xmm15, 136
+        pshufd  xmm14, xmm14, 0x93
+        shufps  xmm11, xmm15, 221
+        pshufd  xmm15, xmm11, 0x93
+        movaps  xmm3, xmmword ptr [rsp]
+        movaps  xmm11, xmmword ptr [rsp+0x10]
+        pinsrd  xmm3, eax, 3
+        pinsrd  xmm11, eax, 3
+        mov     al, 7
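+        /* al counts down the seven rounds for both inputs. */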
+9:
+        paddd   xmm0, xmm4
+        paddd   xmm8, xmm12
+        movaps  xmmword ptr [rsp+0x20], xmm4
+        movaps  xmmword ptr [rsp+0x30], xmm12
+        paddd   xmm0, xmm1
+        paddd   xmm8, xmm9
+        pxor    xmm3, xmm0
+        pxor    xmm11, xmm8
+        movaps  xmm12, xmmword ptr [ROT16+rip]
+        pshufb  xmm3, xmm12
+        pshufb  xmm11, xmm12
+        paddd   xmm2, xmm3
+        paddd   xmm10, xmm11
+        pxor    xmm1, xmm2
+        pxor    xmm9, xmm10
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 20
+        psrld   xmm4, 12
+        por     xmm1, xmm4
+        movdqa  xmm4, xmm9
+        pslld   xmm9, 20
+        psrld   xmm4, 12
+        por     xmm9, xmm4
+        paddd   xmm0, xmm5
+        paddd   xmm8, xmm13
+        movaps  xmmword ptr [rsp+0x40], xmm5
+        movaps  xmmword ptr [rsp+0x50], xmm13
+        paddd   xmm0, xmm1
+        paddd   xmm8, xmm9
+        pxor    xmm3, xmm0
+        pxor    xmm11, xmm8
+        movaps  xmm13, xmmword ptr [ROT8+rip]
+        pshufb  xmm3, xmm13
+        pshufb  xmm11, xmm13
+        paddd   xmm2, xmm3
+        paddd   xmm10, xmm11
+        pxor    xmm1, xmm2
+        pxor    xmm9, xmm10
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 25
+        psrld   xmm4, 7
+        por     xmm1, xmm4
+        movdqa  xmm4, xmm9
+        pslld   xmm9, 25
+        psrld   xmm4, 7
+        por     xmm9, xmm4
+        pshufd  xmm0, xmm0, 0x93
+        pshufd  xmm8, xmm8, 0x93
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm11, xmm11, 0x4E
+        pshufd  xmm2, xmm2, 0x39
+        pshufd  xmm10, xmm10, 0x39
+        paddd   xmm0, xmm6
+        paddd   xmm8, xmm14
+        paddd   xmm0, xmm1
+        paddd   xmm8, xmm9
+        pxor    xmm3, xmm0
+        pxor    xmm11, xmm8
+        pshufb  xmm3, xmm12
+        pshufb  xmm11, xmm12
+        paddd   xmm2, xmm3
+        paddd   xmm10, xmm11
+        pxor    xmm1, xmm2
+        pxor    xmm9, xmm10
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 20
+        psrld   xmm4, 12
+        por     xmm1, xmm4
+        movdqa  xmm4, xmm9
+        pslld   xmm9, 20
+        psrld   xmm4, 12
+        por     xmm9, xmm4
+        paddd   xmm0, xmm7
+        paddd   xmm8, xmm15
+        paddd   xmm0, xmm1
+        paddd   xmm8, xmm9
+        pxor    xmm3, xmm0
+        pxor    xmm11, xmm8
+        pshufb  xmm3, xmm13
+        pshufb  xmm11, xmm13
+        paddd   xmm2, xmm3
+        paddd   xmm10, xmm11
+        pxor    xmm1, xmm2
+        pxor    xmm9, xmm10
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 25
+        psrld   xmm4, 7
+        por     xmm1, xmm4
+        movdqa  xmm4, xmm9
+        pslld   xmm9, 25
+        psrld   xmm4, 7
+        por     xmm9, xmm4
+        pshufd  xmm0, xmm0, 0x39
+        pshufd  xmm8, xmm8, 0x39
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm11, xmm11, 0x4E
+        pshufd  xmm2, xmm2, 0x93
+        pshufd  xmm10, xmm10, 0x93
+        dec     al
+        je      9f
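+        /* Not the last round: apply the BLAKE3 message permutation to
+         * both inputs' sixteen message words (the shufps/pblendw/pshufd
+         * sequence computes the permuted schedule in registers). */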
+        movdqa  xmm12, xmmword ptr [rsp+0x20]
+        movdqa  xmm5, xmmword ptr [rsp+0x40]
+        pshufd  xmm13, xmm12, 0x0F
+        shufps  xmm12, xmm5, 214
+        pshufd  xmm4, xmm12, 0x39
+        movdqa  xmm12, xmm6
+        shufps  xmm12, xmm7, 250
+        pblendw xmm13, xmm12, 0xCC
+        movdqa  xmm12, xmm7
+        punpcklqdq xmm12, xmm5
+        pblendw xmm12, xmm6, 0xC0
+        pshufd  xmm12, xmm12, 0x78
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd  xmm7, xmm6, 0x1E
+        movdqa  xmmword ptr [rsp+0x20], xmm13
+        movdqa  xmmword ptr [rsp+0x40], xmm12
+        movdqa  xmm5, xmmword ptr [rsp+0x30]
+        movdqa  xmm13, xmmword ptr [rsp+0x50]
+        pshufd  xmm6, xmm5, 0x0F
+        shufps  xmm5, xmm13, 214
+        pshufd  xmm12, xmm5, 0x39
+        movdqa  xmm5, xmm14
+        shufps  xmm5, xmm15, 250
+        pblendw xmm6, xmm5, 0xCC
+        movdqa  xmm5, xmm15
+        punpcklqdq xmm5, xmm13
+        pblendw xmm5, xmm14, 0xC0
+        pshufd  xmm5, xmm5, 0x78
+        punpckhdq xmm13, xmm15
+        punpckldq xmm14, xmm13
+        pshufd  xmm15, xmm14, 0x1E
+        movdqa  xmm13, xmm6
+        movdqa  xmm14, xmm5
+        movdqa  xmm5, xmmword ptr [rsp+0x20]
+        movdqa  xmm6, xmmword ptr [rsp+0x40]
+        jmp     9b
+9:
+        pxor    xmm0, xmm2
+        pxor    xmm1, xmm3
+        pxor    xmm8, xmm10
+        pxor    xmm9, xmm11
+        mov     eax, r13d
+        cmp     rdx, r15
+        jne     2b
+        movups  xmmword ptr [rbx], xmm0
+        movups  xmmword ptr [rbx+0x10], xmm1
+        movups  xmmword ptr [rbx+0x20], xmm8
+        movups  xmmword ptr [rbx+0x30], xmm9
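+        /* Two outputs written; shift the saved counters down by two
+         * lanes (blendvps with the mask at [rsp+0x130]) so that a
+         * remaining single input finds its counter in lane 0. */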
+        movdqa  xmm0, xmmword ptr [rsp+0x130]
+        movdqa  xmm1, xmmword ptr [rsp+0x110]
+        movdqa  xmm2, xmmword ptr [rsp+0x120]
+        movdqu  xmm3, xmmword ptr [rsp+0x118]
+        movdqu  xmm4, xmmword ptr [rsp+0x128]
+        blendvps xmm1, xmm3, xmm0
+        blendvps xmm2, xmm4, xmm0
+        movdqa  xmmword ptr [rsp+0x110], xmm1
+        movdqa  xmmword ptr [rsp+0x120], xmm2
+        add     rdi, 16
+        add     rbx, 64
+        sub     rsi, 2
+3:
+        test    esi, 0x1
+        je      4b
+        movups  xmm0, xmmword ptr [rcx]
+        movups  xmm1, xmmword ptr [rcx+0x10]
+        movd    xmm13, dword ptr [rsp+0x110]
+        pinsrd  xmm13, dword ptr [rsp+0x120], 1
+        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        movaps  xmm14, xmmword ptr [ROT8+rip]
+        movaps  xmm15, xmmword ptr [ROT16+rip]
+        mov     r8, qword ptr [rdi]
+        movzx   eax, byte ptr [rbp+0x40]
+        or      eax, r13d
+        xor     edx, edx
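+        /* Single-input tail: one state in xmm0-xmm3, same seven-round
+         * structure as above. */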
+2:
+        mov     r14d, eax
+        or      eax, r12d
+        add     rdx, 64
+        cmp     rdx, r15
+        cmovne  eax, r14d
+        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
+        movaps  xmm3, xmm13
+        pinsrd  xmm3, eax, 3
+        movups  xmm4, xmmword ptr [r8+rdx-0x40]
+        movups  xmm5, xmmword ptr [r8+rdx-0x30]
+        movaps  xmm8, xmm4
+        shufps  xmm4, xmm5, 136
+        shufps  xmm8, xmm5, 221
+        movaps  xmm5, xmm8
+        movups  xmm6, xmmword ptr [r8+rdx-0x20]
+        movups  xmm7, xmmword ptr [r8+rdx-0x10]
+        movaps  xmm8, xmm6
+        shufps  xmm6, xmm7, 136
+        pshufd  xmm6, xmm6, 0x93
+        shufps  xmm8, xmm7, 221
+        pshufd  xmm7, xmm8, 0x93
+        mov     al, 7
+9:
+        paddd   xmm0, xmm4
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm15
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm5
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x93
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x39
+        paddd   xmm0, xmm6
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm15
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm7
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x39
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x93
+        dec     al
+        jz      9f
+        movdqa  xmm8, xmm4
+        shufps  xmm8, xmm5, 214
+        pshufd  xmm9, xmm4, 0x0F
+        pshufd  xmm4, xmm8, 0x39
+        movdqa  xmm8, xmm6
+        shufps  xmm8, xmm7, 250
+        pblendw xmm9, xmm8, 0xCC
+        movdqa  xmm8, xmm7
+        punpcklqdq xmm8, xmm5
+        pblendw xmm8, xmm6, 0xC0
+        pshufd  xmm8, xmm8, 0x78
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd  xmm7, xmm6, 0x1E
+        movdqa  xmm5, xmm9
+        movdqa  xmm6, xmm8
+        jmp     9b
+9:
+        pxor    xmm0, xmm2
+        pxor    xmm1, xmm3
+        mov     eax, r13d
+        cmp     rdx, r15
+        jne     2b
+        movups  xmmword ptr [rbx], xmm0
+        movups  xmmword ptr [rbx+0x10], xmm1
+        jmp     4b
+
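+/* blake3_compress_in_place_sse41 -- System V ABI: rdi = 32-byte cv,
+ * rsi = 64-byte block, edx = block_len, rcx = counter, r8d = flags.
+ * Compresses one block and writes the new chaining value back to rdi. */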
+.p2align 6
+blake3_compress_in_place_sse41:
+_blake3_compress_in_place_sse41:
+        _CET_ENDBR
+        movups  xmm0, xmmword ptr [rdi]
+        movups  xmm1, xmmword ptr [rdi+0x10]
+        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
+        shl     r8, 32
+        add     rdx, r8
+        movq    xmm3, rcx
+        movq    xmm4, rdx
+        punpcklqdq xmm3, xmm4
+        movups  xmm4, xmmword ptr [rsi]
+        movups  xmm5, xmmword ptr [rsi+0x10]
+        movaps  xmm8, xmm4
+        shufps  xmm4, xmm5, 136
+        shufps  xmm8, xmm5, 221
+        movaps  xmm5, xmm8
+        movups  xmm6, xmmword ptr [rsi+0x20]
+        movups  xmm7, xmmword ptr [rsi+0x30]
+        movaps  xmm8, xmm6
+        shufps  xmm6, xmm7, 136
+        pshufd  xmm6, xmm6, 0x93
+        shufps  xmm8, xmm7, 221
+        pshufd  xmm7, xmm8, 0x93
+        movaps  xmm14, xmmword ptr [ROT8+rip]
+        movaps  xmm15, xmmword ptr [ROT16+rip]
+        mov     al, 7
+9:
+        paddd   xmm0, xmm4
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm15
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm5
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x93
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x39
+        paddd   xmm0, xmm6
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm15
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm7
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x39
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x93
+        dec     al
+        jz      9f
+        movdqa  xmm8, xmm4
+        shufps  xmm8, xmm5, 214
+        pshufd  xmm9, xmm4, 0x0F
+        pshufd  xmm4, xmm8, 0x39
+        movdqa  xmm8, xmm6
+        shufps  xmm8, xmm7, 250
+        pblendw xmm9, xmm8, 0xCC
+        movdqa  xmm8, xmm7
+        punpcklqdq xmm8, xmm5
+        pblendw xmm8, xmm6, 0xC0
+        pshufd  xmm8, xmm8, 0x78
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd  xmm7, xmm6, 0x1E
+        movdqa  xmm5, xmm9
+        movdqa  xmm6, xmm8
+        jmp     9b
+9:
+        pxor    xmm0, xmm2
+        pxor    xmm1, xmm3
+        movups  xmmword ptr [rdi], xmm0
+        movups  xmmword ptr [rdi+0x10], xmm1
+        ret
+
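+/* blake3_compress_xof_sse41 -- same arguments as the in-place variant
+ * plus an output pointer in r9; writes the full 64-byte extended output
+ * (low half v[0..7] ^ v[8..15], high half v[8..15] ^ cv) and leaves the
+ * cv at rdi untouched. */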
+.p2align 6
+blake3_compress_xof_sse41:
+_blake3_compress_xof_sse41:
+        _CET_ENDBR
+        movups  xmm0, xmmword ptr [rdi]
+        movups  xmm1, xmmword ptr [rdi+0x10]
+        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
+        movzx   eax, r8b
+        movzx   edx, dl
+        shl     rax, 32
+        add     rdx, rax
+        movq    xmm3, rcx
+        movq    xmm4, rdx
+        punpcklqdq xmm3, xmm4
+        movups  xmm4, xmmword ptr [rsi]
+        movups  xmm5, xmmword ptr [rsi+0x10]
+        movaps  xmm8, xmm4
+        shufps  xmm4, xmm5, 136
+        shufps  xmm8, xmm5, 221
+        movaps  xmm5, xmm8
+        movups  xmm6, xmmword ptr [rsi+0x20]
+        movups  xmm7, xmmword ptr [rsi+0x30]
+        movaps  xmm8, xmm6
+        shufps  xmm6, xmm7, 136
+        pshufd  xmm6, xmm6, 0x93
+        shufps  xmm8, xmm7, 221
+        pshufd  xmm7, xmm8, 0x93
+        movaps  xmm14, xmmword ptr [ROT8+rip]
+        movaps  xmm15, xmmword ptr [ROT16+rip]
+        mov     al, 7
+9:
+        paddd   xmm0, xmm4
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm15
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm5
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x93
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x39
+        paddd   xmm0, xmm6
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm15
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 20
+        psrld   xmm11, 12
+        por     xmm1, xmm11
+        paddd   xmm0, xmm7
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        pshufb  xmm3, xmm14
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm11, xmm1
+        pslld   xmm1, 25
+        psrld   xmm11, 7
+        por     xmm1, xmm11
+        pshufd  xmm0, xmm0, 0x39
+        pshufd  xmm3, xmm3, 0x4E
+        pshufd  xmm2, xmm2, 0x93
+        dec     al
+        jz      9f
+        movdqa  xmm8, xmm4
+        shufps  xmm8, xmm5, 214
+        pshufd  xmm9, xmm4, 0x0F
+        pshufd  xmm4, xmm8, 0x39
+        movdqa  xmm8, xmm6
+        shufps  xmm8, xmm7, 250
+        pblendw xmm9, xmm8, 0xCC
+        movdqa  xmm8, xmm7
+        punpcklqdq xmm8, xmm5
+        pblendw xmm8, xmm6, 0xC0
+        pshufd  xmm8, xmm8, 0x78
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd  xmm7, xmm6, 0x1E
+        movdqa  xmm5, xmm9
+        movdqa  xmm6, xmm8
+        jmp     9b
+9:
+        movdqu  xmm4, xmmword ptr [rdi]
+        movdqu  xmm5, xmmword ptr [rdi+0x10]
+        pxor    xmm0, xmm2
+        pxor    xmm1, xmm3
+        pxor    xmm2, xmm4
+        pxor    xmm3, xmm5
+        movups  xmmword ptr [r9], xmm0
+        movups  xmmword ptr [r9+0x10], xmm1
+        movups  xmmword ptr [r9+0x20], xmm2
+        movups  xmmword ptr [r9+0x30], xmm3
+        ret
+
+
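+/* Read-only constants shared by the routines above: the first four
+ * BLAKE3 IV words (packed and broadcast per word), pshufb masks for the
+ * 16- and 8-bit right-rotates, lane offsets and stride for the block
+ * counters, the block length, and the sign-flip mask for the unsigned
+ * compare used when carrying counters. */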
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align  6
+BLAKE3_IV:
+        .long  0x6A09E667, 0xBB67AE85
+        .long  0x3C6EF372, 0xA54FF53A
+ROT16:
+        .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8:
+        .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+ADD0:
+        .long  0, 1, 2, 3
+ADD1:
+        .long  4, 4, 4, 4
+BLAKE3_IV_0:
+        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:
+        .long  64, 64, 64, 64
+CMP_MSB_MASK:
+        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000