/* This file is part of limb https://lila.oss/limb
* Copyright (C) 2023 Olivier Brunel jjk@jjacky.com */
/* Based on official BLAKE3 implementation:
* https://github.com/BLAKE3-team/BLAKE3
* Copyright (C) 2019-2020 Samuel Neves and Jack O'Connor */
/* SPDX-License-Identifier: CC0-1.0 OR Apache-2.0 */
#ifndef LIMB_BLAKE3_BLAKE3_H
#define LIMB_BLAKE3_BLAKE3_H
#include <string.h>
#include <limb/blake3.h>
#include <limb/gccattributes.h>
#include <limb/u64.h>
/* Internal flag bits.
 * Each constant occupies its own bit, so any combination can be OR'ed into a
 * single small integer (the prototypes in this header carry flags as u8).
 * The names presumably mirror the domain-separation flags of the BLAKE3
 * specification -- see the spec for the meaning of each one. */
enum blake3_flags {
    CHUNK_START         = 0x01, /* bit 0 */
    CHUNK_END           = 0x02, /* bit 1 */
    PARENT              = 0x04, /* bit 2 */
    ROOT                = 0x08, /* bit 3 */
    KEYED_HASH          = 0x10, /* bit 4 */
    DERIVE_KEY_CONTEXT  = 0x20, /* bit 5 */
    DERIVE_KEY_MATERIAL = 0x40, /* bit 6 */
};
/* Prefix used for the small helpers defined in this header: internal linkage
 * plus a request that the compiler always inline them. Relies on the GNU
 * __attribute__ syntax, so it needs a compiler that accepts it (GCC, Clang). */
#define INLINE static inline __attribute__((always_inline))
/* Target architecture detection from compiler-predefined macros.
 * IS_X86 is defined for both 32- and 64-bit x86; IS_X86_64 / IS_X86_32 tell
 * the two apart. */
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif
#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif
/* 64-bit ARM. */
#if defined(__aarch64__) || defined(_M_ARM64)
#define IS_AARCH64
#endif
/* BLAKE3_USE_NEON may be predefined by the build (e.g. on the compiler command
 * line); a value set that way is left untouched. */
#if !defined(BLAKE3_USE_NEON)
/* Not set manually: default to 1 on AArch64 and to 0 everywhere else. */
#if defined(IS_AARCH64)
#define BLAKE3_USE_NEON 1
#else
#define BLAKE3_USE_NEON 0
#endif
#endif
/* Compile-time value of MAX_SIMD_DEGREE for the target: 16 on x86, 4 when
 * NEON is enabled, 1 otherwise. Presumably the largest number of inputs any
 * available implementation processes in parallel -- confirm against the
 * hash_many implementations. */
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif
/* Some static sizes must equal MAX_SIMD_DEGREE but never be smaller than 2;
 * this evaluates to max(MAX_SIMD_DEGREE, 2). */
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
/* Eight 32-bit initialization words ("IV"). The values are expected to match
 * the IV of the upstream BLAKE3 implementation this file is based on.
 * Being static in a header, every translation unit including it gets its own
 * copy. */
static const u32 IV[8] = {
0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
};
/* Message word orderings: 7 rows, each listing the 16 indices 0..15 exactly
 * once. Row 0 is the identity ordering. Presumably row r gives the order in
 * which the 16 message words are consumed in round r of the compression
 * function -- confirm against the portable implementation. */
static const u8 MSG_SCHEDULE[7][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 },
{ 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 },
{ 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 },
{ 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 },
{ 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 },
{ 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 },
};
/* Return how many bits of x are set. */
INLINE unsigned int
popcnt(u64 x)
{
#if !defined(__GNUC__) && !defined(__clang__)
    /* Portable fallback: x & (x - 1) clears the lowest set bit, so the loop
     * runs once per set bit. */
    unsigned int bits;

    for (bits = 0; x != 0; ++bits) {
        x &= x - 1;
    }
    return bits;
#else
    /* GCC and Clang provide a builtin for this. */
    return (unsigned int) __builtin_popcountll(x);
#endif
}
/* Return the greatest power of two that does not exceed x.
 * x == 0 is special-cased to yield 1. */
INLINE u64
round_down_to_power_of_2(u64 x)
{
    /* Forcing bit 0 on keeps the argument non-zero without changing the
     * position of the highest set bit of any non-zero x. */
    const u64 nonzero = x | 1;

    /* msb64()'s result is reduced by one to obtain the shift count
     * (presumably it reports a 1-based bit position -- see limb/u64.h). */
    return 1ULL << (msb64(nonzero) - 1);
}
/* Return counter converted to u32, i.e. its low-order part. */
INLINE u32
counter_low(u64 counter)
{
    const u32 low = (u32) counter;

    return low;
}
/* Return bits 32 and up of counter, converted to u32. */
INLINE u32
counter_high(u64 counter)
{
    const u64 upper = counter >> 32;

    return (u32) upper;
}
/* Read a 32-bit little-endian value from the 4 bytes at src.
 * Works byte by byte, so src needs no particular alignment and the result
 * does not depend on host endianness. */
INLINE u32
load32(const void *src)
{
    const u8 *bytes = (const u8 *) src;
    const u32 b0 = (u32) bytes[0];
    const u32 b1 = (u32) bytes[1];
    const u32 b2 = (u32) bytes[2];
    const u32 b3 = (u32) bytes[3];

    /* bytes[0] is the least significant byte. */
    return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
}
/* Decode the first 32 bytes of key into 8 words, each read with load32()
 * (little-endian): key_words[i] comes from key[4*i .. 4*i+3]. */
INLINE void
load_key_words(const u8 key[BLAKE3_KEY_LEN], u32 key_words[8])
{
    for (size_t i = 0; i < 8; ++i) {
        key_words[i] = load32(key + 4 * i);
    }
}
/* Write w to the 4 bytes at dst in little-endian order.
 * Works byte by byte, so dst needs no particular alignment. */
INLINE void
store32(void *dst, u32 w)
{
    u8 *out = (u8 *) dst;

    /* Emit the least significant byte first, then shift the next one down. */
    for (unsigned int i = 0; i < 4; ++i) {
        out[i] = (u8) w;
        w >>= 8;
    }
}
/* Serialize 8 words into 32 bytes, each written with store32()
 * (little-endian): cv_words[i] goes to bytes_out[4*i .. 4*i+3].
 *
 * cv_words is only read, hence const-qualified; callers holding either a
 * u32 * or a const u32 * can pass it without a cast. */
INLINE void
store_cv_words(u8 bytes_out[32], const u32 cv_words[8])
{
    for (size_t i = 0; i < 8; ++i) {
        store32(bytes_out + 4 * i, cv_words[i]);
    }
}
/* Implementation-independent entry points (names carry no ISA suffix).
 * Their bodies are not in this header; presumably they select one of the
 * suffixed variants declared further down -- confirm in the dispatch source.
 * All are declared with gccattr_hidden (see limb/gccattributes.h). */

/* Compression taking a non-const cv[8]; going by the name, the result is
 * written back into cv. */
void blake3_compress_in_place(u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter, u8 flags) gccattr_hidden;
/* Compression taking a const cv[8] and a separate 64-byte out buffer. */
void blake3_compress_xof(const u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter, u8 flags, u8 out[64]) gccattr_hidden;
/* Multi-input hashing: num_inputs input pointers, results written to out.
 * NOTE(review): per-input length (blocks) and the output layout are not
 * visible from this header -- check the implementations. */
void blake3_hash_many(const u8 *const *inputs, size_t num_inputs,
size_t blocks, const u32 key[8], u64 counter,
char increment_counter, u8 flags,
u8 flags_start, u8 flags_end, u8 *out) gccattr_hidden;
/* Presumably the SIMD degree usable at run time (cf. MAX_SIMD_DEGREE, its
 * compile-time counterpart) -- confirm in the dispatch source. */
size_t blake3_simd_degree(void) gccattr_hidden;
/* Declarations for implementation-specific functions. */

/* "_portable" variants: declared unconditionally, whatever the target
 * architecture. Signatures are identical to the un-suffixed entry points. */
void blake3_compress_in_place_portable(u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter, u8 flags) gccattr_hidden;
void blake3_compress_xof_portable(const u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter, u8 flags, u8 out[64]) gccattr_hidden;
void blake3_hash_many_portable(const u8 *const *inputs, size_t num_inputs,
size_t blocks, const u32 key[8],
u64 counter, char increment_counter,
u8 flags, u8 flags_start,
u8 flags_end, u8 *out) gccattr_hidden;
/* x86 SIMD variants. Each instruction-set family is declared unless the
 * matching BLAKE3_NO_<ISA> macro is defined. Signatures are identical to the
 * portable functions of the same base name. */
#if defined(IS_X86)
/* SSE2: compress_in_place, compress_xof and hash_many. */
#if !defined(BLAKE3_NO_SSE2)
void blake3_compress_in_place_sse2(u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter,
u8 flags) gccattr_hidden;
void blake3_compress_xof_sse2(const u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter,
u8 flags, u8 out[64]) gccattr_hidden;
void blake3_hash_many_sse2(const u8 *const *inputs, size_t num_inputs,
size_t blocks, const u32 key[8],
u64 counter, char increment_counter,
u8 flags, u8 flags_start,
u8 flags_end, u8 *out) gccattr_hidden;
#endif
/* SSE4.1: compress_in_place, compress_xof and hash_many. */
#if !defined(BLAKE3_NO_SSE41)
void blake3_compress_in_place_sse41(u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter,
u8 flags) gccattr_hidden;
void blake3_compress_xof_sse41(const u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter,
u8 flags, u8 out[64]) gccattr_hidden;
void blake3_hash_many_sse41(const u8 *const *inputs, size_t num_inputs,
size_t blocks, const u32 key[8],
u64 counter, char increment_counter,
u8 flags, u8 flags_start,
u8 flags_end, u8 *out) gccattr_hidden;
#endif
/* AVX2: only hash_many is declared; there are no AVX2 compress functions. */
#if !defined(BLAKE3_NO_AVX2)
void blake3_hash_many_avx2(const u8 *const *inputs, size_t num_inputs,
size_t blocks, const u32 key[8],
u64 counter, char increment_counter,
u8 flags, u8 flags_start,
u8 flags_end, u8 *out) gccattr_hidden;
#endif
/* AVX-512: compress_in_place, compress_xof and hash_many. */
#if !defined(BLAKE3_NO_AVX512)
void blake3_compress_in_place_avx512(u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter,
u8 flags) gccattr_hidden;
void blake3_compress_xof_avx512(const u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter,
u8 flags, u8 out[64]) gccattr_hidden;
void blake3_hash_many_avx512(const u8 *const *inputs, size_t num_inputs,
size_t blocks, const u32 key[8],
u64 counter, char increment_counter,
u8 flags, u8 flags_start,
u8 flags_end, u8 *out) gccattr_hidden;
#endif
#endif
/* NEON variant, declared only when BLAKE3_USE_NEON is 1. Only hash_many is
 * declared; there are no NEON compress functions. */
#if BLAKE3_USE_NEON == 1
void blake3_hash_many_neon(const u8 *const *inputs, size_t num_inputs,
size_t blocks, const u32 key[8],
u64 counter, char increment_counter,
u8 flags, u8 flags_start,
u8 flags_end, u8 *out) gccattr_hidden;
#endif
#endif /* LIMB_BLAKE3_BLAKE3_H */