Welcome to little lamb

Code » limb » master » tree

[master] / src / include / blake3.h

/* This file is part of limb                           https://lila.oss/limb
 * Copyright (C) 2023 Olivier Brunel                          jjk@jjacky.com */
/* Based on official BLAKE3 implementation:
 *  https://github.com/BLAKE3-team/BLAKE3
 * Copyright (C) 2019-2020 Samuel Neves and Jack O'Connor */
/* SPDX-License-Identifier: CC0-1.0 OR Apache-2.0 */
#ifndef LIMB_BLAKE3_BLAKE3_H
#define LIMB_BLAKE3_BLAKE3_H

#include <string.h>
#include <limb/blake3.h>
#include <limb/gccattributes.h>
#include <limb/u64.h>

/* Internal flag bits. Every enumerator occupies its own bit, so several of
 * them can be OR-ed together into one flags value. */
enum blake3_flags {
  CHUNK_START         = 0x01,
  CHUNK_END           = 0x02,
  PARENT              = 0x04,
  ROOT                = 0x08,
  KEYED_HASH          = 0x10,
  DERIVE_KEY_CONTEXT  = 0x20,
  DERIVE_KEY_MATERIAL = 0x40,
};

/* Qualifier for the small helpers defined in this header: internal linkage
 * (static) plus the GCC/Clang always_inline attribute to force inlining. */
#define INLINE static inline __attribute__((always_inline))

/* Target architecture detection, based on compiler-predefined macros.
 * Defines IS_X86 for any x86 target, plus IS_X86_64 or IS_X86_32 for the
 * specific width, and IS_AARCH64 for 64-bit ARM. */

/* 64-bit x86 */
#if defined(__x86_64__) || defined(_M_X64) 
#define IS_X86
#define IS_X86_64
#endif

/* 32-bit x86 */
#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

/* 64-bit ARM */
#if defined(__aarch64__) || defined(_M_ARM64)
#define IS_AARCH64
#endif

/* BLAKE3_USE_NEON is always defined (as 0 or 1) after this block. A value
 * supplied by the build (e.g. on the compiler command line) is kept as-is. */
#if !defined(BLAKE3_USE_NEON)
  /* If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness */
  #if defined(IS_AARCH64)
    #define BLAKE3_USE_NEON 1
  #else
    #define BLAKE3_USE_NEON 0
  #endif
#endif

/* Compile-time upper bound on the SIMD degree for the target: 16 on x86,
 * 4 when NEON is enabled, 1 otherwise. x86 takes precedence if both apply.
 * NOTE(review): "degree" presumably means the number of inputs an
 * implementation can hash in parallel -- confirm against the callers. */
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

/* There are some places where we want a static size that's equal to the
 * MAX_SIMD_DEGREE, but also at least 2.
 * Evaluates to MAX_SIMD_DEGREE when that is greater than 2, and to 2
 * otherwise; both operands are constants, so the result is usable as an
 * array size. */
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)

/* Eight constant 32-bit words; going by the name, presumably the BLAKE3
 * initialization vector (confirm against the BLAKE3 specification).
 * Defined static in this header, so each translation unit that includes it
 * gets its own private copy. */
static const u32 IV[8] = {
    0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
    0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
};

/* Seven rows of sixteen indices. Row 0 is the identity order and every row
 * contains each value 0..15 exactly once, i.e. each row is a permutation.
 * NOTE(review): presumably row r gives the order in which the 16 message
 * words are consumed in round r -- confirm against the compression code.
 * Defined static in this header, so each including translation unit gets
 * its own private copy. */
static const u8 MSG_SCHEDULE[7][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    {  2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8 },
    {  3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1 },
    { 10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6 },
    { 12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4 },
    {  9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7 },
    { 11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13 },
};

/* Return how many bits are set in x (0 for x == 0, at most 64). */
INLINE unsigned int
popcnt(u64 x)
{
#if defined(__GNUC__) || defined(__clang__)
    /* GCC and Clang provide a builtin for this. */
    const int ones = __builtin_popcountll(x);
    return (unsigned int) ones;
#else
    /* Portable fallback: `x &= x - 1` clears the lowest set bit, so the
     * loop body runs exactly once per set bit. */
    unsigned int ones;
    for (ones = 0; x != 0; ++ones) {
        x &= x - 1;
    }
    return ones;
#endif
}

/* Round x down to a power of two: the result is the largest power of two
 * that does not exceed x. An input of 0 yields 1.
 * Relies on msb64() (declared elsewhere, presumably in limb/u64.h) giving
 * the 1-based position of the highest set bit -- hence the `- 1`. */
INLINE u64
round_down_to_power_of_2(u64 x)
{
    /* Force bit 0 on so msb64() never sees zero; this cannot change which
     * bit is the highest one for any non-zero x. */
    const u64 nonzero = x | 1;
    const int shift = msb64(nonzero) - 1;

    return 1ULL << shift;
}

/* Return the low 32 bits of a 64-bit counter. */
INLINE u32
counter_low(u64 counter)
{
    const u64 low_half = counter & 0xFFFFFFFFULL;

    return (u32) low_half;
}

/* Return the high 32 bits of a 64-bit counter. */
INLINE u32
counter_high(u64 counter)
{
    const u64 high_half = counter >> 32;

    return (u32) high_half;
}

/* Read a 32-bit little-endian word from the 4 bytes at src.
 * The bytes are read one at a time, so src needs no particular alignment
 * and the result does not depend on the host byte order. */
INLINE u32
load32(const void *src)
{
    const u8 *bytes = (const u8 *) src;
    u32 word = 0;

    /* Start at the most significant byte (index 3) and shift earlier bytes
     * up as the lower ones are merged in. */
    for (int i = 3; i >= 0; --i) {
        word = (word << 8) | (u32) bytes[i];
    }
    return word;
}

/* Decode a key into eight 32-bit words.
 * Word i is read with load32() from the 4 bytes starting at key[4 * i], so
 * the first 32 bytes of key are consumed and key_words[0..7] are written. */
INLINE void
load_key_words(const u8 key[BLAKE3_KEY_LEN], u32 key_words[8])
{
    for (unsigned int i = 0; i < 8; ++i) {
        key_words[i] = load32(&key[i * 4]);
    }
}

/* Write the 32-bit word w to the 4 bytes at dst in little-endian order
 * (least significant byte first). Bytes are stored one at a time, so dst
 * needs no particular alignment. */
INLINE void
store32(void *dst, u32 w)
{
    u8 *bytes = (u8 *) dst;

    for (unsigned int i = 0; i < 4; ++i) {
        bytes[i] = (u8) (w >> (8 * i));
    }
}

/* Serialize eight 32-bit words into 32 bytes.
 *  bytes_out - destination; bytes 0..31 are written
 *  cv_words  - the eight words to serialize; only read, hence const, so
 *              callers may pass either a const or a non-const array
 * Word i goes to bytes_out[4 * i .. 4 * i + 3] via store32(), i.e. least
 * significant byte first. */
INLINE void
store_cv_words(u8 bytes_out[32], const u32 cv_words[8])
{
    for (unsigned int i = 0; i < 8; ++i) {
        store32(&bytes_out[i * 4], cv_words[i]);
    }
}

/* Declaration only; the definition lives outside this header.
 *  cv        - eight 32-bit words; non-const, so presumably the chaining
 *              value that is overwritten with the result (confirm against
 *              the definition)
 *  block     - input block of BLAKE3_BLOCK_LEN bytes
 *  block_len - presumably the number of meaningful bytes in block
 *  counter   - 64-bit counter value
 *  flags     - presumably a combination of enum blake3_flags bits
 * gccattr_hidden is not defined in this file (presumably it comes from
 * limb/gccattributes.h and hides the symbol from the library's exports). */
void blake3_compress_in_place(u32 cv[8],
                              const u8 block[BLAKE3_BLOCK_LEN],
                              u8 block_len, u64 counter, u8 flags) gccattr_hidden;

/* Declaration only; the definition lives outside this header.
 * Unlike blake3_compress_in_place(), cv is const here and the result goes
 * to a separate 64-byte buffer.
 *  cv        - eight 32-bit input words (not modified)
 *  block     - input block of BLAKE3_BLOCK_LEN bytes
 *  block_len - presumably the number of meaningful bytes in block
 *  counter   - 64-bit counter value
 *  flags     - presumably a combination of enum blake3_flags bits
 *  out       - receives 64 bytes of output
 * NOTE(review): "xof" presumably stands for extendable-output function --
 * confirm against the definition. */
void blake3_compress_xof(const u32 cv[8],
                         const u8 block[BLAKE3_BLOCK_LEN],
                         u8 block_len, u64 counter, u8 flags, u8 out[64]) gccattr_hidden;

/* Declaration only; the definition lives outside this header.
 *  inputs            - array of num_inputs pointers to input buffers
 *  num_inputs        - number of entries in inputs
 *  blocks            - presumably the number of blocks in each input
 *  key               - eight 32-bit key words (not modified)
 *  counter           - 64-bit starting counter
 *  increment_counter - presumably a boolean: whether the counter advances
 *                      from one input to the next
 *  flags             - presumably enum blake3_flags bits applied throughout
 *  flags_start       - presumably extra flags for the first block
 *  flags_end         - presumably extra flags for the last block
 *  out               - output buffer; its required size is not visible
 *                      here -- TODO confirm against the definition */
void blake3_hash_many(const u8 *const *inputs, size_t num_inputs,
                      size_t blocks, const u32 key[8], u64 counter,
                      char increment_counter, u8 flags,
                      u8 flags_start, u8 flags_end, u8 *out) gccattr_hidden;

/* Declaration only; the definition lives outside this header.
 * NOTE(review): presumably returns the SIMD degree actually usable at run
 * time, bounded by MAX_SIMD_DEGREE -- confirm against the definition. */
size_t blake3_simd_degree(void) gccattr_hidden;


/* Declarations for implementation-specific functions.
 *
 * The *_portable variants are declared unconditionally, on every target.
 * Their parameter lists are identical to those of the generic
 * blake3_compress_in_place(), blake3_compress_xof() and blake3_hash_many()
 * declared in this header. Definitions live outside this header. */
void blake3_compress_in_place_portable(u32 cv[8],
                                       const u8 block[BLAKE3_BLOCK_LEN],
                                       u8 block_len, u64 counter, u8 flags) gccattr_hidden;

void blake3_compress_xof_portable(const u32 cv[8],
                                  const u8 block[BLAKE3_BLOCK_LEN],
                                  u8 block_len, u64 counter, u8 flags, u8 out[64]) gccattr_hidden;

void blake3_hash_many_portable(const u8 *const *inputs, size_t num_inputs,
                               size_t blocks, const u32 key[8],
                               u64 counter, char increment_counter,
                               u8 flags, u8 flags_start,
                               u8 flags_end, u8 *out) gccattr_hidden;

/* x86-only implementations, one group per instruction-set extension.
 * Each group is declared unless the matching BLAKE3_NO_<EXT> macro is
 * defined. Parameter lists mirror the generic functions declared in this
 * header. Note that the AVX2 group declares only a hash_many variant. */
#if defined(IS_X86)
/* SSE2 */
#if !defined(BLAKE3_NO_SSE2)
void blake3_compress_in_place_sse2(u32 cv[8],
                                   const u8 block[BLAKE3_BLOCK_LEN],
                                   u8 block_len, u64 counter,
                                   u8 flags) gccattr_hidden;
void blake3_compress_xof_sse2(const u32 cv[8],
                              const u8 block[BLAKE3_BLOCK_LEN],
                              u8 block_len, u64 counter,
                              u8 flags, u8 out[64]) gccattr_hidden;
void blake3_hash_many_sse2(const u8 *const *inputs, size_t num_inputs,
                           size_t blocks, const u32 key[8],
                           u64 counter, char increment_counter,
                           u8 flags, u8 flags_start,
                           u8 flags_end, u8 *out) gccattr_hidden;
#endif
/* SSE4.1 */
#if !defined(BLAKE3_NO_SSE41)
void blake3_compress_in_place_sse41(u32 cv[8],
                                    const u8 block[BLAKE3_BLOCK_LEN],
                                    u8 block_len, u64 counter,
                                    u8 flags) gccattr_hidden;
void blake3_compress_xof_sse41(const u32 cv[8],
                               const u8 block[BLAKE3_BLOCK_LEN],
                               u8 block_len, u64 counter,
                               u8 flags, u8 out[64]) gccattr_hidden;
void blake3_hash_many_sse41(const u8 *const *inputs, size_t num_inputs,
                            size_t blocks, const u32 key[8],
                            u64 counter, char increment_counter,
                            u8 flags, u8 flags_start,
                            u8 flags_end, u8 *out) gccattr_hidden;
#endif
/* AVX2 (hash_many only) */
#if !defined(BLAKE3_NO_AVX2)
void blake3_hash_many_avx2(const u8 *const *inputs, size_t num_inputs,
                           size_t blocks, const u32 key[8],
                           u64 counter, char increment_counter,
                           u8 flags, u8 flags_start,
                           u8 flags_end, u8 *out) gccattr_hidden;
#endif
/* AVX-512 */
#if !defined(BLAKE3_NO_AVX512)
void blake3_compress_in_place_avx512(u32 cv[8],
                                     const u8 block[BLAKE3_BLOCK_LEN],
                                     u8 block_len, u64 counter,
                                     u8 flags) gccattr_hidden;

void blake3_compress_xof_avx512(const u32 cv[8],
                                const u8 block[BLAKE3_BLOCK_LEN],
                                u8 block_len, u64 counter,
                                u8 flags, u8 out[64]) gccattr_hidden;

void blake3_hash_many_avx512(const u8 *const *inputs, size_t num_inputs,
                             size_t blocks, const u32 key[8],
                             u64 counter, char increment_counter,
                             u8 flags, u8 flags_start,
                             u8 flags_end, u8 *out) gccattr_hidden;
#endif
#endif

/* NEON implementation, declared only when BLAKE3_USE_NEON is 1. Only a
 * hash_many variant is declared; its parameter list mirrors the generic
 * blake3_hash_many() declared in this header. */
#if BLAKE3_USE_NEON == 1
void blake3_hash_many_neon(const u8 *const *inputs, size_t num_inputs,
                           size_t blocks, const u32 key[8],
                           u64 counter, char increment_counter,
                           u8 flags, u8 flags_start,
                           u8 flags_end, u8 *out) gccattr_hidden;
#endif


#endif /* LIMB_BLAKE3_BLAKE3_H */