Welcome to little lamb

Code » limb » master » tree

[master] / src / liblimb / blake3.h / blake3_dispatch.c

/* This file is part of limb                           https://lila.oss/limb
 * Copyright (C) 2023 Olivier Brunel                          jjk@jjacky.com */
/* Based on official BLAKE3 implementation:
 *  https://github.com/BLAKE3-team/BLAKE3
 * Copyright (C) 2019-2020 Samuel Neves and Jack O'Connor */
/* SPDX-License-Identifier: CC0-1.0 OR Apache-2.0 */
#include "blake3.h"

/* The x86 feature detection in this file is written with GNU-style inline
 * assembly. When IS_X86 is set but the compiler does not define __GNUC__,
 * IS_X86 is dropped again so every entry point below compiles its non-x86
 * code instead. */
#if defined(IS_X86)
#if defined(__GNUC__)
#include <immintrin.h>
#else
#undef IS_X86 /* Unimplemented! */
#endif
#endif

/* Evaluate x and discard the result. Used on the local `features` variable so
 * it still counts as used when every SIMD branch has been compiled out. */
#define MAYBE_UNUSED(x) (void)((x))

#if defined(IS_X86)
static u64
xgetbv(void)
{
    u32 eax = 0, edx = 0;
    __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
    return ((u64) edx << 32) | eax;
}

/* Execute CPUID with EAX = id and store the resulting EAX, EBX, ECX and EDX
 * values in out[0], out[1], out[2] and out[3] respectively. */
static void
cpuid(u32 out[4], u32 id)
{
#if  defined(__i386__) || defined(_M_IX86)
    /* 32-bit build: EBX is not listed as an output. Its current value is
     * first copied into a scratch register (%1); after CPUID, xchgl swaps the
     * two so %1 ends up holding CPUID's EBX result and EBX gets its previous
     * value back. Presumably this is needed because EBX may be reserved
     * (e.g. as the PIC register) on i386 -- confirm for the target ABI. */
    __asm__ __volatile__("movl %%ebx, %1\n"
                         "cpuid\n"
                         "xchgl %1, %%ebx\n"
                         : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                         : "a"(id));
#else
    /* Otherwise EBX is bound directly as an output register. */
    __asm__ __volatile__("cpuid\n"
                         : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                         : "a"(id));
#endif
}

/* Execute CPUID with EAX = id and ECX = sid (the sub-leaf selector) and store
 * the resulting EAX, EBX, ECX and EDX values in out[0]..out[3], in that
 * order. */
static void
cpuidex(u32 out[4], u32 id, u32 sid)
{
#if  defined(__i386__) || defined(_M_IX86)
    /* 32-bit build: same EBX handling as cpuid(). EBX is copied to a scratch
     * register (%1) before CPUID and swapped back afterwards, leaving CPUID's
     * EBX result in %1 and the previous EBX value restored. Presumably
     * because EBX may be reserved on i386 -- confirm for the target ABI. */
    __asm__ __volatile__("movl %%ebx, %1\n"
                         "cpuid\n"
                         "xchgl %1, %%ebx\n"
                         : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                         : "a"(id), "c"(sid));
#else
    /* Otherwise EBX is bound directly as an output register. */
    __asm__ __volatile__("cpuid\n"
                         : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                         : "a"(id), "c"(sid));
#endif
}

#endif /* IS_X86 */

/* Bit flags describing the SIMD extensions found by get_cpu_features().
 * Flags are combined with bitwise OR; UNDEFINED is the sentinel meaning
 * "detection has not run yet". */
enum cpu_feature {
    SSE2      = 0x00000001,
    SSSE3     = 0x00000002,
    SSE41     = 0x00000004,
    AVX       = 0x00000008,
    AVX2      = 0x00000010,
    AVX512F   = 0x00000020,
    AVX512VL  = 0x00000040,
    /* bits 7..29 are not assigned */
    UNDEFINED = 0x40000000
};

#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
enum cpu_feature g_cpu_features = UNDEFINED;

/* Return the set of SIMD features usable on the running machine, as a bitwise
 * OR of enum cpu_feature flags.
 *
 * If g_cpu_features already holds something other than UNDEFINED, that value
 * is returned as is. Otherwise, on x86, the features are probed with CPUID
 * (and XGETBV for the OS-enabled register state), stored in g_cpu_features
 * and returned. When IS_X86 is not defined nothing is probed and 0 is
 * returned.
 *
 * The function is static unless BLAKE3_TESTING is defined. */
#if !defined(BLAKE3_TESTING)
static
#endif
enum cpu_feature
get_cpu_features(void)
{

    if (g_cpu_features != UNDEFINED)
        return g_cpu_features;

#if defined(IS_X86)
    u32 regs[4] = {0};
    u32 *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
    (void) edx; /* not read on x86-64 builds */
    enum cpu_feature features = 0;

    /* Leaf 0: EAX is the highest supported basic leaf. Kept unsigned so the
     * comparison below never sees a value converted to a negative int. */
    cpuid(regs, 0);
    const u32 max_id = *eax;

    /* Leaf 1: feature bits in ECX and EDX. */
    cpuid(regs, 1);
#if defined(__amd64__) || defined(_M_X64)
    features |= SSE2;
#else
    if (*edx & (1UL << 26))
        features |= SSE2;
#endif
    if (*ecx & (1UL << 9)) /* SSSE3 is ECX bit 9; bit 0 is SSE3 */
        features |= SSSE3;
    if (*ecx & (1UL << 19))
        features |= SSE41;

    if (*ecx & (1UL << 27)) { /* OSXSAVE */
        /* The AVX families also need the OS to have enabled the matching
         * register state, which is what the XGETBV mask reports. */
        const u64 mask = xgetbv();
        if ((mask & 6) == 6) { /* SSE and AVX states */
            if (*ecx & (1UL << 28))
                features |= AVX;
            if (max_id >= 7) {
                /* Leaf 7, sub-leaf 0: extended feature bits in EBX. */
                cpuidex(regs, 7, 0);
                if (*ebx & (1UL << 5))
                    features |= AVX2;
                if ((mask & 224) == 224) { /* Opmask, ZMM_Hi256, Hi16_Zmm */
                    if (*ebx & (1UL << 31))
                        features |= AVX512VL;
                    if (*ebx & (1UL << 16))
                        features |= AVX512F;
                }
            }
        }
    }
    g_cpu_features = features;
    return features;
#else
    /* How to detect NEON? */
    return 0;
#endif
}

/* Forward a blake3_compress_in_place() call to the best available backend.
 *
 * On x86 the detected CPU features select the backend, tried in the order
 * AVX-512 (AVX512VL flag), SSE4.1, SSE2. Each can be compiled out with the
 * matching BLAKE3_NO_* macro. If no x86 backend is taken, the portable
 * implementation is called. All arguments are passed through unchanged. */
void blake3_compress_in_place(u32 cv[8],
                              const u8 block[BLAKE3_BLOCK_LEN],
                              u8 block_len, u64 counter,
                              u8 flags)
{
#if defined(IS_X86)
    const enum cpu_feature cpu = get_cpu_features();
    MAYBE_UNUSED(cpu);

#ifndef BLAKE3_NO_AVX512
    if ((cpu & AVX512VL) != 0) {
        blake3_compress_in_place_avx512(cv, block, block_len,
                                        counter, flags);
        return;
    }
#endif /* BLAKE3_NO_AVX512 */

#ifndef BLAKE3_NO_SSE41
    if ((cpu & SSE41) != 0) {
        blake3_compress_in_place_sse41(cv, block, block_len,
                                       counter, flags);
        return;
    }
#endif /* BLAKE3_NO_SSE41 */

#ifndef BLAKE3_NO_SSE2
    if ((cpu & SSE2) != 0) {
        blake3_compress_in_place_sse2(cv, block, block_len,
                                      counter, flags);
        return;
    }
#endif /* BLAKE3_NO_SSE2 */
#endif /* IS_X86 */

    /* No SIMD backend was selected. */
    blake3_compress_in_place_portable(cv, block, block_len,
                                      counter, flags);
}

/* Forward a blake3_compress_xof() call to the best available backend.
 *
 * On x86 the detected CPU features select the backend, tried in the order
 * AVX-512 (AVX512VL flag), SSE4.1, SSE2. Each can be compiled out with the
 * matching BLAKE3_NO_* macro. If no x86 backend is taken, the portable
 * implementation is called. All arguments, including the `out` buffer, are
 * passed through unchanged. */
void blake3_compress_xof(const u32 cv[8],
                         const u8 block[BLAKE3_BLOCK_LEN],
                         u8 block_len, u64 counter, u8 flags,
                         u8 out[64])
{
#if defined(IS_X86)
    const enum cpu_feature cpu = get_cpu_features();
    MAYBE_UNUSED(cpu);

#ifndef BLAKE3_NO_AVX512
    if ((cpu & AVX512VL) != 0) {
        blake3_compress_xof_avx512(cv, block, block_len,
                                   counter, flags, out);
        return;
    }
#endif /* BLAKE3_NO_AVX512 */

#ifndef BLAKE3_NO_SSE41
    if ((cpu & SSE41) != 0) {
        blake3_compress_xof_sse41(cv, block, block_len,
                                  counter, flags, out);
        return;
    }
#endif /* BLAKE3_NO_SSE41 */

#ifndef BLAKE3_NO_SSE2
    if ((cpu & SSE2) != 0) {
        blake3_compress_xof_sse2(cv, block, block_len,
                                 counter, flags, out);
        return;
    }
#endif /* BLAKE3_NO_SSE2 */
#endif /* IS_X86 */

    /* No SIMD backend was selected. */
    blake3_compress_xof_portable(cv, block, block_len,
                                 counter, flags, out);
}

/* Forward a blake3_hash_many() call to the best available backend.
 *
 * On x86 the detected CPU features select the backend, tried in the order
 * AVX-512 (needs both AVX512F and AVX512VL), AVX2, SSE4.1, SSE2. Each can be
 * compiled out with the matching BLAKE3_NO_* macro. If no x86 backend is
 * taken, the NEON implementation is used when BLAKE3_USE_NEON == 1, and the
 * portable implementation otherwise. All arguments are passed through
 * unchanged. */
void blake3_hash_many(const u8 *const *inputs, size_t num_inputs,
                      size_t blocks, const u32 key[8], u64 counter,
                      char increment_counter, u8 flags,
                      u8 flags_start, u8 flags_end, u8 *out)
{
#if defined(IS_X86)
    const enum cpu_feature cpu = get_cpu_features();
    MAYBE_UNUSED(cpu);

#ifndef BLAKE3_NO_AVX512
    if ((cpu & AVX512F) && (cpu & AVX512VL)) {
        blake3_hash_many_avx512(inputs, num_inputs, blocks, key,
                                counter, increment_counter,
                                flags, flags_start, flags_end, out);
        return;
    }
#endif /* BLAKE3_NO_AVX512 */

#ifndef BLAKE3_NO_AVX2
    if ((cpu & AVX2) != 0) {
        blake3_hash_many_avx2(inputs, num_inputs, blocks, key,
                              counter, increment_counter,
                              flags, flags_start, flags_end, out);
        return;
    }
#endif /* BLAKE3_NO_AVX2 */

#ifndef BLAKE3_NO_SSE41
    if ((cpu & SSE41) != 0) {
        blake3_hash_many_sse41(inputs, num_inputs, blocks, key,
                               counter, increment_counter,
                               flags, flags_start, flags_end, out);
        return;
    }
#endif /* BLAKE3_NO_SSE41 */

#ifndef BLAKE3_NO_SSE2
    if ((cpu & SSE2) != 0) {
        blake3_hash_many_sse2(inputs, num_inputs, blocks, key,
                              counter, increment_counter,
                              flags, flags_start, flags_end, out);
        return;
    }
#endif /* BLAKE3_NO_SSE2 */
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
    blake3_hash_many_neon(inputs, num_inputs, blocks, key,
                          counter, increment_counter,
                          flags, flags_start, flags_end, out);
#else
    blake3_hash_many_portable(inputs, num_inputs, blocks, key,
                              counter, increment_counter,
                              flags, flags_start, flags_end, out);
#endif
}

/* The dynamically detected SIMD degree of the current platform.
 *
 * On x86 this is 16 when both AVX512F and AVX512VL were detected, 8 for AVX2
 * and 4 for SSE4.1 or SSE2. Each tier can be compiled out with the matching
 * BLAKE3_NO_* macro. If no x86 tier applies, the result is 4 when built with
 * BLAKE3_USE_NEON == 1 and 1 otherwise. */
size_t
blake3_simd_degree(void)
{
#if defined(IS_X86)
    const enum cpu_feature cpu = get_cpu_features();
    MAYBE_UNUSED(cpu);

#ifndef BLAKE3_NO_AVX512
    if ((cpu & AVX512F) && (cpu & AVX512VL))
        return 16;
#endif /* BLAKE3_NO_AVX512 */

#ifndef BLAKE3_NO_AVX2
    if ((cpu & AVX2) != 0)
        return 8;
#endif /* BLAKE3_NO_AVX2 */

#ifndef BLAKE3_NO_SSE41
    if ((cpu & SSE41) != 0)
        return 4;
#endif /* BLAKE3_NO_SSE41 */

#ifndef BLAKE3_NO_SSE2
    if ((cpu & SSE2) != 0)
        return 4;
#endif /* BLAKE3_NO_SSE2 */
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
    return 4;
#else
    return 1;
#endif
}