/* This file is part of limb https://lila.oss/limb
* Copyright (C) 2023 Olivier Brunel jjk@jjacky.com */
/* Based on official BLAKE3 implementation:
* https://github.com/BLAKE3-team/BLAKE3
* Copyright (C) 2019-2020 Samuel Neves and Jack O'Connor */
/* SPDX-License-Identifier: CC0-1.0 OR Apache-2.0 */
#include "blake3.h"
#if defined(IS_X86)
#if defined(__GNUC__)
#include <immintrin.h>
#else
#undef IS_X86 /* Unimplemented! */
#endif
#endif
#define MAYBE_UNUSED(x) (void)((x))
#if defined(IS_X86)
static u64
xgetbv(void)
{
u32 eax = 0, edx = 0;
__asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
return ((u64) edx << 32) | eax;
}
static void
cpuid(u32 out[4], u32 id)
{
#if defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#endif
}
static void
cpuidex(u32 out[4], u32 id, u32 sid)
{
#if defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#endif
}
#endif /* IS_X86 */
enum cpu_feature {
SSE2 = 1 << 0,
SSSE3 = 1 << 1,
SSE41 = 1 << 2,
AVX = 1 << 3,
AVX2 = 1 << 4,
AVX512F = 1 << 5,
AVX512VL = 1 << 6,
/* ... */
UNDEFINED = 1 << 30
};
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
enum cpu_feature g_cpu_features = UNDEFINED;
#if !defined(BLAKE3_TESTING)
static
#endif
enum cpu_feature
get_cpu_features(void)
{
if (g_cpu_features != UNDEFINED)
return g_cpu_features;
#if defined(IS_X86)
u32 regs[4] = {0};
u32 *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3];
(void) edx;
enum cpu_feature features = 0;
cpuid(regs, 0);
const int max_id = *eax;
cpuid(regs, 1);
#if defined(__amd64__) || defined(_M_X64)
features |= SSE2;
#else
if (*edx & (1UL << 26))
features |= SSE2;
#endif
if (*ecx & (1UL << 0))
features |= SSSE3;
if (*ecx & (1UL << 19))
features |= SSE41;
if (*ecx & (1UL << 27)) { /* OSXSAVE */
const u64 mask = xgetbv();
if ((mask & 6) == 6) { /* SSE and AVX states */
if (*ecx & (1UL << 28))
features |= AVX;
if (max_id >= 7) {
cpuidex(regs, 7, 0);
if (*ebx & (1UL << 5))
features |= AVX2;
if ((mask & 224) == 224) { /* Opmask, ZMM_Hi256, Hi16_Zmm */
if (*ebx & (1UL << 31))
features |= AVX512VL;
if (*ebx & (1UL << 16))
features |= AVX512F;
}
}
}
}
g_cpu_features = features;
return features;
#else
/* How to detect NEON? */
return 0;
#endif
}
void blake3_compress_in_place(u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter,
u8 flags)
{
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
return;
}
#endif
#endif
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
void blake3_compress_xof(const u32 cv[8],
const u8 block[BLAKE3_BLOCK_LEN],
u8 block_len, u64 counter, u8 flags,
u8 out[64])
{
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
return;
}
#endif
#endif
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
void blake3_hash_many(const u8 *const *inputs, size_t num_inputs,
size_t blocks, const u32 key[8], u64 counter,
char increment_counter, u8 flags,
u8 flags_start, u8 flags_end, u8 *out)
{
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_AVX2)
if (features & AVX2) {
blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#endif
#if BLAKE3_USE_NEON == 1
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end, out);
return;
#endif
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
}
/* The dynamically detected SIMD degree of the current platform. */
size_t
blake3_simd_degree(void)
{
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL))
return 16;
#endif
#if !defined(BLAKE3_NO_AVX2)
if (features & AVX2)
return 8;
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41)
return 4;
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2)
return 4;
#endif
#endif
#if BLAKE3_USE_NEON == 1
return 4;
#endif
return 1;
}