/* This file is part of limb https://lila.oss/limb
* Copyright (C) 2023 Olivier Brunel jjk@jjacky.com */
/* SPDX-License-Identifier: GPL-2.0-only */
#include <limb/nextsplit.h>
#include <limb/u64.h>
#include <limb/rabin-tables.h>
/* This is based on rabin fingerprint, using a miw of two different variants :
* - for avg < 128 KiB the double-mask variant, which uses a large mask when avg
* hasn't yet been reached, then a smaller mask afterwards
* - from avg >= 128 KiB the TTTD variant, which also uses two masks, the
* "standard" one and a smaller/fallback one used only when the first one
* didn't find a split.
* Using those two (in that manner) is just the result of some testing, in an
* attempt to get the best deduplication results.
*/
size_t
nextsplit_rabin(size_t min, size_t avg, const void *data, size_t dlen)
{
if (dlen <= min) return dlen;
u8 window[RABIN_WINDOW_SIZE] = { 0 };
const u8 *b = data;
size_t wpos = 0, pos = min, last = 0;
u64 fp = 0;
u64 mask, lrg_mask;
int tttd = avg >= (128 << 10);
if (tttd)
mask = (1 << (msb64(avg ) - 1)) - 1;
else
mask = (1 << (msb64(avg * 2) - 1)) - 1;
lrg_mask = (1 << (msb64(avg / 2) - 1)) - 1;
#define TTTD_BREAK 0x78
#define APPEND8(p,m) (((p) << 8) | m) ^ rabin_tables.T[(p) >> 55]
for (;;) {
u8 old = window[wpos];
window[wpos] = b[pos];
fp = APPEND8(fp ^ rabin_tables.U[old], window[wpos]);
if (++wpos == sizeof(window)) wpos = 0;
if (pos == dlen) {
if (last) return last;
return pos;
}
if (pos >= min) {
if (tttd) {
if ((fp & lrg_mask) == TTTD_BREAK) {
if ((fp & mask) == TTTD_BREAK)
return pos;
last = pos;
}
} else if (pos < avg) {
if ((fp & mask) == mask)
return pos;
} else if ((fp & lrg_mask) == lrg_mask) {
return pos;
}
}
++pos;
}
}