Welcome to little lamb

Code » limb » release » tree

[release] / src / liblimb / nextsplit.h / nextsplit_rabin.c

/* This file is part of limb                           https://lila.oss/limb
 * Copyright (C) 2023 Olivier Brunel                          jjk@jjacky.com */
/* SPDX-License-Identifier: GPL-2.0-only */
#include <limb/nextsplit.h>
#include <limb/u64.h>
#include <limb/rabin-tables.h>

/* This is based on rabin fingerprint, using a miw of two different variants :
 * - for avg < 128 KiB the double-mask variant, which uses a large mask when avg
 *   hasn't yet been reached, then a smaller mask afterwards
 * - from avg >= 128 KiB the TTTD variant, which also uses two masks, the
 *   "standard" one and a smaller/fallback one used only when the first one
 *   didn't find a split.
 * Using those two (in that manner) is just the result of some testing, in an
 * attempt to get the best deduplication results.
 */

size_t
nextsplit_rabin(size_t min, size_t avg, const void *data, size_t dlen)
{
    if (dlen <= min) return dlen;

    u8 window[RABIN_WINDOW_SIZE] = { 0 };
    const u8 *b = data;
    size_t wpos = 0, pos = min, last = 0;
    u64 fp = 0;
    u64 mask, lrg_mask;
    int tttd = avg >= (128 << 10);

    if (tttd)
        mask = (1 << (msb64(avg    ) - 1)) - 1;
    else
        mask = (1 << (msb64(avg * 2) - 1)) - 1;
    lrg_mask = (1 << (msb64(avg / 2) - 1)) - 1;

#define TTTD_BREAK      0x78
#define APPEND8(p,m)    (((p) << 8) | m) ^ rabin_tables.T[(p) >> 55]
    for (;;) {
        u8 old = window[wpos];
        window[wpos] = b[pos];
        fp = APPEND8(fp ^ rabin_tables.U[old], window[wpos]);
        if (++wpos == sizeof(window)) wpos = 0;

        if (pos == dlen) {
            if (last) return last;
            return pos;
        }
        if (pos >= min) {
            if (tttd) {
                if ((fp & lrg_mask) == TTTD_BREAK) {
                    if ((fp & mask) == TTTD_BREAK)
                        return pos;
                    last = pos;
                }
            } else if (pos < avg) {
                if ((fp & mask) == mask)
                    return pos;
            } else if ((fp & lrg_mask) == lrg_mask) {
                return pos;
            }
        }
        ++pos;
    }
}