Welcome to little lamb

Code » test-rolling-hash » next » tree

[next] / src / test.c

#include <dirent.h>
#include <errno.h>
#include <time.h>
#include <skalibs/stralloc.h>
#include <skalibs/unix-transactional.h>
#include <skalibs/djbunix.h>
#include <skalibs/uint32.h>
#include <skalibs/uint16.h>
#include <skalibs/fmtscan.h>
#include <limb/int.h>
#include <limb/output.h>
#include <limb/blake3.h>

/* Program name. Presumably picked up by the limb output helpers (dief(),
 * warnusys(), ...) to prefix their messages -- TODO confirm. */
const char *PROG = "test";

/* Splitting function under test; defined in another translation unit.
 * As used by bench(): it is handed the minimum and average block sizes plus
 * a window of dlen bytes starting at data, and its return value is taken as
 * the length of the next block (the caller advances by that many bytes). */
extern size_t nextsplit(size_t min, size_t avg, const void *data, size_t dlen);

/* State shared between main(), processdir() and bench(). */
struct bench {
    size_t min;     /* minimum block size, passed as-is to nextsplit() */
    size_t avg;     /* average block size, passed as-is to nextsplit() */
    size_t max;     /* upper bound on the window length given to nextsplit() */
    int iter;       /* how many times bench() repeats the splitting pass */
    stralloc *sa;   /* data to split: bytes sa->s[pos .. sa->len) */
    u8 *blocks;     /* output buffer receiving the block lengths, each one
                     * encoded with u64_pack_trim(); no capacity is recorded
                     * here, so the owner must size it generously */
    size_t blkpos;  /* write offset into blocks; advanced by bench() */
    size_t pos;     /* offset in sa where splitting starts */
    size_t total;   /* running count of bytes split (one pass), summed over
                     * every bench() call */
    int hashes;     /* non-zero: bench() prints hash and size of each block
                     * during its first pass */
};

static void
bench(struct bench *b)
{
    size_t pos, curblkpos = b->blkpos;
    for (int i = 0; i < b->iter; ++i) {
        curblkpos = b->blkpos;
        pos = b->pos;
        while (pos < b->sa->len) {
            size_t len = b->sa->len - pos;
            size_t offset = nextsplit(b->min, b->avg,
                                      b->sa->s + pos, (len > b->max) ? b->max : len);
            curblkpos += u64_pack_trim((u64) offset, b->blocks + curblkpos);
            /* special case: when scanning directories, we don't keep the
             * entire dataset in memory, so we need to compute/print hashes
             * right now within the benchmarking. */
            if (i == 0 && b->hashes) {
                unsigned char buf[32];
                blake3(b->sa->s + pos, offset, buf);
                char hash[32 * 2 + 1] = { 0 };
                for (int i = 0; i < sizeof(buf); ++i) {
                    if (buf[i] & 0xf0)
                        hash[i * 2] = fmtscan_asc(buf[i] >> 4);
                    else
                        hash[i * 2] = '0';
                    hash[i * 2 + 1] = fmtscan_asc(buf[i] & 0x0f);
                }
                char size[UINT32_FMT];
                size[uint32_fmt(size, offset)] = 0;
                out(hash, " ", size);
            }
            pos += offset;
        }
    }
    b->blkpos = curblkpos;
    b->total += b->sa->len - b->pos;
}

/*
 * Benchmark every file found under a directory, descending into
 * sub-directories.
 *
 * basefd  descriptor handed to open_readat() together with name
 * name    directory to scan
 * b       benchmark state; each file is loaded at the end of b->sa, run
 *         through bench(), then dropped again by restoring b->sa->len
 *
 * Files that cannot be opened or read are reported with warnusys() and
 * skipped. Returns 0 on success, or -1 when the directory itself cannot be
 * opened or read, or when a sub-directory fails.
 */
static int
processdir(int basefd, const char *name, struct bench *b)
{
    int ret = -1;
    int fd;
    DIR *dir = NULL;
    struct dirent *de;

    fd = open_readat(basefd, name);
    if (fd < 0) goto err;

    dir = fdopendir(fd);
    if (!dir) goto err;
    /* the descriptor now belongs to dir: closedir() releases it */
    fd = -1;

    for (;;) {
        /* readdir() returns NULL both at end of directory and on error;
         * only errno tells the two apart */
        errno = 0;
        de = readdir(dir);
        if (!de) {
            if (errno) goto err;
            break;
        }
        /* skip "." and ".." */
        if (de->d_name[0] == '.' &&
                (!de->d_name[1] || (de->d_name[1] == '.' && !de->d_name[2])))
            continue;

        size_t salen = b->sa->len;
        fd = open_readat(dirfd(dir), de->d_name);
        if (fd < 0 || !slurp(b->sa, fd)) {
            /* keep the failure reason: closing may overwrite errno */
            int e = errno;

            /* close at most once, and forget the descriptor right away so
             * the cleanup code under err: cannot close it a second time */
            if (fd >= 0) {
                fd_close(fd);
                fd = -1;
            }

            if (e == EISDIR) {
                if (processdir(dirfd(dir), de->d_name, b) < 0)
                    goto err;
                continue;
            }

            errno = e;
            warnusys("process ...", name, "/", de->d_name);
            continue;
        }
        fd_close(fd);
        fd = -1;

        bench(b);
        /* drop the file's content, the next one reuses the space */
        b->sa->len = salen;
    }

    ret = 0;
err:
    if (fd >= 0) fd_close(fd);
    if (dir) closedir(dir);
    return ret;
}

#include <stdio.h>
int
main(int argc, const char *argv[])
{
    struct timespec ts1, ts2;
    stralloc sa = STRALLOC_ZERO;
    u8 blocks[800 << 10];
    struct bench b = {
        .min = (  4 << 10),
        .avg = (  8 << 10),
        .max = (  1 << 20),
        .iter = 1,
        .sa = &sa,
        .blocks = blocks,
    };
    int list = 0;

    while (argc >= 2) {
        if (!strncmp(argv[1], "-a", 2) || !strncmp(argv[1], "-m", 2)
                || !strncmp(argv[1], "-M", 2)) {
            const char *s = NULL;
            if (!argv[1][2])
                s = argv[2];
            else if (argc < 3)
                dief(1, "missing value for ", argv[1]);
            else
                s = argv[1] + 2;

            u32 u;
            if (!uint32_scan(s, &u))
                dief(1, "invalid value for ", argv[1]);
            switch (argv[1][1]) {
                case 'a': b.avg = u; break;
                case 'm': b.min = u; break;
                case 'M': b.max = u; break;
            }

            int e = (s == argv[2]) ? 1 : 0;
            argc -= 1 + e;
            memmove(&argv[1], &argv[2 + e], argc * sizeof(*argv));
        } else if (!strcmp(argv[1], "-l")) {
            list = 1;
        } else if (!strcmp(argv[1], "-H")) {
            list = 2;
        } else {
            break;
        }

        if (list)
            memmove(&argv[1], &argv[2], --argc * sizeof(*argv));
    }

    if (argc != 2 && argc != 3)
        dieusage(1, "[-a AVGSIZE] [-m MINSIZE] [-M MAXSIZE] [-l | -H] FILE [ITER]");

    if (argc == 3) {
        u16 u;
        if (!uint16_scan(argv[2], &u))
            dief(1, "invalid ITER argument");
        b.iter = u;
    }

    int fd = open_read(argv[1]);
    if (fd < 0 || !slurp(&sa, fd)) {
        if (errno != EISDIR)
            diefusys(2, "read ", argv[1]);
        fd_close(fd);
        fd = -1;
        if (list == 2) {
            b.hashes = 1;
            list = 0;
        }
    }

    clock_gettime(CLOCK_MONOTONIC, &ts1);
    if (fd < 0)
        processdir(AT_FDCWD, argv[1], &b);
    else
        bench(&b);
    clock_gettime(CLOCK_MONOTONIC, &ts2);
    blocks[b.blkpos] = 0;

    if (fd >= 0) fd_close(fd);

    size_t min = b.total;
    size_t max = 0;
    size_t total = 0;
    int n = 1;
    size_t pos = 0;
    for (int o = 0; blocks[o]; ++n) {
        u64 u;
        o += u64_unpack_trim(blocks + o, &u);
        if (list) {
            if (list == 2) {
                unsigned char buf[32];
                blake3(sa.s + pos, u, buf);
                for (int i = 0; i < sizeof(buf); ++i)
                    fprintf(stdout, "%.02x", buf[i]);
                pos += u;
            } else if (list == 1) {
                fprintf(stdout, "block:");
            }
            fprintf(stdout, " %lu\n", u);
        }
        if (u < min && blocks[o]) min = u;
        if (u > max) max = u;
        total += u;
    }
    --n;
    if (total != b.total) {
        fprintf(stderr, "incorrect total size %lu != %lu\n", total, b.total);
        return 2;
    }
    fprintf(stderr, "%u blocks; min=%lu, avg=%lu, max=%lu\n", n, min, total / n, max);

    ts2.tv_sec -= ts1.tv_sec;
    ts2.tv_nsec -= ts1.tv_nsec;
    double took = ts2.tv_sec + (ts2.tv_nsec / 1000000000.0);
    double speed = ((b.total * b.iter) / took) / (1 << 20);
    fprintf(stderr, "took %.09f seconds, %f MiB/s\n", took, speed);

    return 0;
}