Welcome to little lamb

Code » test-rolling-hash » commit 10b2c1c

First commit

author Olivier Brunel
2023-02-14 10:10:16 UTC
committer Olivier Brunel
2023-02-15 16:42:04 UTC
parent da4eeed5f3f27ff6d816a47cd4463b71479b2409

First commit

.gitignore +4 -0
include/rabin-tables.h +142 -0
meta/AUTHORS +6 -0
meta/bins/test-ae +4 -0
meta/bins/test-ae-32 +4 -0
meta/bins/test-buz +4 -0
meta/bins/test-hashchop +4 -0
meta/bins/test-rabin +4 -0
meta/bins/test-rabin-mask +4 -0
meta/bins/test-rabin-mix +4 -0
meta/bins/test-rabin-tttd +4 -0
meta/deps/comain/version +1 -1
meta/deps/limb/configure +0 -0
meta/deps/limb/files +0 -0
meta/deps/limb/git +1 -0
meta/deps/limb/static +0 -0
meta/deps/limb/version +1 -0
meta/deps/skalibs/get_version +2 -0
meta/deps/skalibs/git +1 -0
meta/deps/skalibs/incdir +1 -0
meta/deps/skalibs/include +0 -0
meta/deps/skalibs/libdir +1 -0
meta/deps/skalibs/library +1 -0
meta/deps/skalibs/version +1 -0
meta/desc +19 -0
meta/doc +0 -1
meta/site +0 -1
project.mk +8 -7
src/ae-32.c +56 -0
src/ae.c +56 -0
src/buz.c +102 -0
src/hashchop.c +50 -0
src/rabin-mask.c +42 -0
src/rabin-mix.c +49 -0
src/rabin-tttd.c +45 -0
src/rabin.c +36 -0
src/test.c +244 -0
tests +10 -0
tests-avgs +25 -0
tests-dedup +56 -0

diff --git a/.gitignore b/.gitignore
index 7ccd516..37b95b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,7 @@
 *.o
 *.lo
 *.d
+/test-*
+/limb
+/limb.built
+/skalibs
diff --git a/include/rabin-tables.h b/include/rabin-tables.h
new file mode 100644
index 0000000..803fdb9
--- /dev/null
+++ b/include/rabin-tables.h
@@ -0,0 +1,142 @@
+#ifndef RABIN_TABLES_H
+#define RABIN_TABLES_H
+
+#include <limb/int.h>
+
+#define RABIN_WINDOW_SIZE 32
+
+static struct {
+	u64 T[256];
+	u64 U[256];
+} rabin_tables = { {
+0ULL, 13827942727904890243ULL, 9209141382100228870ULL, 13847383506563929733ULL,
+4646688091768732559ULL, 18418282764200457740ULL, 4589497290695549065ULL, 9248022939418307850ULL,
+4548086706303466141ULL, 9293376183537465118ULL, 4671215262765932955ULL, 18389821454691363864ULL,
+9178994581391098130ULL, 13872474804339788945ULL, 49301805127064084ULL, 13783723088868136855ULL,
+9096173412606932282ULL, 13968654369411725497ULL, 140008293365378620ULL, 13697645939728147391ULL,
+4485659674370133685ULL, 9342430525531865910ULL, 4723487651964868019ULL, 18332898835673176112ULL,
+4693337148998094759ULL, 18357989162782196260ULL, 4534953362004776097ULL, 9298205534970026274ULL,
+98603610254128168ULL, 13743006751583844779ULL, 9120702104026722094ULL, 13940196212404396717ULL,
+4872485406381467639ULL, 18192346825213864564ULL, 4347093914495974641ULL, 9490564665113899378ULL,
+280016586730757240ULL, 13548069301033521659ULL, 8948547805746743166ULL, 14107834197935999741ULL,
+8971319348740267370ULL, 14080011446993228009ULL, 238116977354180204ULL, 13595046231760440303ULL,
+4394630711393986277ULL, 9446975303929736038ULL, 4841840269215546851ULL, 18219053597636800608ULL,
+4441272913864004301ULL, 9386674297996189518ULL, 4787294948947036619ULL, 18269234251854840904ULL,
+9069906724009552194ULL, 13995059785443288257ULL, 149666996230500932ULL, 13687848784306683847ULL,
+197207220508256336ULL, 13644260118911444435ULL, 9039269429458137942ULL, 14021771634513411797ULL,
+4810060865537627103ULL, 18241404208053444188ULL, 4399372058942752985ULL, 9433648351099241818ULL,
+4097061761524384365ULL, 9744970812762935278ULL, 5122526527230073195ULL, 17937949576718177512ULL,
+8694187828991949282ULL, 14357842170245903457ULL, 534385256518247140ULL, 13298069953032646503ULL,
+560033173461514480ULL, 13268479238923367795ULL, 8649394528357491702ULL, 14406569746831191669ULL,
+5168435857949066111ULL, 17897095611493486332ULL, 4068026223473531001ULL, 9768924322162447866ULL,
+5109252900731139927ULL, 17942638697480534740ULL, 4119323861271030865ULL, 9713278820276904402ULL,
+476233954708360408ULL, 13365932622558560603ULL, 8743348389811328990ULL, 14316984640186325597ULL,
+8789261422787972554ULL, 14276131645628465225ULL, 447206534149920460ULL, 13389891483483167567ULL,
+4144965876933457477ULL, 9683680538431093702ULL, 5064458079674101059ULL, 17991363121564049600ULL,
+8882545827728008602ULL, 14169341458345928729ULL, 326604522282827420ULL, 13505993675309822751ULL,
+4267581128949538325ULL, 9574589897894073238ULL, 4968612946513855763ULL, 18091724430000130192ULL,
+4925575273883098887ULL, 18139813448019104388ULL, 4293718070877275137ULL, 9543375497177024898ULL,
+299333992461001864ULL, 13529316906856737035ULL, 8928953494903816078ULL, 14126872018474168845ULL,
+394414441016512672ULL, 13447613786754473251ULL, 8841776164113337254ULL, 14218695490258224677ULL,
+4973495624276863791ULL, 18078538858916275884ULL, 4235660326371356713ULL, 9596799195317271978ULL,
+4208386369169795645ULL, 9620121731075254206ULL, 5019895448838156603ULL, 18036064342397336760ULL,
+8798744117885505970ULL, 14266791801135832113ULL, 420552628488932020ULL, 13416402263661283127ULL,
+8194123523048768730ULL, 14866918157251578201ULL, 1043197551816318940ULL, 12798269171274486367ULL,
+3587967976203566933ULL, 10245053054460146390ULL, 5622309373242516563ULL, 17429155079726803408ULL,
+5668153059635265095ULL, 17388375657983898564ULL, 3559007428260750657ULL, 10268940266782255298ULL,
+1068770513036494280ULL, 12768744788610795595ULL, 8149395832355741390ULL, 14915571155987026765ULL,
+1120066346923028960ULL, 12713097482813373539ULL, 8090214404137183974ULL, 14961115770974627685ULL,
+5762105426466228847ULL, 17298789056714983404ULL, 3475209979042058601ULL, 10366395419952831722ULL,
+3500786384821783421ULL, 10336871715898132222ULL, 5717385561207771259ULL, 17347447149277421048ULL,
+8136052446947062002ULL, 14920329073552899441ULL, 1091104570615344116ULL, 12736981800331683447ULL,
+3614653578697433901ULL, 10218505801462279854ULL, 5612792507343960107ULL, 17438533321251517864ULL,
+8238647722542061730ULL, 14822251072776885537ULL, 979813566844257188ULL, 12861796316105920039ULL,
+952467909416720816ULL, 12885185707350004787ULL, 8285121171407569590ULL, 14779707226937790261ULL,
+5569689087391539775ULL, 17486696779622657980ULL, 3640865613862713657ULL, 10187225206663099578ULL,
+5482514350769116695ULL, 17578522845575945108ULL, 3735943193405689105ULL, 10105519217547378834ULL,
+894413068299840920ULL, 12938612308880203803ULL, 8333038893256783518ULL, 14718430009289227037ULL,
+8289931753866914954ULL, 14766592514173460745ULL, 920617003152635788ULL, 12907326345376135695ULL,
+3708603454438921989ULL, 10128916159348202118ULL, 5528989302877408259ULL, 17535982169418547584ULL,
+5290867652594615991ULL, 17765091655456017204ULL, 3936569740447636913ULL, 9891938842982305842ULL,
+653209044565654840ULL, 13183745368696811707ULL, 8565243276910093886ULL, 14500293120999091133ULL,
+8535162257899076650ULL, 14525310012799234473ULL, 702435722078594860ULL, 13139591889365609135ULL,
+3895234249284683685ULL, 9937225893027711526ULL, 5315329076278566051ULL, 17736704786290708768ULL,
+3985942541400290189ULL, 9851150547766197774ULL, 5232506378461325451ULL, 17832882822328657160ULL,
+8587436141754550274ULL, 14468388888438506881ULL, 640006920644498180ULL, 13188644461858070151ULL,
+598667984922003728ULL, 13233930833294476435ULL, 8611889740003922454ULL, 14439996925446217621ULL,
+5202431003033170591ULL, 17857906989807632156ULL, 4035170447278107033ULL, 9806999963238786074ULL,
+788828882033025344ULL, 13048268519271181507ULL, 8448483499799394886ULL, 14616910184945891269ULL,
+5372268252389710543ULL, 17683552328226674508ULL, 3838000129179320777ULL, 9990646906806897738ULL,
+3885611916177005533ULL, 9946991248553727582ULL, 5341557470905909467ULL, 17710333644123000152ULL,
+8471320652742713426ULL, 14589012856144883153ULL, 746854316924992340ULL, 13095311781452084951ULL,
+8416772738339591290ULL, 14639190916227389945ULL, 793499388440956796ULL, 13035013644565660415ULL,
+3797159031630852085ULL, 10039790897676313206ULL, 5440147474686342387ULL, 17625384611085121904ULL,
+5462988347067117287ULL, 17597488235771011940ULL, 3755192566835295713ULL, 10086839528562112610ULL,
+841105256977864040ULL, 12991350435755759851ULL, 8386060453613014638ULL, 14665969062442009581ULL
+}, {
+0ULL, 568999561260842924ULL, 1137999122521685848ULL, 589833262815396084ULL,
+2275998245043371696ULL, 1761055125200617756ULL, 1179666525630792168ULL, 1709813571542236740ULL,
+4551996490086743392ULL, 4093365957697716940ULL, 3522110250401235512ULL, 3964419511316364692ULL,
+2359333051261584336ULL, 2835991227898739836ULL, 3419627143084473480ULL, 2923270271204541220ULL,
+9103992980173486784ULL, 8769246759817692524ULL, 8186731915395433880ULL, 8536682337754266164ULL,
+7044220500802471024ULL, 7360965568252689372ULL, 7928839022632729384ULL, 7632927380768113796ULL,
+4718666102523168672ULL, 5087220157816518668ULL, 5671982455797479672ULL, 5283729429325956948ULL,
+6839254286168946960ULL, 6416670280833615548ULL, 5846540542409082440ULL, 6252803552238418404ULL,
+4848541058636740611ULL, 4948334975574230959ULL, 5513398027029317467ULL, 5433303080062040311ULL,
+6691366354261935795ULL, 6573562385112938783ULL, 5987111374761032171ULL, 6121236340355548743ULL,
+8962878867740132707ULL, 8901352301823769295ULL, 8334041934752250427ULL, 8380364299873535383ULL,
+7203352090154466259ULL, 7210839806991352959ULL, 7799546963378373771ULL, 7771225820016360231ULL,
+4404127935990301379ULL, 4250242530309684591ULL, 3662665828094967195ULL, 3832872504041521719ULL,
+2489192475691547763ULL, 2697125423483274207ULL, 3261062918908085035ULL, 3072828667298659463ULL,
+159151793960550307ULL, 418858545340760079ULL, 1008691531600494843ULL, 728151079873691479ULL,
+2134868877968572691ULL, 1893180871797948095ULL, 1326995922814959179ULL, 1553480001977369063ULL,
+4140017981295394181ULL, 4508440215577772585ULL, 3944784144731144925ULL, 3556663179771985265ULL,
+2801718574537737013ULL, 2379248763918204057ULL, 2966544078196702317ULL, 3372692584055817153ULL,
+459293339567948005ULL, 124626129572642633ULL, 690529550898757565ULL, 1040400653676232721ULL,
+1858373128789327445ULL, 2175021559617438201ULL, 1585470133093138701ULL, 1289655368385576609ULL,
+5126131060248446789ULL, 4667650181573556457ULL, 5253819664402282525ULL, 5695979512138221489ULL,
+6392390533637765621ULL, 6868881429762295385ULL, 6304099938176337581ULL, 5807910037588162817ULL,
+8649266140029483557ULL, 9218063236388832649ULL, 8629637504445252989ULL, 8081673800386034385ULL,
+7466306151047333013ULL, 6951547869274028857ULL, 7518576848274586573ULL, 8048539296617972833ULL,
+8808255871980602758ULL, 9068077127062600234ULL, 8500485060619369182ULL, 8219830414445200754ULL,
+7325331656189934390ULL, 7083511588238509210ULL, 7665745008083043438ULL, 7892360908524862402ULL,
+4978384951383095526ULL, 4824402668276991818ULL, 5394250846966548414ULL, 5564554159803643922ULL,
+6522125837816170070ULL, 6730138104920816122ULL, 6145657334597318926ULL, 5957344072895841954ULL,
+318303587921100614ULL, 256610050981413098ULL, 837717090681520158ULL, 884206736046997426ULL,
+2017383063200989686ULL, 2025020193485504090ULL, 1456302159747382958ULL, 1427831362402795778ULL,
+4269737755937145382ULL, 4369716270711127434ULL, 3786361743595896190ULL, 3706081958290735826ULL,
+2653991845629918358ULL, 2535985721102287674ULL, 3106960003954738126ULL, 3241287434182311010ULL,
+8280035962590788362ULL, 8434651392460768422ULL, 9016880431155545170ULL, 8847069596582984702ULL,
+7889568289462289850ULL, 7680922806604282390ULL, 7113326359543970530ULL, 7301147211969348942ULL,
+5603437149075474026ULL, 5342982678240115142ULL, 4758497527836408114ULL, 5038659764734477982ULL,
+5933088156393404634ULL, 6175541371108706166ULL, 6745385168111634306ULL, 6519261745934350382ULL,
+918586679135896010ULL, 817974673999609446ULL, 249252259145285266ULL, 329039360028524862ULL,
+1381059101797515130ULL, 1499698648235884758ULL, 2080801307352465442ULL, 1946966629999427470ULL,
+3716746257578654890ULL, 3779073216429081350ULL, 4350043119234876402ULL, 4304046226743934046ULL,
+3170940266186277402ULL, 3162669645539458486ULL, 2579310736771153218ULL, 2607288849693819630ULL,
+3576388832492159753ULL, 3910422620844113061ULL, 4497722316809239633ULL, 4147358460889842685ULL,
+3329315614885313977ULL, 3013300674686894613ULL, 2449648983920476897ULL, 2745956432782572877ULL,
+1047669660451831401ULL, 679880916799144389ULL, 90333870891925809ULL, 478947520005619357ULL,
+1233962491836858585ULL, 1655798880813039477ULL, 2221706683288911745ULL, 1815065424287875117ULL,
+5761796968237857225ULL, 5193633087362385509ULL, 4628855977429501585ULL, 5177312091034302781ULL,
+5792750111264442233ULL, 6306875246004386005ULL, 6893049108879767585ULL, 6362594182541330317ULL,
+8132924095840683177ULL, 8590771827482213125ULL, 9157805187066297329ULL, 8715152861335307357ULL,
+8018671473238768153ULL, 7542813792598115765ULL, 6954392441770739009ULL, 7451074751904496365ULL,
+5448463561577301647ULL, 5510623274259167523ULL, 4924167849500867031ULL, 4878337962643539579ULL,
+6055420538959001663ULL, 6047299606370273171ULL, 6612358241812547431ULL, 6640186975897991371ULL,
+8410230468549049327ULL, 8309803335859717187ULL, 8898506076770796727ULL, 8978108614461504283ULL,
+7719974969369724255ULL, 7838412085284223731ULL, 7271097303269440007ULL, 7137464815938880939ULL,
+3876251125767896143ULL, 3615910884025844707ULL, 4179843965010707223ULL, 4459891732314078395ULL,
+3053103567622800127ULL, 3295424995704749395ULL, 2707843877443665319ULL, 2481852551154156043ULL,
+783843403378688303ULL, 938362230970629763ULL, 372172866478145143ULL, 202458943404491227ULL,
+1546209342646353823ULL, 1337642904492785715ULL, 1927471475775701191ULL, 2115213042961694571ULL,
+636607175842201228ULL, 1094604286589387040ULL, 513220101962826196ULL, 70418087905312376ULL,
+1675434181363040316ULL, 1199409495357210512ULL, 1768413472093994852ULL, 2265263028127875272ULL,
+4034766126401979372ULL, 3466400055772462144ULL, 4050040386971008180ULL, 4598698930831246104ULL,
+2912604319494765916ULL, 3426914017695679216ULL, 2855662724805591556ULL, 2325022925751762344ULL,
+8539475511874290764ULL, 8171554672065395680ULL, 8739432541422254868ULL, 9128177977437734072ULL,
+7572723487191792380ULL, 7994674345494234448ULL, 7412163916581471652ULL, 7005408428755565064ULL,
+5307983691259836716ULL, 5642096764582741632ULL, 5071971442204575348ULL, 4721528541849172440ULL,
+6213920007909476252ULL, 5897808155943712816ULL, 6482574868364622020ULL, 6778978919773194088ULL
+} };
+#endif /* RABIN_TABLES_H */
diff --git a/meta/AUTHORS b/meta/AUTHORS
index 6aef6f1..b2c6dc6 100644
--- a/meta/AUTHORS
+++ b/meta/AUTHORS
@@ -1,2 +1,8 @@
 Main author:
 * Olivier Brunel <jjk@jjacky.com>
+
+Contributors:
+* Jonas Borgström & The Borg Collective [buz]
+* Min Fu [rabin, rabin-mask, rabin-tttd]
+* Scott Vokes <vokes.s@gmail.com> [hashchop]
+* Yucheng Zhang [ae]
diff --git a/meta/bins/test-ae b/meta/bins/test-ae
new file mode 100644
index 0000000..038c0c6
--- /dev/null
+++ b/meta/bins/test-ae
@@ -0,0 +1,4 @@
+obj/test.o
+limb
+skalibs
+obj/ae.o
diff --git a/meta/bins/test-ae-32 b/meta/bins/test-ae-32
new file mode 100644
index 0000000..41a6f7e
--- /dev/null
+++ b/meta/bins/test-ae-32
@@ -0,0 +1,4 @@
+obj/test.o
+limb
+skalibs
+obj/ae-32.o
diff --git a/meta/bins/test-buz b/meta/bins/test-buz
new file mode 100644
index 0000000..2e005f4
--- /dev/null
+++ b/meta/bins/test-buz
@@ -0,0 +1,4 @@
+obj/test.o
+limb
+skalibs
+obj/buz.o
diff --git a/meta/bins/test-hashchop b/meta/bins/test-hashchop
new file mode 100644
index 0000000..3413b64
--- /dev/null
+++ b/meta/bins/test-hashchop
@@ -0,0 +1,4 @@
+obj/test.o
+limb
+skalibs
+obj/hashchop.o
diff --git a/meta/bins/test-rabin b/meta/bins/test-rabin
new file mode 100644
index 0000000..630cbbf
--- /dev/null
+++ b/meta/bins/test-rabin
@@ -0,0 +1,4 @@
+obj/test.o
+limb
+skalibs
+obj/rabin.o
diff --git a/meta/bins/test-rabin-mask b/meta/bins/test-rabin-mask
new file mode 100644
index 0000000..4f8528d
--- /dev/null
+++ b/meta/bins/test-rabin-mask
@@ -0,0 +1,4 @@
+obj/test.o
+limb
+skalibs
+obj/rabin-mask.o
diff --git a/meta/bins/test-rabin-mix b/meta/bins/test-rabin-mix
new file mode 100644
index 0000000..22984c0
--- /dev/null
+++ b/meta/bins/test-rabin-mix
@@ -0,0 +1,4 @@
+obj/test.o
+limb
+skalibs
+obj/rabin-mix.o
diff --git a/meta/bins/test-rabin-tttd b/meta/bins/test-rabin-tttd
new file mode 100644
index 0000000..dd8bd81
--- /dev/null
+++ b/meta/bins/test-rabin-tttd
@@ -0,0 +1,4 @@
+obj/test.o
+limb
+skalibs
+obj/rabin-tttd.o
diff --git a/meta/deps/comain/version b/meta/deps/comain/version
index 4e379d2..bcab45a 100644
--- a/meta/deps/comain/version
+++ b/meta/deps/comain/version
@@ -1 +1 @@
-0.0.2
+0.0.3
diff --git a/meta/deps/limb/configure b/meta/deps/limb/configure
new file mode 100644
index 0000000..e69de29
diff --git a/meta/deps/limb/files b/meta/deps/limb/files
new file mode 100644
index 0000000..e69de29
diff --git a/meta/deps/limb/git b/meta/deps/limb/git
new file mode 100644
index 0000000..109fd23
--- /dev/null
+++ b/meta/deps/limb/git
@@ -0,0 +1 @@
+git://lila.oss/limb.git
diff --git a/meta/deps/limb/static b/meta/deps/limb/static
new file mode 100644
index 0000000..e69de29
diff --git a/meta/deps/limb/version b/meta/deps/limb/version
new file mode 100644
index 0000000..bcab45a
--- /dev/null
+++ b/meta/deps/limb/version
@@ -0,0 +1 @@
+0.0.3
diff --git a/meta/deps/skalibs/get_version b/meta/deps/skalibs/get_version
new file mode 100755
index 0000000..55041f5
--- /dev/null
+++ b/meta/deps/skalibs/get_version
@@ -0,0 +1,2 @@
+#!/bin/sh
+grep SKALIBS_VERSION "$@/skalibs/config.h" | cut -d\" -f2
diff --git a/meta/deps/skalibs/git b/meta/deps/skalibs/git
new file mode 100644
index 0000000..5965139
--- /dev/null
+++ b/meta/deps/skalibs/git
@@ -0,0 +1 @@
+git://lila.oss/skalibs.git
diff --git a/meta/deps/skalibs/incdir b/meta/deps/skalibs/incdir
new file mode 100644
index 0000000..e4484e2
--- /dev/null
+++ b/meta/deps/skalibs/incdir
@@ -0,0 +1 @@
+src/include
diff --git a/meta/deps/skalibs/include b/meta/deps/skalibs/include
new file mode 100644
index 0000000..e69de29
diff --git a/meta/deps/skalibs/libdir b/meta/deps/skalibs/libdir
new file mode 100644
index 0000000..5baacc7
--- /dev/null
+++ b/meta/deps/skalibs/libdir
@@ -0,0 +1 @@
+skalibs
diff --git a/meta/deps/skalibs/library b/meta/deps/skalibs/library
new file mode 100644
index 0000000..39971a0
--- /dev/null
+++ b/meta/deps/skalibs/library
@@ -0,0 +1 @@
+skarnet
diff --git a/meta/deps/skalibs/version b/meta/deps/skalibs/version
new file mode 100644
index 0000000..48c7359
--- /dev/null
+++ b/meta/deps/skalibs/version
@@ -0,0 +1 @@
+2.13.0.0
diff --git a/meta/desc b/meta/desc
new file mode 100644
index 0000000..da3f1b3
--- /dev/null
+++ b/meta/desc
@@ -0,0 +1,19 @@
+Test/benchmark some chunking algorithms for better data-deduplication
+
+Test/benchmark some chunking algorithms, when needing to split a data stream
+into variable-length chunks aimed for better data-deduplication.
+
+I originally worked out an implementation based on rabin fingerprint, but I
+figured I might look for alternative implementations/algorithms, and so we got:
+
+- rabin : another pretty identical implementation based on rabin
+- rabin-mask : a variant using two masks to find the next split, so when the
+  main one fails to find a split, we fall back on the other one
+- rabin-tttd : another variant using two masks to find the next split
+- rabin-mix : this is my original impementation, only using the previous two
+  variants. Specifically, mask below 128 KiB as target/average size, and tttd
+  otherwise (because tests seemed to indicate that was the best)
+- hashchop : another rolling-hash algorithm, based on rsync's hash
+- buz : buzhash implementation originally from BorgBackup
+- ae : implementation of Asymmetric Extremum chunking algorithm
+- ae-32 : AE variant using 32bit values instead of 64bit ones
diff --git a/meta/doc b/meta/doc
deleted file mode 100644
index 6b89b2f..0000000
--- a/meta/doc
+++ /dev/null
@@ -1 +0,0 @@
-https://lila.oss/doc/test-rolling-hash
diff --git a/meta/site b/meta/site
deleted file mode 100644
index 5bcf183..0000000
--- a/meta/site
+++ /dev/null
@@ -1 +0,0 @@
-https://lila.oss/test-rolling-hash
diff --git a/project.mk b/project.mk
index cd9bd93..4104399 100644
--- a/project.mk
+++ b/project.mk
@@ -1,7 +1,8 @@
-$(error You need to edit project.mk)
-
-# binaries: -- don't forget to set meta/bins/{bin1,bin2,...} with all deps & .o files
-BINS = 
-
-# librairies -- don't forget to set meta/libs{lib1,lib2,...} with all deps & .o files
-LIBS = 
+BINS = test-ae \
+	   test-ae-32 \
+	   test-buz \
+	   test-hashchop \
+	   test-rabin \
+	   test-rabin-mask \
+	   test-rabin-mix \
+	   test-rabin-tttd
diff --git a/src/ae-32.c b/src/ae-32.c
new file mode 100644
index 0000000..b600f64
--- /dev/null
+++ b/src/ae-32.c
@@ -0,0 +1,56 @@
+#include <endian.h>
+#include <string.h> /* size_t */
+#include <limb/int.h>
+
+/* Author: Yucheng Zhang
+ * For more see paper: AE: An Asymmetric Extremum Content Defined Chunking
+ * Algorithm for Fast and Bandwidth-Efficient Data Deduplication
+ */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# define GETLE32(u) (u)
+#else
+# define GETLE32(u) __builtin_bswap32(u)
+#endif
+
+static inline
+int cmp(const u8 *x, const u8 *y)
+{
+    int ret;
+    u32 a = GETLE32(* ((u32 *) x));
+    u32 b = GETLE32(* ((u32 *) y));
+    if (a > b)
+        ret = 1;
+    else
+        ret = -1;
+    return ret;
+}
+
+size_t
+nextsplit(size_t min, size_t avg, const void *data, size_t dlen)
+{
+    if (dlen <= min) return dlen;
+
+    double e = 2.718281828;
+    int window_size = avg / (e - 1);
+    /* cur points to the current position
+     * max points to the position of max value
+     * end points to the end of buffer
+     */
+    const u8 *cur = (const u8 *) data + 1;
+    const u8 *max = (const u8 *) data;
+    const u8 *end = (const u8 *) data + dlen - sizeof(u32);
+
+    if (dlen <= window_size + sizeof(u32))
+        return dlen;
+
+    for ( ; cur <= end; ++cur) {
+        if (cmp(cur, max) < 0) {
+            max = cur;
+            continue;
+        }
+        if (cur == max + window_size)
+            return cur - (const u8 *) data;
+    }
+    return dlen;
+}
diff --git a/src/ae.c b/src/ae.c
new file mode 100644
index 0000000..d84f598
--- /dev/null
+++ b/src/ae.c
@@ -0,0 +1,56 @@
+#include <endian.h>
+#include <string.h> /* size_t */
+#include <limb/int.h>
+
+/* Author: Yucheng Zhang
+ * For more see paper: AE: An Asymmetric Extremum Content Defined Chunking
+ * Algorithm for Fast and Bandwidth-Efficient Data Deduplication
+ */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# define GETLE64(u) (u)
+#else
+# define GETLE64(u) __builtin_bswap64(u)
+#endif
+
+static inline
+int cmp(const u8 *x, const u8 *y)
+{
+    int ret;
+    u64 a = GETLE64(* ((u64 *) x));
+    u64 b = GETLE64(* ((u64 *) y));
+    if (a > b)
+        ret = 1;
+    else
+        ret = -1;
+    return ret;
+}
+
+size_t
+nextsplit(size_t min, size_t avg, const void *data, size_t dlen)
+{
+    if (dlen <= min) return dlen;
+
+    double e = 2.718281828;
+    int window_size = avg / (e - 1);
+    /* cur points to the current position
+     * max points to the position of max value
+     * end points to the end of buffer
+     */
+    const u8 *cur = (const u8 *) data + 1;
+    const u8 *max = (const u8 *) data;
+    const u8 *end = (const u8 *) data + dlen - sizeof(u64);
+
+    if (dlen <= window_size + sizeof(u64))
+        return dlen;
+
+    for ( ; cur <= end; ++cur) {
+        if (cmp(cur, max) < 0) {
+            max = cur;
+            continue;
+        }
+        if (cur == max + window_size)
+            return cur - (const u8 *) data;
+    }
+    return dlen;
+}
diff --git a/src/buz.c b/src/buz.c
new file mode 100644
index 0000000..6e68b6d
--- /dev/null
+++ b/src/buz.c
@@ -0,0 +1,102 @@
+/* From BorgBackup:
+ * Copyright (C) 2015-2022 The Borg Collective (see AUTHORS file)
+ * Copyright (C) 2010-2014 Jonas Borgström <jonas@borgstrom.se>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <string.h> /* size_t */
+#include <limb/int.h>
+
+size_t
+nextsplit(size_t min, size_t avg, const void *data, size_t dlen)
+{
+    if (dlen <= min) return dlen;
+    u32 T[] =
+    {
+        0XE7F831EC, 0XF4026465, 0XAFB50CAE, 0X6D553C7A, 0XD639EFE3, 0X19A7B895, 0X9ABA5B21, 0X5417D6D4,
+        0X35FD2B84, 0XD1F6A159, 0X3F8E323F, 0XB419551C, 0XF444CEBF, 0X21DC3B80, 0XDE8D1E36, 0X84A32436,
+        0XBEB35A9D, 0XA36F24AA, 0XA4E60186, 0X98D18FFE, 0X3F042F9E, 0XDB228BCD, 0X096474B7, 0X5C20C2F7,
+        0XF9EEC872, 0XE8625275, 0XB9D38F80, 0XD48EB716, 0X22A950B4, 0X3CBAAEAA, 0XC37CDDD3, 0X8FEA6F6A,
+        0X1D55D526, 0X7FD6D3B3, 0XDAA072EE, 0X4345AC40, 0XA077C642, 0X8F2BD45B, 0X28509110, 0X55557613,
+        0XFFC17311, 0XD961FFEF, 0XE532C287, 0XAAB95937, 0X46D38365, 0XB065C703, 0XF2D91D0F, 0X92CD4BB0,
+        0X4007C712, 0XF35509DD, 0X505B2F69, 0X557EAD81, 0X310F4563, 0XBDDC5BE8, 0X9760F38C, 0X701E0205,
+        0X00157244, 0X14912826, 0XDC4CA32B, 0X67B196DE, 0X5DB292E8, 0X8C1B406B, 0X01F34075, 0XFA2520F7,
+        0X73BC37AB, 0X1E18BC30, 0XFE2C6CB3, 0X20C522D0, 0X5639E3DB, 0X942BDA35, 0X899AF9D1, 0XCED44035,
+        0X98CC025B, 0X255F5771, 0X70FEFA24, 0XE928FA4D, 0X2C030405, 0XB9325590, 0X20CB63BD, 0XA166305D,
+        0X80E52C0A, 0XA8FAFE2F, 0X1AD13F7D, 0XCFAF3685, 0X6C83A199, 0X7D26718A, 0XDE5DFCD9, 0X79CF7355,
+        0X8979D7FB, 0XEBF8C55E, 0XEBE408E4, 0XCD2AFFBA, 0XE483BE6E, 0XE239D6DE, 0X5DC1E9E0, 0X0473931F,
+        0X851B097C, 0XAC5DB249, 0X09C0F9F2, 0XD8D2F134, 0XE6F38E41, 0XB1C71BF1, 0X52B6E4DB, 0X07224424,
+        0X6CF73E85, 0X4F25D89C, 0X782A7D74, 0X10A68DCD, 0X3A868189, 0XD570D2DC, 0X69630745, 0X9542ED86,
+        0X331CD6B2, 0XA84B5B28, 0X07879C9D, 0X38372F64, 0X7185DB11, 0X25BA7C83, 0X01061523, 0XE6792F9F,
+        0XE5DF07D1, 0X4321B47F, 0X7D2469D8, 0X1A3A4F90, 0X48BE29A3, 0X669071AF, 0X8EC8DD31, 0X0810BFBF,
+        0X813A06B4, 0X68538345, 0X65865DDC, 0X43A71B8E, 0X78619A56, 0X5A34451D, 0X5BDAA3ED, 0X71EDC7E9,
+        0X17AC9A20, 0X78D10BFA, 0X6C1E7F35, 0XD51839D9, 0X240CBC51, 0X33513CC1, 0XD2B4F795, 0XCCAA8186,
+        0X0BABE682, 0XA33CF164, 0X18C643EA, 0XC1CA105F, 0X9959147A, 0X6D3D94DE, 0X0B654FBE, 0XED902CA0,
+        0X7D835CB5, 0X99BA1509, 0X6445C922, 0X495E76C2, 0XF07194BC, 0XA1631D7E, 0X677076A5, 0X89FFFE35,
+        0X1A49BCF3, 0X8E6C948A, 0X0144C917, 0X8D93AEA1, 0X16F87DDF, 0XC8F25D49, 0X1FB11297, 0X27E750CD,
+        0X2F422DA1, 0XDEE89A77, 0X1534C643, 0X457B7B8B, 0XAF172F7A, 0X6B9B09D6, 0X33573F7F, 0XF14E15C4,
+        0X526467D5, 0XAF488241, 0X87C3EE0D, 0X33BE490C, 0X95AA6E52, 0X43EC242E, 0XD77DE99B, 0XD018334F,
+        0X5B78D407, 0X498EB66B, 0XB1279FA8, 0XB38B0EA6, 0X90718376, 0XE325DEE2, 0X8E2F2CBA, 0XCAA5BDEC,
+        0X9D652C56, 0XAD68F5CB, 0XA77591AF, 0X88E37EE8, 0XF8FAA221, 0XFCBBBE47, 0X4F407786, 0XAF393889,
+        0XF444A1D9, 0X15AE1A2F, 0X40AA7097, 0X6F9486AC, 0X29D232A3, 0XE47609E9, 0XE8B631FF, 0XBA8565F4,
+        0X11288749, 0X46C9A838, 0XEB1B7CD8, 0XF516BBB1, 0XFB74FDA0, 0X010996E6, 0X4C994653, 0X1D889512,
+        0X53DCD9A3, 0XDD074697, 0X1E78E17C, 0X637C98BF, 0X930BB219, 0XCF7F75B0, 0XCB9355FB, 0X9E623009,
+        0XE466D82C, 0X28F968D3, 0XFEB385D9, 0X238E026C, 0XB8ED0560, 0X0C6A027A, 0X3D6FEC4B, 0XBB4B2EC2,
+        0XE715031C, 0XEDED011D, 0XCDC4D3B9, 0XC456FC96, 0XDD0EEA20, 0XB3DF8EC9, 0X12351993, 0XD9CBB01C,
+        0X603147A2, 0XCF37D17D, 0XF7FCD9DC, 0XD8556FA3, 0X104C8131, 0X13152774, 0XB4715811, 0X6A72C2C9,
+        0XC5AE37BB, 0XA76CE12A, 0X8150D8F3, 0X2EC29218, 0XA35F0984, 0X48C0647E, 0X0B5FF98C, 0X71893F7B
+    };
+    u32 mask = (1 << (msb64(avg) - 1)) - 1;
+
+#define WINDOW_SIZE 2047
+#define BARREL_SHIFT(v, shift) ( ((v) << (shift)) | ((v) >> ((32 - (shift)) & 0x1F)) )
+
+    const u8 *d = data;
+    u32 i, sum, imod;
+
+    if (dlen <= min + WINDOW_SIZE) return dlen;
+
+    sum = 0;
+    for (i = WINDOW_SIZE - 1; i > 0; --i) {
+        imod = i & 0x1F;
+        sum ^= BARREL_SHIFT(T[*d], imod);
+        ++d;
+    }
+    sum ^= T[*d];
+
+    d = data;
+    for (i = WINDOW_SIZE; i < dlen - WINDOW_SIZE; ++i) {
+        sum = BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(T[*d], 0x1F) ^ T[d[WINDOW_SIZE]];
+        if ((sum & mask) == 0) break;
+        ++d;
+    }
+    if (i >= dlen - WINDOW_SIZE)
+        i = dlen;
+
+    return i;
+}
diff --git a/src/hashchop.c b/src/hashchop.c
new file mode 100644
index 0000000..18b8184
--- /dev/null
+++ b/src/hashchop.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2011-2012 Scott Vokes <vokes.s@gmail.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <string.h> /* size_t */
+#include <limb/int.h>
+
+size_t
+nextsplit(size_t min, size_t avg, const void *data, size_t dlen)
+{
+    if (dlen <= min) return dlen;
+    u32 mask = (1 << (msb64(avg) - 1)) - 1;
+    const u8 *buf = data;
+
+    u32 a = 1, b = 0;
+    size_t offset;
+
+    for (u32 i = 0; i < min; ++i) {
+        a += buf[i];
+        b += (min - i + 1) * buf[i];
+    }
+    a &= mask;
+    b &= mask;
+
+    for (offset = min; offset < dlen; ++offset) {
+        u32 k = offset - min;
+        u32 l = offset;
+        u8 nk = buf[k];
+        u8 nl = buf[l];
+        u32 na = (a - nk + nl);
+        u32 nb = (b - (l - k + 1) * nk + na);
+        u32 checksum = (na + (nb << 16)) & mask;
+        if (checksum == 0) break;
+        a = na;
+        b = nb;
+    }
+    return offset;
+}
diff --git a/src/rabin-mask.c b/src/rabin-mask.c
new file mode 100644
index 0000000..2e3c9e0
--- /dev/null
+++ b/src/rabin-mask.c
@@ -0,0 +1,42 @@
+#include <string.h> /* size_t */
+#include <limb/int.h>
+#include "rabin-tables.h"
+
+#define BREAKMARK_VALUE 0x78
+
+size_t
+nextsplit(size_t min, size_t avg, const void *data, size_t dlen)
+{
+    if (dlen <= min) return dlen;
+    u64 sml_mask = (1 << (msb64(avg * 2) - 1)) - 1;
+    u64 lrg_mask = (1 << (msb64(avg / 2) - 1)) - 1;
+    u64 fp = 0;
+    int i = min, wpos = -1;
+
+    const u8 *sce = data;
+    u8 window[RABIN_WINDOW_SIZE] = { 0 };
+
+    while (i < dlen) {
+        u8 om;
+        u64 x;
+        if (++wpos >= sizeof(window))
+            wpos = 0;
+        om = window[wpos];
+        window[wpos] = sce[i - 1];
+        fp ^= rabin_tables.U[om];
+        x = fp >> 55;
+        fp <<= 8;
+        fp |= sce[i - 1];
+        fp ^= rabin_tables.T[x];
+
+        if (i >= min) {
+            if (i < avg) {
+                if ((fp & sml_mask) == BREAKMARK_VALUE)
+                    break;
+            } else if ((fp & lrg_mask) == BREAKMARK_VALUE)
+                break;
+        }
+        ++i;
+    }
+    return i;
+}
diff --git a/src/rabin-mix.c b/src/rabin-mix.c
new file mode 100644
index 0000000..dcd3f2b
--- /dev/null
+++ b/src/rabin-mix.c
@@ -0,0 +1,49 @@
+#include <string.h> /* size_t */
+#include <limb/int.h>
+#include "rabin-tables.h"
+
+size_t
+nextsplit(size_t min, size_t avg, const void *data, size_t dlen)
+{
+    if (dlen <= min) return dlen;
+    u8 window[RABIN_WINDOW_SIZE] = { 0 };
+    const u8 *b = data;
+    size_t wpos = 0, pos = min, last = 0;
+    u64 fp = 0;
+    u64 mask, lrg_mask;
+    int tttd = avg >= (128 << 10);
+    if (tttd)
+        mask = (1 << (msb64(avg    ) - 1)) - 1;
+    else
+        mask = (1 << (msb64(avg * 2) - 1)) - 1;
+    lrg_mask = (1 << (msb64(avg / 2) - 1)) - 1;
+
+#define TTTD_BREAK      0x78
+#define APPEND8(p,m)    (((p) << 8) | m) ^ rabin_tables.T[(p) >> 55]
+    for (;;) {
+        u8 old = window[wpos];
+        window[wpos] = b[pos];
+        fp = APPEND8(fp ^ rabin_tables.U[old], window[wpos]);
+        if (++wpos == sizeof(window)) wpos = 0;
+
+        if (pos == dlen) {
+            if (last) return last;
+            return pos;
+        }
+        if (pos >= min) {
+            if (tttd) {
+                if ((fp & lrg_mask) == TTTD_BREAK) {
+                    if ((fp & mask) == TTTD_BREAK)
+                        return pos;
+                    last = pos;
+                }
+            } else if (pos < avg) {
+                if ((fp & mask) == mask)
+                    return pos;
+            } else if ((fp & lrg_mask) == lrg_mask) {
+                return pos;
+            }
+        }
+        ++pos;
+    }
+}
diff --git a/src/rabin-tttd.c b/src/rabin-tttd.c
new file mode 100644
index 0000000..6c1c9c5
--- /dev/null
+++ b/src/rabin-tttd.c
@@ -0,0 +1,45 @@
+#include <string.h> /* size_t */
+#include <limb/int.h>
+#include "rabin-tables.h"
+
+#define BREAKMARK_VALUE 0x78
+
+size_t
+nextsplit(size_t min, size_t avg, const void *data, size_t dlen)
+{
+    if (dlen <= min) return dlen;
+    u64     mask = (1 << (msb64(avg    ) - 1)) - 1;
+    u64 lrg_mask = (1 << (msb64(avg / 2) - 1)) - 1;
+    u64 fp = 0;
+    int i = min, wpos = -1;
+    int m = 0;
+
+    const u8 *sce = data;
+    u8 window[RABIN_WINDOW_SIZE] = { 0 };
+
+    while (i < dlen) {
+        u8 om;
+        u64 x;
+        if (++wpos >= sizeof(window))
+            wpos = 0;
+        om = window[wpos];
+        window[wpos] = sce[i - 1];
+        fp ^= rabin_tables.U[om];
+        x = fp >> 55;
+        fp <<= 8;
+        fp |= sce[i - 1];
+        fp ^= rabin_tables.T[x];
+
+        if (i >= min) {
+            if ((fp & lrg_mask) == BREAKMARK_VALUE) {
+                if ((fp & mask) == BREAKMARK_VALUE)
+                    return i;
+                m = i;
+            }
+        }
+        ++i;
+    }
+    if (m)
+        return m;
+    return i;
+}
diff --git a/src/rabin.c b/src/rabin.c
new file mode 100644
index 0000000..0140643
--- /dev/null
+++ b/src/rabin.c
@@ -0,0 +1,36 @@
+#include <string.h> /* size_t */
+#include <limb/int.h>
+#include "rabin-tables.h"
+
+#define BREAKMARK_VALUE 0x78
+
+size_t
+nextsplit(size_t min, size_t avg, const void *data, size_t dlen)
+{
+    if (dlen <= min) return dlen;
+    u64 mask = (1 << (msb64(avg) - 1)) - 1;
+    u64 fp = 0;
+    int i = min, wpos = -1;
+
+    const u8 *sce = data;
+    u8 window[RABIN_WINDOW_SIZE] = { 0 };
+
+    while (i < dlen) {
+        u8 om;
+        u64 x;
+        if (++wpos >= sizeof(window))
+            wpos = 0;
+        om = window[wpos];
+        window[wpos] = sce[i - 1];
+        fp ^= rabin_tables.U[om];
+        x = fp >> 55;
+        fp <<= 8;
+        fp |= sce[i - 1];
+        fp ^= rabin_tables.T[x];
+
+        if (i >= min && (fp & mask) == BREAKMARK_VALUE)
+            break;
+        ++i;
+    }
+    return i;
+}
diff --git a/src/test.c b/src/test.c
new file mode 100644
index 0000000..e23b2c9
--- /dev/null
+++ b/src/test.c
@@ -0,0 +1,244 @@
+#include <dirent.h>
+#include <errno.h>
+#include <time.h>
+#include <skalibs/stralloc.h>
+#include <skalibs/unix-transactional.h>
+#include <skalibs/djbunix.h>
+#include <skalibs/uint32.h>
+#include <skalibs/uint16.h>
+#include <skalibs/fmtscan.h>
+#include <limb/int.h>
+#include <limb/output.h>
+#include <limb/blake3.h>
+
+const char *PROG = "test";
+
+extern size_t nextsplit(size_t min, size_t avg, const void *data, size_t dlen);
+
+struct bench {
+    size_t min;
+    size_t avg;
+    size_t max;
+    int iter;
+    stralloc *sa;
+    u8 *blocks;
+    size_t blkpos;
+    size_t pos;
+    size_t total;
+    int hashes;
+};
+
+static void
+bench(struct bench *b)
+{
+    size_t pos, curblkpos = b->blkpos;
+    for (int i = 0; i < b->iter; ++i) {
+        curblkpos = b->blkpos;
+        pos = b->pos;
+        while (pos < b->sa->len) {
+            size_t len = b->sa->len - pos;
+            size_t offset = nextsplit(b->min, b->avg,
+                                      b->sa->s + pos, (len > b->max) ? b->max : len);
+            curblkpos += u64_pack_trim((u64) offset, b->blocks + curblkpos);
+            /* special case: when scanning directories, we don't keep the
+             * entire dataset in memory, so we need to compute/print hashes
+             * right now within the benchmarking. */
+            if (i == 0 && b->hashes) {
+                unsigned char buf[32];
+                blake3(b->sa->s + pos, offset, buf);
+                char hash[32 * 2 + 1] = { 0 };
+                for (int i = 0; i < sizeof(buf); ++i) {
+                    if (buf[i] & 0xf0)
+                        hash[i * 2] = fmtscan_asc(buf[i] >> 4);
+                    else
+                        hash[i * 2] = '0';
+                    hash[i * 2 + 1] = fmtscan_asc(buf[i] & 0x0f);
+                }
+                char size[UINT32_FMT];
+                size[uint32_fmt(size, offset)] = 0;
+                out(hash, " ", size);
+            }
+            pos += offset;
+        }
+    }
+    b->blkpos = curblkpos;
+    b->total += b->sa->len - b->pos;
+}
+
+static int
+processdir(int basefd, const char *name, struct bench *b)
+{
+    int ret = -1;
+    int fd;
+    DIR *dir;
+    struct dirent *de;
+
+    dir = NULL;
+    fd = open_readat(basefd, name);
+    if (fd < 0) goto err;
+
+    dir = fdopendir(fd);
+    if (!dir) goto err;
+    fd = -1;
+
+    for (;;) {
+        errno = 0;
+        de = readdir(dir);
+        if (!de) {
+            if (errno) goto err;
+            break;
+        }
+        if (de->d_name[0] == '.' &&
+                (!de->d_name[1] || (de->d_name[1] == '.' && !de->d_name[2])))
+            continue;
+
+        size_t salen = b->sa->len;
+        fd = open_readat(dirfd(dir), de->d_name);
+        if (fd < 0 || !slurp(b->sa, fd)) {
+            if (errno == EISDIR) {
+                fd_close(fd);
+                if (processdir(dirfd(dir), de->d_name, b) < 0)
+                    goto err;
+                continue;
+            }
+            if (fd >= 0) fd_close(fd);
+            warnusys("process ...", name, "/", de->d_name);
+            continue;
+        }
+        fd_close(fd);
+        fd = -1;
+
+        bench(b);
+        b->sa->len = salen;
+    }
+
+    ret = 0;
+err:
+    if (fd >= 0) fd_close(fd);
+    if (dir) closedir(dir);
+    return ret;
+}
+
+#include <stdio.h>
+int
+main(int argc, const char *argv[])
+{
+    struct timespec ts1, ts2;
+    stralloc sa = STRALLOC_ZERO;
+    u8 blocks[800 << 10];
+    struct bench b = {
+        .min = (  4 << 10),
+        .avg = (  8 << 10),
+        .max = (  1 << 20),
+        .iter = 1,
+        .sa = &sa,
+        .blocks = blocks,
+    };
+    int list = 0;
+
+    while (argc >= 2) {
+        if (!strncmp(argv[1], "-a", 2) || !strncmp(argv[1], "-m", 2)
+                || !strncmp(argv[1], "-M", 2)) {
+            const char *s = NULL;
+            if (!argv[1][2])
+                s = argv[2];
+            else if (argc < 3)
+                dief(1, "missing value for ", argv[1]);
+            else
+                s = argv[1] + 2;
+
+            u32 u;
+            if (!uint32_scan(s, &u))
+                dief(1, "invalid value for ", argv[1]);
+            switch (argv[1][1]) {
+                case 'a': b.avg = u; break;
+                case 'm': b.min = u; break;
+                case 'M': b.max = u; break;
+            }
+
+            int e = (s == argv[2]) ? 1 : 0;
+            argc -= 1 + e;
+            memmove(&argv[1], &argv[2 + e], argc * sizeof(*argv));
+        } else if (!strcmp(argv[1], "-l")) {
+            list = 1;
+        } else if (!strcmp(argv[1], "-H")) {
+            list = 2;
+        } else {
+            break;
+        }
+
+        if (list)
+            memmove(&argv[1], &argv[2], --argc * sizeof(*argv));
+    }
+
+    if (argc != 2 && argc != 3)
+        dieusage(1, "[-a AVGSIZE] [-m MINSIZE] [-M MAXSIZE] [-l | -H] FILE [ITER]");
+
+    if (argc == 3) {
+        u16 u;
+        if (!uint16_scan(argv[2], &u))
+            dief(1, "invalid ITER argument");
+        b.iter = u;
+    }
+
+    int fd = open_read(argv[1]);
+    if (fd < 0 || !slurp(&sa, fd)) {
+        if (errno != EISDIR)
+            diefusys(2, "read ", argv[1]);
+        fd_close(fd);
+        fd = -1;
+        if (list == 2) {
+            b.hashes = 1;
+            list = 0;
+        }
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &ts1);
+    if (fd < 0)
+        processdir(AT_FDCWD, argv[1], &b);
+    else
+        bench(&b);
+    clock_gettime(CLOCK_MONOTONIC, &ts2);
+    blocks[b.blkpos] = 0;
+
+    if (fd >= 0) fd_close(fd);
+
+    size_t min = b.total;
+    size_t max = 0;
+    size_t total = 0;
+    int n = 1;
+    size_t pos = 0;
+    for (int o = 0; blocks[o]; ++n) {
+        u64 u;
+        o += u64_unpack_trim(blocks + o, &u);
+        if (list) {
+            if (list == 2) {
+                unsigned char buf[32];
+                blake3(sa.s + pos, u, buf);
+                for (int i = 0; i < sizeof(buf); ++i)
+                    fprintf(stdout, "%.02x", buf[i]);
+                pos += u;
+            } else if (list == 1) {
+                fprintf(stdout, "block:");
+            }
+            fprintf(stdout, " %lu\n", u);
+        }
+        if (u < min && blocks[o]) min = u;
+        if (u > max) max = u;
+        total += u;
+    }
+    --n;
+    if (total != b.total) {
+        fprintf(stderr, "incorrect total size %lu != %lu\n", total, b.total);
+        return 2;
+    }
+    fprintf(stderr, "%u blocks; min=%lu, avg=%lu, max=%lu\n", n, min, total / n, max);
+
+    ts2.tv_sec -= ts1.tv_sec;
+    ts2.tv_nsec -= ts1.tv_nsec;
+    double took = ts2.tv_sec + (ts2.tv_nsec / 1000000000.0);
+    double speed = ((b.total * b.iter) / took) / (1 << 20);
+    fprintf(stderr, "took %.09f seconds, %f MiB/s\n", took, speed);
+
+    return 0;
+}
diff --git a/tests b/tests
new file mode 100755
index 0000000..732adc1
--- /dev/null
+++ b/tests
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+for file in test-*; do
+    if test $# -eq 0; then
+        ./"$file"
+        exit 1
+    fi
+    echo "$file:"
+    ./"$file" "$@"
+done
diff --git a/tests-avgs b/tests-avgs
new file mode 100755
index 0000000..cbcceb9
--- /dev/null
+++ b/tests-avgs
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+run_dedup() {
+    for file in test-*; do
+    algo=$(expr substr "$file" 6 ${#file})
+        ./tests-dedup $algo "$@" >>dedup.log 2>&1
+    done
+}
+
+printf '\n%s\n\n' "8 KiB" >> dedup.log
+run_dedup -a8192 "$@"
+printf '\n%s\n\n' "16 KiB" >> dedup.log
+run_dedup -a16384 "$@"
+printf '\n%s\n\n' "32 KiB" >> dedup.log
+run_dedup -a32768 "$@"
+printf '\n%s\n\n' "64 KiB" >> dedup.log
+run_dedup -a65536 "$@"
+printf '\n%s\n\n' "128 KiB" >> dedup.log
+run_dedup -a131072 "$@"
+printf '\n%s\n\n' "256 KiB" >> dedup.log
+run_dedup -a262144 "$@"
+printf '\n%s\n\n' "512 KiB" >> dedup.log
+run_dedup -a524288 "$@"
+printf '\n%s\n\n' "1024 KiB" >> dedup.log
+run_dedup -a1048576 -M2097152 "$@"
diff --git a/tests-dedup b/tests-dedup
new file mode 100755
index 0000000..39db122
--- /dev/null
+++ b/tests-dedup
@@ -0,0 +1,56 @@
+#!/bin/sh
+
+dieusage() {
+    echo usage: $0 ALGO [-a AVGSIZE] [-m MINSIZE] [-M MAXSIZE] FILE [FILE2]
+    exit 1
+}
+
+if test $# -le 1; then dieusage; fi
+
+algo="$1"
+shift
+
+if test ! -e ./test-"$algo"; then
+    echo $0: fatal: test-$algo not found >&2
+    exit 1
+fi
+
+calcdedup() {
+    minus=0
+    if test "$1" = "-1"; then minus=1; fi
+
+    dedup=0
+    while read -r count hash size <&0; do
+        count=$(($count - $minus))
+        size=$(($count * $size))
+        dedup=$(($dedup + $size))
+    done
+    echo dedup: $dedup
+}
+
+args=
+for arg; do
+    case "$arg" in
+        -a|-m|-M)
+            args="$args $arg $2"
+            shift 2
+            ;;
+        -a*|-m*|-M*)
+            args="$args $arg"
+            shift
+            ;;
+        -*)
+            dieusage
+            ;;
+    esac
+done
+
+echo "* $algo:"
+echo $1
+./test-$algo $args -H "$1" | sort > $algo.1
+uniq -cd $algo.1 | calcdedup -1
+if test -n "$2"; then
+    echo $2
+    ./test-$algo $args -H "$2" | sort | comm -12 $algo.1 - | uniq -c | calcdedup
+fi
+rm $algo.1