Code » test-rolling-hash » commit 10b2c1c
First commit
author | Olivier Brunel
<jjk@jjacky.com> 2023-02-14 10:10:16 UTC |
committer | Olivier Brunel
<jjk@jjacky.com> 2023-02-15 16:42:04 UTC |
parent | da4eeed5f3f27ff6d816a47cd4463b71479b2409 |
.gitignore | +4 | -0 |
include/rabin-tables.h | +142 | -0 |
meta/AUTHORS | +6 | -0 |
meta/bins/test-ae | +4 | -0 |
meta/bins/test-ae-32 | +4 | -0 |
meta/bins/test-buz | +4 | -0 |
meta/bins/test-hashchop | +4 | -0 |
meta/bins/test-rabin | +4 | -0 |
meta/bins/test-rabin-mask | +4 | -0 |
meta/bins/test-rabin-mix | +4 | -0 |
meta/bins/test-rabin-tttd | +4 | -0 |
meta/deps/comain/version | +1 | -1 |
meta/deps/limb/configure | +0 | -0 |
meta/deps/limb/files | +0 | -0 |
meta/deps/limb/git | +1 | -0 |
meta/deps/limb/static | +0 | -0 |
meta/deps/limb/version | +1 | -0 |
meta/deps/skalibs/get_version | +2 | -0 |
meta/deps/skalibs/git | +1 | -0 |
meta/deps/skalibs/incdir | +1 | -0 |
meta/deps/skalibs/include | +0 | -0 |
meta/deps/skalibs/libdir | +1 | -0 |
meta/deps/skalibs/library | +1 | -0 |
meta/deps/skalibs/version | +1 | -0 |
meta/desc | +19 | -0 |
meta/doc | +0 | -1 |
meta/site | +0 | -1 |
project.mk | +8 | -7 |
src/ae-32.c | +56 | -0 |
src/ae.c | +56 | -0 |
src/buz.c | +102 | -0 |
src/hashchop.c | +50 | -0 |
src/rabin-mask.c | +42 | -0 |
src/rabin-mix.c | +49 | -0 |
src/rabin-tttd.c | +45 | -0 |
src/rabin.c | +36 | -0 |
src/test.c | +244 | -0 |
tests | +10 | -0 |
tests-avgs | +25 | -0 |
tests-dedup | +56 | -0 |
diff --git a/.gitignore b/.gitignore index 7ccd516..37b95b9 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,7 @@ *.o *.lo *.d +/test-* +/limb +/limb.built +/skalibs diff --git a/include/rabin-tables.h b/include/rabin-tables.h new file mode 100644 index 0000000..803fdb9 --- /dev/null +++ b/include/rabin-tables.h @@ -0,0 +1,142 @@ +#ifndef RABIN_TABLES_H +#define RABIN_TABLES_H + +#include <limb/int.h> + +#define RABIN_WINDOW_SIZE 32 + +static struct { + u64 T[256]; + u64 U[256]; +} rabin_tables = { { +0ULL, 13827942727904890243ULL, 9209141382100228870ULL, 13847383506563929733ULL, +4646688091768732559ULL, 18418282764200457740ULL, 4589497290695549065ULL, 9248022939418307850ULL, +4548086706303466141ULL, 9293376183537465118ULL, 4671215262765932955ULL, 18389821454691363864ULL, +9178994581391098130ULL, 13872474804339788945ULL, 49301805127064084ULL, 13783723088868136855ULL, +9096173412606932282ULL, 13968654369411725497ULL, 140008293365378620ULL, 13697645939728147391ULL, +4485659674370133685ULL, 9342430525531865910ULL, 4723487651964868019ULL, 18332898835673176112ULL, +4693337148998094759ULL, 18357989162782196260ULL, 4534953362004776097ULL, 9298205534970026274ULL, +98603610254128168ULL, 13743006751583844779ULL, 9120702104026722094ULL, 13940196212404396717ULL, +4872485406381467639ULL, 18192346825213864564ULL, 4347093914495974641ULL, 9490564665113899378ULL, +280016586730757240ULL, 13548069301033521659ULL, 8948547805746743166ULL, 14107834197935999741ULL, +8971319348740267370ULL, 14080011446993228009ULL, 238116977354180204ULL, 13595046231760440303ULL, +4394630711393986277ULL, 9446975303929736038ULL, 4841840269215546851ULL, 18219053597636800608ULL, +4441272913864004301ULL, 9386674297996189518ULL, 4787294948947036619ULL, 18269234251854840904ULL, +9069906724009552194ULL, 13995059785443288257ULL, 149666996230500932ULL, 13687848784306683847ULL, +197207220508256336ULL, 13644260118911444435ULL, 9039269429458137942ULL, 14021771634513411797ULL, +4810060865537627103ULL, 18241404208053444188ULL, 4399372058942752985ULL, 9433648351099241818ULL, +4097061761524384365ULL, 9744970812762935278ULL, 5122526527230073195ULL, 17937949576718177512ULL, +8694187828991949282ULL, 14357842170245903457ULL, 534385256518247140ULL, 13298069953032646503ULL, +560033173461514480ULL, 13268479238923367795ULL, 8649394528357491702ULL, 14406569746831191669ULL, +5168435857949066111ULL, 17897095611493486332ULL, 4068026223473531001ULL, 9768924322162447866ULL, +5109252900731139927ULL, 17942638697480534740ULL, 4119323861271030865ULL, 9713278820276904402ULL, +476233954708360408ULL, 13365932622558560603ULL, 8743348389811328990ULL, 14316984640186325597ULL, +8789261422787972554ULL, 14276131645628465225ULL, 447206534149920460ULL, 13389891483483167567ULL, +4144965876933457477ULL, 9683680538431093702ULL, 5064458079674101059ULL, 17991363121564049600ULL, +8882545827728008602ULL, 14169341458345928729ULL, 326604522282827420ULL, 13505993675309822751ULL, +4267581128949538325ULL, 9574589897894073238ULL, 4968612946513855763ULL, 18091724430000130192ULL, +4925575273883098887ULL, 18139813448019104388ULL, 4293718070877275137ULL, 9543375497177024898ULL, +299333992461001864ULL, 13529316906856737035ULL, 8928953494903816078ULL, 14126872018474168845ULL, +394414441016512672ULL, 13447613786754473251ULL, 8841776164113337254ULL, 14218695490258224677ULL, +4973495624276863791ULL, 18078538858916275884ULL, 4235660326371356713ULL, 9596799195317271978ULL, +4208386369169795645ULL, 9620121731075254206ULL, 5019895448838156603ULL, 18036064342397336760ULL, +8798744117885505970ULL, 14266791801135832113ULL, 420552628488932020ULL, 13416402263661283127ULL, +8194123523048768730ULL, 14866918157251578201ULL, 1043197551816318940ULL, 12798269171274486367ULL, +3587967976203566933ULL, 10245053054460146390ULL, 5622309373242516563ULL, 17429155079726803408ULL, +5668153059635265095ULL, 17388375657983898564ULL, 3559007428260750657ULL, 10268940266782255298ULL, +1068770513036494280ULL, 12768744788610795595ULL, 8149395832355741390ULL, 14915571155987026765ULL, +1120066346923028960ULL, 12713097482813373539ULL, 8090214404137183974ULL, 14961115770974627685ULL, +5762105426466228847ULL, 17298789056714983404ULL, 3475209979042058601ULL, 10366395419952831722ULL, +3500786384821783421ULL, 10336871715898132222ULL, 5717385561207771259ULL, 17347447149277421048ULL, +8136052446947062002ULL, 14920329073552899441ULL, 1091104570615344116ULL, 12736981800331683447ULL, +3614653578697433901ULL, 10218505801462279854ULL, 5612792507343960107ULL, 17438533321251517864ULL, +8238647722542061730ULL, 14822251072776885537ULL, 979813566844257188ULL, 12861796316105920039ULL, +952467909416720816ULL, 12885185707350004787ULL, 8285121171407569590ULL, 14779707226937790261ULL, +5569689087391539775ULL, 17486696779622657980ULL, 3640865613862713657ULL, 10187225206663099578ULL, +5482514350769116695ULL, 17578522845575945108ULL, 3735943193405689105ULL, 10105519217547378834ULL, +894413068299840920ULL, 12938612308880203803ULL, 8333038893256783518ULL, 14718430009289227037ULL, +8289931753866914954ULL, 14766592514173460745ULL, 920617003152635788ULL, 12907326345376135695ULL, +3708603454438921989ULL, 10128916159348202118ULL, 5528989302877408259ULL, 17535982169418547584ULL, +5290867652594615991ULL, 17765091655456017204ULL, 3936569740447636913ULL, 9891938842982305842ULL, +653209044565654840ULL, 13183745368696811707ULL, 8565243276910093886ULL, 14500293120999091133ULL, +8535162257899076650ULL, 14525310012799234473ULL, 702435722078594860ULL, 13139591889365609135ULL, +3895234249284683685ULL, 9937225893027711526ULL, 5315329076278566051ULL, 17736704786290708768ULL, +3985942541400290189ULL, 9851150547766197774ULL, 5232506378461325451ULL, 17832882822328657160ULL, +8587436141754550274ULL, 14468388888438506881ULL, 640006920644498180ULL, 13188644461858070151ULL, +598667984922003728ULL, 13233930833294476435ULL, 8611889740003922454ULL, 14439996925446217621ULL, +5202431003033170591ULL, 17857906989807632156ULL, 4035170447278107033ULL, 9806999963238786074ULL, +788828882033025344ULL, 13048268519271181507ULL, 8448483499799394886ULL, 14616910184945891269ULL, +5372268252389710543ULL, 17683552328226674508ULL, 3838000129179320777ULL, 9990646906806897738ULL, +3885611916177005533ULL, 9946991248553727582ULL, 5341557470905909467ULL, 17710333644123000152ULL, +8471320652742713426ULL, 14589012856144883153ULL, 746854316924992340ULL, 13095311781452084951ULL, +8416772738339591290ULL, 14639190916227389945ULL, 793499388440956796ULL, 13035013644565660415ULL, +3797159031630852085ULL, 10039790897676313206ULL, 5440147474686342387ULL, 17625384611085121904ULL, +5462988347067117287ULL, 17597488235771011940ULL, 3755192566835295713ULL, 10086839528562112610ULL, +841105256977864040ULL, 12991350435755759851ULL, 8386060453613014638ULL, 14665969062442009581ULL +}, { +0ULL, 568999561260842924ULL, 1137999122521685848ULL, 589833262815396084ULL, +2275998245043371696ULL, 1761055125200617756ULL, 1179666525630792168ULL, 1709813571542236740ULL, +4551996490086743392ULL, 4093365957697716940ULL, 3522110250401235512ULL, 3964419511316364692ULL, +2359333051261584336ULL, 2835991227898739836ULL, 3419627143084473480ULL, 2923270271204541220ULL, +9103992980173486784ULL, 8769246759817692524ULL, 8186731915395433880ULL, 8536682337754266164ULL, +7044220500802471024ULL, 7360965568252689372ULL, 7928839022632729384ULL, 7632927380768113796ULL, +4718666102523168672ULL, 5087220157816518668ULL, 5671982455797479672ULL, 5283729429325956948ULL, +6839254286168946960ULL, 6416670280833615548ULL, 5846540542409082440ULL, 6252803552238418404ULL, +4848541058636740611ULL, 4948334975574230959ULL, 5513398027029317467ULL, 5433303080062040311ULL, +6691366354261935795ULL, 6573562385112938783ULL, 5987111374761032171ULL, 6121236340355548743ULL, +8962878867740132707ULL, 8901352301823769295ULL, 8334041934752250427ULL, 8380364299873535383ULL, +7203352090154466259ULL, 7210839806991352959ULL, 7799546963378373771ULL, 7771225820016360231ULL, +4404127935990301379ULL, 4250242530309684591ULL, 3662665828094967195ULL, 3832872504041521719ULL, +2489192475691547763ULL, 2697125423483274207ULL, 3261062918908085035ULL, 3072828667298659463ULL, +159151793960550307ULL, 418858545340760079ULL, 1008691531600494843ULL, 728151079873691479ULL, +2134868877968572691ULL, 1893180871797948095ULL, 1326995922814959179ULL, 1553480001977369063ULL, +4140017981295394181ULL, 4508440215577772585ULL, 3944784144731144925ULL, 3556663179771985265ULL, +2801718574537737013ULL, 2379248763918204057ULL, 2966544078196702317ULL, 3372692584055817153ULL, +459293339567948005ULL, 124626129572642633ULL, 690529550898757565ULL, 1040400653676232721ULL, +1858373128789327445ULL, 2175021559617438201ULL, 1585470133093138701ULL, 1289655368385576609ULL, +5126131060248446789ULL, 4667650181573556457ULL, 5253819664402282525ULL, 5695979512138221489ULL, +6392390533637765621ULL, 6868881429762295385ULL, 6304099938176337581ULL, 5807910037588162817ULL, +8649266140029483557ULL, 9218063236388832649ULL, 8629637504445252989ULL, 8081673800386034385ULL, +7466306151047333013ULL, 6951547869274028857ULL, 7518576848274586573ULL, 8048539296617972833ULL, +8808255871980602758ULL, 9068077127062600234ULL, 8500485060619369182ULL, 8219830414445200754ULL, +7325331656189934390ULL, 7083511588238509210ULL, 7665745008083043438ULL, 7892360908524862402ULL, +4978384951383095526ULL, 4824402668276991818ULL, 5394250846966548414ULL, 5564554159803643922ULL, +6522125837816170070ULL, 6730138104920816122ULL, 6145657334597318926ULL, 5957344072895841954ULL, +318303587921100614ULL, 256610050981413098ULL, 837717090681520158ULL, 884206736046997426ULL, +2017383063200989686ULL, 2025020193485504090ULL, 1456302159747382958ULL, 1427831362402795778ULL, +4269737755937145382ULL, 4369716270711127434ULL, 3786361743595896190ULL, 3706081958290735826ULL, +2653991845629918358ULL, 2535985721102287674ULL, 3106960003954738126ULL, 3241287434182311010ULL, +8280035962590788362ULL, 8434651392460768422ULL, 9016880431155545170ULL, 8847069596582984702ULL, +7889568289462289850ULL, 7680922806604282390ULL, 7113326359543970530ULL, 7301147211969348942ULL, +5603437149075474026ULL, 5342982678240115142ULL, 4758497527836408114ULL, 5038659764734477982ULL, +5933088156393404634ULL, 6175541371108706166ULL, 6745385168111634306ULL, 6519261745934350382ULL, +918586679135896010ULL, 817974673999609446ULL, 249252259145285266ULL, 329039360028524862ULL, +1381059101797515130ULL, 1499698648235884758ULL, 2080801307352465442ULL, 1946966629999427470ULL, +3716746257578654890ULL, 3779073216429081350ULL, 4350043119234876402ULL, 4304046226743934046ULL, +3170940266186277402ULL, 3162669645539458486ULL, 2579310736771153218ULL, 2607288849693819630ULL, +3576388832492159753ULL, 3910422620844113061ULL, 4497722316809239633ULL, 4147358460889842685ULL, +3329315614885313977ULL, 3013300674686894613ULL, 2449648983920476897ULL, 2745956432782572877ULL, +1047669660451831401ULL, 679880916799144389ULL, 90333870891925809ULL, 478947520005619357ULL, +1233962491836858585ULL, 1655798880813039477ULL, 2221706683288911745ULL, 1815065424287875117ULL, +5761796968237857225ULL, 5193633087362385509ULL, 4628855977429501585ULL, 5177312091034302781ULL, +5792750111264442233ULL, 6306875246004386005ULL, 6893049108879767585ULL, 6362594182541330317ULL, +8132924095840683177ULL, 8590771827482213125ULL, 9157805187066297329ULL, 8715152861335307357ULL, +8018671473238768153ULL, 7542813792598115765ULL, 6954392441770739009ULL, 7451074751904496365ULL, +5448463561577301647ULL, 5510623274259167523ULL, 4924167849500867031ULL, 4878337962643539579ULL, +6055420538959001663ULL, 6047299606370273171ULL, 6612358241812547431ULL, 6640186975897991371ULL, +8410230468549049327ULL, 8309803335859717187ULL, 8898506076770796727ULL, 8978108614461504283ULL, +7719974969369724255ULL, 7838412085284223731ULL, 7271097303269440007ULL, 7137464815938880939ULL, +3876251125767896143ULL, 3615910884025844707ULL, 4179843965010707223ULL, 4459891732314078395ULL, +3053103567622800127ULL, 3295424995704749395ULL, 2707843877443665319ULL, 2481852551154156043ULL, +783843403378688303ULL, 938362230970629763ULL, 372172866478145143ULL, 202458943404491227ULL, +1546209342646353823ULL, 1337642904492785715ULL, 1927471475775701191ULL, 2115213042961694571ULL, +636607175842201228ULL, 1094604286589387040ULL, 513220101962826196ULL, 70418087905312376ULL, +1675434181363040316ULL, 1199409495357210512ULL, 1768413472093994852ULL, 2265263028127875272ULL, +4034766126401979372ULL, 3466400055772462144ULL, 4050040386971008180ULL, 4598698930831246104ULL, +2912604319494765916ULL, 3426914017695679216ULL, 2855662724805591556ULL, 2325022925751762344ULL, +8539475511874290764ULL, 8171554672065395680ULL, 8739432541422254868ULL, 9128177977437734072ULL, +7572723487191792380ULL, 7994674345494234448ULL, 7412163916581471652ULL, 7005408428755565064ULL, +5307983691259836716ULL, 5642096764582741632ULL, 5071971442204575348ULL, 4721528541849172440ULL, +6213920007909476252ULL, 5897808155943712816ULL, 6482574868364622020ULL, 6778978919773194088ULL +} }; +#endif /* RABIN_TABLES_H */ diff --git a/meta/AUTHORS b/meta/AUTHORS index 6aef6f1..b2c6dc6 100644 --- a/meta/AUTHORS +++ b/meta/AUTHORS @@ -1,2 +1,8 @@ Main author: * Olivier Brunel <jjk@jjacky.com> + +Contributors: +* Jonas Borgström & The Borg Collective [buz] +* Min Fu [rabin, rabin-mask, rabin-tttd] +* Scott Vokes <vokes.s@gmail.com> [hashchop] +* Yucheng Zhang [ae] diff --git a/meta/bins/test-ae b/meta/bins/test-ae new file mode 100644 index 0000000..038c0c6 --- /dev/null +++ b/meta/bins/test-ae @@ -0,0 +1,4 @@ +obj/test.o +limb +skalibs +obj/ae.o diff --git a/meta/bins/test-ae-32 b/meta/bins/test-ae-32 new file mode 100644 index 0000000..41a6f7e --- /dev/null +++ b/meta/bins/test-ae-32 @@ -0,0 +1,4 @@ +obj/test.o +limb +skalibs +obj/ae-32.o diff --git a/meta/bins/test-buz b/meta/bins/test-buz new file mode 100644 index 0000000..2e005f4 --- /dev/null +++ b/meta/bins/test-buz @@ -0,0 +1,4 @@ +obj/test.o +limb +skalibs +obj/buz.o diff --git a/meta/bins/test-hashchop b/meta/bins/test-hashchop new file mode 100644 index 0000000..3413b64 --- /dev/null +++ b/meta/bins/test-hashchop @@ -0,0 +1,4 @@ +obj/test.o +limb +skalibs +obj/hashchop.o diff --git a/meta/bins/test-rabin b/meta/bins/test-rabin new file mode 100644 index 0000000..630cbbf --- /dev/null +++ b/meta/bins/test-rabin @@ -0,0 +1,4 @@ +obj/test.o +limb +skalibs +obj/rabin.o diff --git a/meta/bins/test-rabin-mask b/meta/bins/test-rabin-mask new file mode 100644 index 0000000..4f8528d --- /dev/null +++ b/meta/bins/test-rabin-mask @@ -0,0 +1,4 @@ +obj/test.o +limb +skalibs +obj/rabin-mask.o diff --git a/meta/bins/test-rabin-mix b/meta/bins/test-rabin-mix new file mode 100644 index 0000000..22984c0 --- /dev/null +++ b/meta/bins/test-rabin-mix @@ -0,0 +1,4 @@ +obj/test.o +limb +skalibs +obj/rabin-mix.o diff --git a/meta/bins/test-rabin-tttd b/meta/bins/test-rabin-tttd new file mode 100644 index 0000000..dd8bd81 --- /dev/null +++ b/meta/bins/test-rabin-tttd @@ -0,0 +1,4 @@ +obj/test.o +limb +skalibs +obj/rabin-tttd.o diff --git a/meta/deps/comain/version b/meta/deps/comain/version index 4e379d2..bcab45a 100644 --- a/meta/deps/comain/version +++ b/meta/deps/comain/version @@ -1 +1 @@ -0.0.2 +0.0.3 diff --git a/meta/deps/limb/configure b/meta/deps/limb/configure new file mode 100644 index 0000000..e69de29 diff --git a/meta/deps/limb/files b/meta/deps/limb/files new file mode 100644 index 0000000..e69de29 diff --git a/meta/deps/limb/git b/meta/deps/limb/git new file mode 100644 index 0000000..109fd23 --- /dev/null +++ b/meta/deps/limb/git @@ -0,0 +1 @@ +git://lila.oss/limb.git diff --git a/meta/deps/limb/static b/meta/deps/limb/static new file mode 100644 index 0000000..e69de29 diff --git a/meta/deps/limb/version b/meta/deps/limb/version new file mode 100644 index 0000000..bcab45a --- /dev/null +++ b/meta/deps/limb/version @@ -0,0 +1 @@ +0.0.3 diff --git a/meta/deps/skalibs/get_version b/meta/deps/skalibs/get_version new file mode 100755 index 0000000..55041f5 --- /dev/null +++ b/meta/deps/skalibs/get_version @@ -0,0 +1,2 @@ +#!/bin/sh +grep SKALIBS_VERSION "$@/skalibs/config.h" | cut -d\" -f2 diff --git a/meta/deps/skalibs/git b/meta/deps/skalibs/git new file mode 100644 index 0000000..5965139 --- /dev/null +++ b/meta/deps/skalibs/git @@ -0,0 +1 @@ +git://lila.oss/skalibs.git diff --git a/meta/deps/skalibs/incdir b/meta/deps/skalibs/incdir new file mode 100644 index 0000000..e4484e2 --- /dev/null +++ b/meta/deps/skalibs/incdir @@ -0,0 +1 @@ +src/include diff --git a/meta/deps/skalibs/include b/meta/deps/skalibs/include new file mode 100644 index 0000000..e69de29 diff --git a/meta/deps/skalibs/libdir b/meta/deps/skalibs/libdir new file mode 100644 index 0000000..5baacc7 --- /dev/null +++ b/meta/deps/skalibs/libdir @@ -0,0 +1 @@ +skalibs diff --git a/meta/deps/skalibs/library b/meta/deps/skalibs/library new file mode 100644 index 0000000..39971a0 --- /dev/null +++ b/meta/deps/skalibs/library @@ -0,0 +1 @@ +skarnet diff --git a/meta/deps/skalibs/version b/meta/deps/skalibs/version new file mode 100644 index 0000000..48c7359 --- /dev/null +++ b/meta/deps/skalibs/version @@ -0,0 +1 @@ +2.13.0.0 diff --git a/meta/desc b/meta/desc new file mode 100644 index 0000000..da3f1b3 --- /dev/null +++ b/meta/desc @@ -0,0 +1,19 @@ +Test/benchmark some chunking algorithms for better data-deduplication + +Test/benchmark some chunking algorithms, when needing to split a data stream +into variable-length chunks aimed for better data-deduplication. + +I originally worked out an implementation based on rabin fingerprint, but I +figured I might look for alternative implementations/algorithms, and so we got: + +- rabin : another pretty identical implementation based on rabin +- rabin-mask : a variant using two masks to find the next split, so when the + main one fails to find a split, we fall back on the other one +- rabin-tttd : another variant using two masks to find the next split +- rabin-mix : this is my original impementation, only using the previous two + variants. Specifically, mask below 128 KiB as target/average size, and tttd + otherwise (because tests seemed to indicate that was the best) +- hashchop : another rolling-hash algorithm, based on rsync's hash +- buz : buzhash implementation originally from BorgBackup +- ae : implementation of Asymmetric Extremum chunking algorithm +- ae-32 : AE variant using 32bit values instead of 64bit ones diff --git a/meta/doc b/meta/doc deleted file mode 100644 index 6b89b2f..0000000 --- a/meta/doc +++ /dev/null @@ -1 +0,0 @@ -https://lila.oss/doc/test-rolling-hash diff --git a/meta/site b/meta/site deleted file mode 100644 index 5bcf183..0000000 --- a/meta/site +++ /dev/null @@ -1 +0,0 @@ -https://lila.oss/test-rolling-hash diff --git a/project.mk b/project.mk index cd9bd93..4104399 100644 --- a/project.mk +++ b/project.mk @@ -1,7 +1,8 @@ -$(error You need to edit project.mk) - -# binaries: -- don't forget to set meta/bins/{bin1,bin2,...} with all deps & .o files -BINS = - -# librairies -- don't forget to set meta/libs{lib1,lib2,...} with all deps & .o files -LIBS = +BINS = test-ae \ + test-ae-32 \ + test-buz \ + test-hashchop \ + test-rabin \ + test-rabin-mask \ + test-rabin-mix \ + test-rabin-tttd diff --git a/src/ae-32.c b/src/ae-32.c new file mode 100644 index 0000000..b600f64 --- /dev/null +++ b/src/ae-32.c @@ -0,0 +1,56 @@ +#include <endian.h> +#include <string.h> /* size_t */ +#include <limb/int.h> + +/* Author: Yucheng Zhang + * For more see paper: AE: An Asymmetric Extremum Content Defined Chunking + * Algorithm for Fast and Bandwidth-Efficient Data Deduplication + */ + +#if __BYTE_ORDER == __BIG_ENDIAN +# define GETLE32(u) (u) +#else +# define GETLE32(u) __builtin_bswap32(u) +#endif + +static inline +int cmp(const u8 *x, const u8 *y) +{ + int ret; + u32 a = GETLE32(* ((u32 *) x)); + u32 b = GETLE32(* ((u32 *) y)); + if (a > b) + ret = 1; + else + ret = -1; + return ret; +} + +size_t +nextsplit(size_t min, size_t avg, const void *data, size_t dlen) +{ + if (dlen <= min) return dlen; + + double e = 2.718281828; + int window_size = avg / (e - 1); + /* cur points to the current position + * max points to the position of max value + * end points to the end of buffer + */ + const u8 *cur = (const u8 *) data + 1; + const u8 *max = (const u8 *) data; + const u8 *end = (const u8 *) data + dlen - sizeof(u32); + + if (dlen <= window_size + sizeof(u32)) + return dlen; + + for ( ; cur <= end; ++cur) { + if (cmp(cur, max) < 0) { + max = cur; + continue; + } + if (cur == max + window_size) + return cur - (const u8 *) data; + } + return dlen; +} diff --git a/src/ae.c b/src/ae.c new file mode 100644 index 0000000..d84f598 --- /dev/null +++ b/src/ae.c @@ -0,0 +1,56 @@ +#include <endian.h> +#include <string.h> /* size_t */ +#include <limb/int.h> + +/* Author: Yucheng Zhang + * For more see paper: AE: An Asymmetric Extremum Content Defined Chunking + * Algorithm for Fast and Bandwidth-Efficient Data Deduplication + */ + +#if __BYTE_ORDER == __BIG_ENDIAN +# define GETLE64(u) (u) +#else +# define GETLE64(u) __builtin_bswap64(u) +#endif + +static inline +int cmp(const u8 *x, const u8 *y) +{ + int ret; + u64 a = GETLE64(* ((u64 *) x)); + u64 b = GETLE64(* ((u64 *) y)); + if (a > b) + ret = 1; + else + ret = -1; + return ret; +} + +size_t +nextsplit(size_t min, size_t avg, const void *data, size_t dlen) +{ + if (dlen <= min) return dlen; + + double e = 2.718281828; + int window_size = avg / (e - 1); + /* cur points to the current position + * max points to the position of max value + * end points to the end of buffer + */ + const u8 *cur = (const u8 *) data + 1; + const u8 *max = (const u8 *) data; + const u8 *end = (const u8 *) data + dlen - sizeof(u64); + + if (dlen <= window_size + sizeof(u64)) + return dlen; + + for ( ; cur <= end; ++cur) { + if (cmp(cur, max) < 0) { + max = cur; + continue; + } + if (cur == max + window_size) + return cur - (const u8 *) data; + } + return dlen; +} diff --git a/src/buz.c b/src/buz.c new file mode 100644 index 0000000..6e68b6d --- /dev/null +++ b/src/buz.c @@ -0,0 +1,102 @@ +/* From BorgBackup: + * Copyright (C) 2015-2022 The Borg Collective (see AUTHORS file) + * Copyright (C) 2010-2014 Jonas Borgström <jonas@borgstrom.se> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <string.h> /* size_t */ +#include <limb/int.h> + +size_t +nextsplit(size_t min, size_t avg, const void *data, size_t dlen) +{ + if (dlen <= min) return dlen; + u32 T[] = + { + 0XE7F831EC, 0XF4026465, 0XAFB50CAE, 0X6D553C7A, 0XD639EFE3, 0X19A7B895, 0X9ABA5B21, 0X5417D6D4, + 0X35FD2B84, 0XD1F6A159, 0X3F8E323F, 0XB419551C, 0XF444CEBF, 0X21DC3B80, 0XDE8D1E36, 0X84A32436, + 0XBEB35A9D, 0XA36F24AA, 0XA4E60186, 0X98D18FFE, 0X3F042F9E, 0XDB228BCD, 0X096474B7, 0X5C20C2F7, + 0XF9EEC872, 0XE8625275, 0XB9D38F80, 0XD48EB716, 0X22A950B4, 0X3CBAAEAA, 0XC37CDDD3, 0X8FEA6F6A, + 0X1D55D526, 0X7FD6D3B3, 0XDAA072EE, 0X4345AC40, 0XA077C642, 0X8F2BD45B, 0X28509110, 0X55557613, + 0XFFC17311, 0XD961FFEF, 0XE532C287, 0XAAB95937, 0X46D38365, 0XB065C703, 0XF2D91D0F, 0X92CD4BB0, + 0X4007C712, 0XF35509DD, 0X505B2F69, 0X557EAD81, 0X310F4563, 0XBDDC5BE8, 0X9760F38C, 0X701E0205, + 0X00157244, 0X14912826, 0XDC4CA32B, 0X67B196DE, 0X5DB292E8, 0X8C1B406B, 0X01F34075, 0XFA2520F7, + 0X73BC37AB, 0X1E18BC30, 0XFE2C6CB3, 0X20C522D0, 0X5639E3DB, 0X942BDA35, 0X899AF9D1, 0XCED44035, + 0X98CC025B, 0X255F5771, 0X70FEFA24, 0XE928FA4D, 0X2C030405, 0XB9325590, 0X20CB63BD, 0XA166305D, + 0X80E52C0A, 0XA8FAFE2F, 0X1AD13F7D, 0XCFAF3685, 0X6C83A199, 0X7D26718A, 0XDE5DFCD9, 0X79CF7355, + 0X8979D7FB, 0XEBF8C55E, 0XEBE408E4, 0XCD2AFFBA, 0XE483BE6E, 0XE239D6DE, 0X5DC1E9E0, 0X0473931F, + 0X851B097C, 0XAC5DB249, 0X09C0F9F2, 0XD8D2F134, 0XE6F38E41, 0XB1C71BF1, 0X52B6E4DB, 0X07224424, + 0X6CF73E85, 0X4F25D89C, 0X782A7D74, 0X10A68DCD, 0X3A868189, 0XD570D2DC, 0X69630745, 0X9542ED86, + 0X331CD6B2, 0XA84B5B28, 0X07879C9D, 0X38372F64, 0X7185DB11, 0X25BA7C83, 0X01061523, 0XE6792F9F, + 0XE5DF07D1, 0X4321B47F, 0X7D2469D8, 0X1A3A4F90, 0X48BE29A3, 0X669071AF, 0X8EC8DD31, 0X0810BFBF, + 0X813A06B4, 0X68538345, 0X65865DDC, 0X43A71B8E, 0X78619A56, 0X5A34451D, 0X5BDAA3ED, 0X71EDC7E9, + 0X17AC9A20, 0X78D10BFA, 0X6C1E7F35, 0XD51839D9, 0X240CBC51, 0X33513CC1, 0XD2B4F795, 0XCCAA8186, + 0X0BABE682, 0XA33CF164, 0X18C643EA, 0XC1CA105F, 0X9959147A, 0X6D3D94DE, 0X0B654FBE, 0XED902CA0, + 0X7D835CB5, 0X99BA1509, 0X6445C922, 0X495E76C2, 0XF07194BC, 0XA1631D7E, 0X677076A5, 0X89FFFE35, + 0X1A49BCF3, 0X8E6C948A, 0X0144C917, 0X8D93AEA1, 0X16F87DDF, 0XC8F25D49, 0X1FB11297, 0X27E750CD, + 0X2F422DA1, 0XDEE89A77, 0X1534C643, 0X457B7B8B, 0XAF172F7A, 0X6B9B09D6, 0X33573F7F, 0XF14E15C4, + 0X526467D5, 0XAF488241, 0X87C3EE0D, 0X33BE490C, 0X95AA6E52, 0X43EC242E, 0XD77DE99B, 0XD018334F, + 0X5B78D407, 0X498EB66B, 0XB1279FA8, 0XB38B0EA6, 0X90718376, 0XE325DEE2, 0X8E2F2CBA, 0XCAA5BDEC, + 0X9D652C56, 0XAD68F5CB, 0XA77591AF, 0X88E37EE8, 0XF8FAA221, 0XFCBBBE47, 0X4F407786, 0XAF393889, + 0XF444A1D9, 0X15AE1A2F, 0X40AA7097, 0X6F9486AC, 0X29D232A3, 0XE47609E9, 0XE8B631FF, 0XBA8565F4, + 0X11288749, 0X46C9A838, 0XEB1B7CD8, 0XF516BBB1, 0XFB74FDA0, 0X010996E6, 0X4C994653, 0X1D889512, + 0X53DCD9A3, 0XDD074697, 0X1E78E17C, 0X637C98BF, 0X930BB219, 0XCF7F75B0, 0XCB9355FB, 0X9E623009, + 0XE466D82C, 0X28F968D3, 0XFEB385D9, 0X238E026C, 0XB8ED0560, 0X0C6A027A, 0X3D6FEC4B, 0XBB4B2EC2, + 0XE715031C, 0XEDED011D, 0XCDC4D3B9, 0XC456FC96, 0XDD0EEA20, 0XB3DF8EC9, 0X12351993, 0XD9CBB01C, + 0X603147A2, 0XCF37D17D, 0XF7FCD9DC, 0XD8556FA3, 0X104C8131, 0X13152774, 0XB4715811, 0X6A72C2C9, + 0XC5AE37BB, 0XA76CE12A, 0X8150D8F3, 0X2EC29218, 0XA35F0984, 0X48C0647E, 0X0B5FF98C, 0X71893F7B + }; + u32 mask = (1 << (msb64(avg) - 1)) - 1; + +#define WINDOW_SIZE 2047 +#define BARREL_SHIFT(v, shift) ( ((v) << (shift)) | ((v) >> ((32 - (shift)) & 0x1F)) ) + + const u8 *d = data; + u32 i, sum, imod; + + if (dlen <= min + WINDOW_SIZE) return dlen; + + sum = 0; + for (i = WINDOW_SIZE - 1; i > 0; --i) { + imod = i & 0x1F; + sum ^= BARREL_SHIFT(T[*d], imod); + ++d; + } + sum ^= T[*d]; + + d = data; + for (i = WINDOW_SIZE; i < dlen - WINDOW_SIZE; ++i) { + sum = BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(T[*d], 0x1F) ^ T[d[WINDOW_SIZE]]; + if ((sum & mask) == 0) break; + ++d; + } + if (i >= dlen - WINDOW_SIZE) + i = dlen; + + return i; +} diff --git a/src/hashchop.c b/src/hashchop.c new file mode 100644 index 0000000..18b8184 --- /dev/null +++ b/src/hashchop.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2011-2012 Scott Vokes <vokes.s@gmail.com> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <string.h> /* size_t */ +#include <limb/int.h> + +size_t +nextsplit(size_t min, size_t avg, const void *data, size_t dlen) +{ + if (dlen <= min) return dlen; + u32 mask = (1 << (msb64(avg) - 1)) - 1; + const u8 *buf = data; + + u32 a = 1, b = 0; + size_t offset; + + for (u32 i = 0; i < min; ++i) { + a += buf[i]; + b += (min - i + 1) * buf[i]; + } + a &= mask; + b &= mask; + + for (offset = min; offset < dlen; ++offset) { + u32 k = offset - min; + u32 l = offset; + u8 nk = buf[k]; + u8 nl = buf[l]; + u32 na = (a - nk + nl); + u32 nb = (b - (l - k + 1) * nk + na); + u32 checksum = (na + (nb << 16)) & mask; + if (checksum == 0) break; + a = na; + b = nb; + } + return offset; +} diff --git a/src/rabin-mask.c b/src/rabin-mask.c new file mode 100644 index 0000000..2e3c9e0 --- /dev/null +++ b/src/rabin-mask.c @@ -0,0 +1,42 @@ +#include <string.h> /* size_t */ +#include <limb/int.h> +#include "rabin-tables.h" + +#define BREAKMARK_VALUE 0x78 + +size_t +nextsplit(size_t min, size_t avg, const void *data, size_t dlen) +{ + if (dlen <= min) return dlen; + u64 sml_mask = (1 << (msb64(avg * 2) - 1)) - 1; + u64 lrg_mask = (1 << (msb64(avg / 2) - 1)) - 1; + u64 fp = 0; + int i = min, wpos = -1; + + const u8 *sce = data; + u8 window[RABIN_WINDOW_SIZE] = { 0 }; + + while (i < dlen) { + u8 om; + u64 x; + if (++wpos >= sizeof(window)) + wpos = 0; + om = window[wpos]; + window[wpos] = sce[i - 1]; + fp ^= rabin_tables.U[om]; + x = fp >> 55; + fp <<= 8; + fp |= sce[i - 1]; + fp ^= rabin_tables.T[x]; + + if (i >= min) { + if (i < avg) { + if ((fp & sml_mask) == BREAKMARK_VALUE) + break; + } else if ((fp & lrg_mask) == BREAKMARK_VALUE) + break; + } + ++i; + } + return i; +} diff --git a/src/rabin-mix.c b/src/rabin-mix.c new file mode 100644 index 0000000..dcd3f2b --- /dev/null +++ b/src/rabin-mix.c @@ -0,0 +1,49 @@ +#include <string.h> /* size_t */ +#include <limb/int.h> +#include "rabin-tables.h" + +size_t +nextsplit(size_t min, size_t avg, const void *data, size_t dlen) +{ + if (dlen <= min) return dlen; + u8 window[RABIN_WINDOW_SIZE] = { 0 }; + const u8 *b = data; + size_t wpos = 0, pos = min, last = 0; + u64 fp = 0; + u64 mask, lrg_mask; + int tttd = avg >= (128 << 10); + if (tttd) + mask = (1 << (msb64(avg ) - 1)) - 1; + else + mask = (1 << (msb64(avg * 2) - 1)) - 1; + lrg_mask = (1 << (msb64(avg / 2) - 1)) - 1; + +#define TTTD_BREAK 0x78 +#define APPEND8(p,m) (((p) << 8) | m) ^ rabin_tables.T[(p) >> 55] + for (;;) { + u8 old = window[wpos]; + window[wpos] = b[pos]; + fp = APPEND8(fp ^ rabin_tables.U[old], window[wpos]); + if (++wpos == sizeof(window)) wpos = 0; + + if (pos == dlen) { + if (last) return last; + return pos; + } + if (pos >= min) { + if (tttd) { + if ((fp & lrg_mask) == TTTD_BREAK) { + if ((fp & mask) == TTTD_BREAK) + return pos; + last = pos; + } + } else if (pos < avg) { + if ((fp & mask) == mask) + return pos; + } else if ((fp & lrg_mask) == lrg_mask) { + return pos; + } + } + ++pos; + } +} diff --git a/src/rabin-tttd.c b/src/rabin-tttd.c new file mode 100644 index 0000000..6c1c9c5 --- /dev/null +++ b/src/rabin-tttd.c @@ -0,0 +1,45 @@ +#include <string.h> /* size_t */ +#include <limb/int.h> +#include "rabin-tables.h" + +#define BREAKMARK_VALUE 0x78 + +size_t +nextsplit(size_t min, size_t avg, const void *data, size_t dlen) +{ + if (dlen <= min) return dlen; + u64 mask = (1 << (msb64(avg ) - 1)) - 1; + u64 lrg_mask = (1 << (msb64(avg / 2) - 1)) - 1; + u64 fp = 0; + int i = min, wpos = -1; + int m = 0; + + const u8 *sce = data; + u8 window[RABIN_WINDOW_SIZE] = { 0 }; + + while (i < dlen) { + u8 om; + u64 x; + if (++wpos >= sizeof(window)) + wpos = 0; + om = window[wpos]; + window[wpos] = sce[i - 1]; + fp ^= rabin_tables.U[om]; + x = fp >> 55; + fp <<= 8; + fp |= sce[i - 1]; + fp ^= rabin_tables.T[x]; + + if (i >= min) { + if ((fp & lrg_mask) == BREAKMARK_VALUE) { + if ((fp & mask) == BREAKMARK_VALUE) + return i; + m = i; + } + } + ++i; + } + if (m) + return m; + return i; +} diff --git a/src/rabin.c b/src/rabin.c new file mode 100644 index 0000000..0140643 --- /dev/null +++ b/src/rabin.c @@ -0,0 +1,36 @@ +#include <string.h> /* size_t */ +#include <limb/int.h> +#include "rabin-tables.h" + +#define BREAKMARK_VALUE 0x78 + +size_t +nextsplit(size_t min, size_t avg, const void *data, size_t dlen) +{ + if (dlen <= min) return dlen; + u64 mask = (1 << (msb64(avg) - 1)) - 1; + u64 fp = 0; + int i = min, wpos = -1; + + const u8 *sce = data; + u8 window[RABIN_WINDOW_SIZE] = { 0 }; + + while (i < dlen) { + u8 om; + u64 x; + if (++wpos >= sizeof(window)) + wpos = 0; + om = window[wpos]; + window[wpos] = sce[i - 1]; + fp ^= rabin_tables.U[om]; + x = fp >> 55; + fp <<= 8; + fp |= sce[i - 1]; + fp ^= rabin_tables.T[x]; + + if (i >= min && (fp & mask) == BREAKMARK_VALUE) + break; + ++i; + } + return i; +} diff --git a/src/test.c b/src/test.c new file mode 100644 index 0000000..e23b2c9 --- /dev/null +++ b/src/test.c @@ -0,0 +1,244 @@ +#include <dirent.h> +#include <errno.h> +#include <time.h> +#include <skalibs/stralloc.h> +#include <skalibs/unix-transactional.h> +#include <skalibs/djbunix.h> +#include <skalibs/uint32.h> +#include <skalibs/uint16.h> +#include <skalibs/fmtscan.h> +#include <limb/int.h> +#include <limb/output.h> +#include <limb/blake3.h> + +const char *PROG = "test"; + +extern size_t nextsplit(size_t min, size_t avg, const void *data, size_t dlen); + +struct bench { + size_t min; + size_t avg; + size_t max; + int iter; + stralloc *sa; + u8 *blocks; + size_t blkpos; + size_t pos; + size_t total; + int hashes; +}; + +static void +bench(struct bench *b) +{ + size_t pos, curblkpos = b->blkpos; + for (int i = 0; i < b->iter; ++i) { + curblkpos = b->blkpos; + pos = b->pos; + while (pos < b->sa->len) { + size_t len = b->sa->len - pos; + size_t offset = nextsplit(b->min, b->avg, + b->sa->s + pos, (len > b->max) ? b->max : len); + curblkpos += u64_pack_trim((u64) offset, b->blocks + curblkpos); + /* special case: when scanning directories, we don't keep the + * entire dataset in memory, so we need to compute/print hashes + * right now within the benchmarking. */ + if (i == 0 && b->hashes) { + unsigned char buf[32]; + blake3(b->sa->s + pos, offset, buf); + char hash[32 * 2 + 1] = { 0 }; + for (int i = 0; i < sizeof(buf); ++i) { + if (buf[i] & 0xf0) + hash[i * 2] = fmtscan_asc(buf[i] >> 4); + else + hash[i * 2] = '0'; + hash[i * 2 + 1] = fmtscan_asc(buf[i] & 0x0f); + } + char size[UINT32_FMT]; + size[uint32_fmt(size, offset)] = 0; + out(hash, " ", size); + } + pos += offset; + } + } + b->blkpos = curblkpos; + b->total += b->sa->len - b->pos; +} + +static int +processdir(int basefd, const char *name, struct bench *b) +{ + int ret = -1; + int fd; + DIR *dir; + struct dirent *de; + + dir = NULL; + fd = open_readat(basefd, name); + if (fd < 0) goto err; + + dir = fdopendir(fd); + if (!dir) goto err; + fd = -1; + + for (;;) { + errno = 0; + de = readdir(dir); + if (!de) { + if (errno) goto err; + break; + } + if (de->d_name[0] == '.' && + (!de->d_name[1] || (de->d_name[1] == '.' && !de->d_name[2]))) + continue; + + size_t salen = b->sa->len; + fd = open_readat(dirfd(dir), de->d_name); + if (fd < 0 || !slurp(b->sa, fd)) { + if (errno == EISDIR) { + fd_close(fd); + if (processdir(dirfd(dir), de->d_name, b) < 0) + goto err; + continue; + } + if (fd >= 0) fd_close(fd); + warnusys("process ...", name, "/", de->d_name); + continue; + } + fd_close(fd); + fd = -1; + + bench(b); + b->sa->len = salen; + } + + ret = 0; +err: + if (fd >= 0) fd_close(fd); + if (dir) closedir(dir); + return ret; +} + +#include <stdio.h> +int +main(int argc, const char *argv[]) +{ + struct timespec ts1, ts2; + stralloc sa = STRALLOC_ZERO; + u8 blocks[800 << 10]; + struct bench b = { + .min = ( 4 << 10), + .avg = ( 8 << 10), + .max = ( 1 << 20), + .iter = 1, + .sa = &sa, + .blocks = blocks, + }; + int list = 0; + + while (argc >= 2) { + if (!strncmp(argv[1], "-a", 2) || !strncmp(argv[1], "-m", 2) + || !strncmp(argv[1], "-M", 2)) { + const char *s = NULL; + if (!argv[1][2]) + s = argv[2]; + else if (argc < 3) + dief(1, "missing value for ", argv[1]); + else + s = argv[1] + 2; + + u32 u; + if (!uint32_scan(s, &u)) + dief(1, "invalid value for ", argv[1]); + switch (argv[1][1]) { + case 'a': b.avg = u; break; + case 'm': b.min = u; break; + case 'M': b.max = u; break; + } + + int e = (s == argv[2]) ? 1 : 0; + argc -= 1 + e; + memmove(&argv[1], &argv[2 + e], argc * sizeof(*argv)); + } else if (!strcmp(argv[1], "-l")) { + list = 1; + } else if (!strcmp(argv[1], "-H")) { + list = 2; + } else { + break; + } + + if (list) + memmove(&argv[1], &argv[2], --argc * sizeof(*argv)); + } + + if (argc != 2 && argc != 3) + dieusage(1, "[-a AVGSIZE] [-m MINSIZE] [-M MAXSIZE] [-l | -H] FILE [ITER]"); + + if (argc == 3) { + u16 u; + if (!uint16_scan(argv[2], &u)) + dief(1, "invalid ITER argument"); + b.iter = u; + } + + int fd = open_read(argv[1]); + if (fd < 0 || !slurp(&sa, fd)) { + if (errno != EISDIR) + diefusys(2, "read ", argv[1]); + fd_close(fd); + fd = -1; + if (list == 2) { + b.hashes = 1; + list = 0; + } + } + + clock_gettime(CLOCK_MONOTONIC, &ts1); + if (fd < 0) + processdir(AT_FDCWD, argv[1], &b); + else + bench(&b); + clock_gettime(CLOCK_MONOTONIC, &ts2); + blocks[b.blkpos] = 0; + + if (fd >= 0) fd_close(fd); + + size_t min = b.total; + size_t max = 0; + size_t total = 0; + int n = 1; + size_t pos = 0; + for (int o = 0; blocks[o]; ++n) { + u64 u; + o += u64_unpack_trim(blocks + o, &u); + if (list) { + if (list == 2) { + unsigned char buf[32]; + blake3(sa.s + pos, u, buf); + for (int i = 0; i < sizeof(buf); ++i) + fprintf(stdout, "%.02x", buf[i]); + pos += u; + } else if (list == 1) { + fprintf(stdout, "block:"); + } + fprintf(stdout, " %lu\n", u); + } + if (u < min && blocks[o]) min = u; + if (u > max) max = u; + total += u; + } + --n; + if (total != b.total) { + fprintf(stderr, "incorrect total size %lu != %lu\n", total, b.total); + return 2; + } + fprintf(stderr, "%u blocks; min=%lu, avg=%lu, max=%lu\n", n, min, total / n, max); + + ts2.tv_sec -= ts1.tv_sec; + ts2.tv_nsec -= ts1.tv_nsec; + double took = ts2.tv_sec + (ts2.tv_nsec / 1000000000.0); + double speed = ((b.total * b.iter) / took) / (1 << 20); + fprintf(stderr, "took %.09f seconds, %f MiB/s\n", took, speed); + + return 0; +} diff --git a/tests b/tests new file mode 100755 index 0000000..732adc1 --- /dev/null +++ b/tests @@ -0,0 +1,10 @@ +#!/bin/sh + +for file in test-*; do + if test $# -eq 0; then + ./"$file" + exit 1 + fi + echo "$file:" + ./"$file" "$@" +done diff --git a/tests-avgs b/tests-avgs new file mode 100755 index 0000000..cbcceb9 --- /dev/null +++ b/tests-avgs @@ -0,0 +1,25 @@ +#!/bin/sh + +run_dedup() { + for file in test-*; do + algo=$(expr substr "$file" 6 ${#file}) + ./tests-dedup $algo "$@" >>dedup.log 2>&1 + done +} + +printf '\n%s\n\n' "8 KiB" >> dedup.log +run_dedup -a8192 "$@" +printf '\n%s\n\n' "16 KiB" >> dedup.log +run_dedup -a16384 "$@" +printf '\n%s\n\n' "32 KiB" >> dedup.log +run_dedup -a32768 "$@" +printf '\n%s\n\n' "64 KiB" >> dedup.log +run_dedup -a65536 "$@" +printf '\n%s\n\n' "128 KiB" >> dedup.log +run_dedup -a131072 "$@" +printf '\n%s\n\n' "256 KiB" >> dedup.log +run_dedup -a262144 "$@" +printf '\n%s\n\n' "512 KiB" >> dedup.log +run_dedup -a524288 "$@" +printf '\n%s\n\n' "1024 KiB" >> dedup.log +run_dedup -a1048576 -M2097152 "$@" diff --git a/tests-dedup b/tests-dedup new file mode 100755 index 0000000..39db122 --- /dev/null +++ b/tests-dedup @@ -0,0 +1,56 @@ +#!/bin/sh + +dieusage() { + echo usage: $0 ALGO [-a AVGSIZE] [-m MINSIZE] [-M MAXSIZE] FILE [FILE2] + exit 1 +} + +if test $# -le 1; then dieusage; fi + +algo="$1" +shift + +if test ! -e ./test-"$algo"; then + echo $0: fatal: test-$algo not found >&2 + exit 1 +fi + +calcdedup() { + minus=0 + if test "$1" = "-1"; then minus=1; fi + + dedup=0 + while read -r count hash size <&0; do + count=$(($count - $minus)) + size=$(($count * $size)) + dedup=$(($dedup + $size)) + done + echo dedup: $dedup +} + +args= +for arg; do + case "$arg" in + -a|-m|-M) + args="$args $arg $2" + shift 2 + ;; + -a*|-m*|-M*) + args="$args $arg" + shift + ;; + -*) + dieusage + ;; + esac +done + +echo "* $algo:" +echo $1 +./test-$algo $args -H "$1" | sort > $algo.1 +uniq -cd $algo.1 | calcdedup -1 +if test -n "$2"; then + echo $2 + ./test-$algo $args -H "$2" | sort | comm -12 $algo.1 - | uniq -c | calcdedup +fi +rm $algo.1