diff options
Diffstat (limited to 'src/cmd')
68 files changed, 14443 insertions, 2 deletions
diff --git a/src/cmd/venti/copy.c b/src/cmd/venti/copy.c new file mode 100644 index 00000000..89fbbac9 --- /dev/null +++ b/src/cmd/venti/copy.c @@ -0,0 +1,170 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <libsec.h> +#include <thread.h> + +int changes; +int rewrite; +int ignoreerrors; +int fast; +int verbose; +VtConn *zsrc, *zdst; + +void +usage(void) +{ + fprint(2, "usage: copy [-fir] [-t type] srchost dsthost score\n"); + threadexitsall("usage"); +} + +void +walk(uchar score[VtScoreSize], uint type, int base) +{ + int i, n; + uchar *buf; + VtEntry e; + VtRoot root; + + if(memcmp(score, vtzeroscore, VtScoreSize) == 0) + return; + + buf = vtmallocz(VtMaxLumpSize); + if(fast && vtread(zdst, score, type, buf, VtMaxLumpSize) >= 0){ + if(verbose) + fprint(2, "skip %V\n", score); + free(buf); + return; + } + + n = vtread(zsrc, score, type, buf, VtMaxLumpSize); + if(n < 0){ + if(rewrite){ + changes++; + memmove(score, vtzeroscore, VtScoreSize); + }else if(!ignoreerrors) + sysfatal("reading block %V (type %d): %r", type, score); + return; + } + + switch(type){ + case VtRootType: + if(vtrootunpack(&root, buf) < 0){ + fprint(2, "warning: could not unpack root in %V %d\n", score, type); + break; + } + walk(root.score, VtDirType, 0); + walk(root.prev, VtRootType, 0); + vtrootpack(&root, buf); /* walk might have changed score */ + break; + + case VtDirType: + for(i=0; i<n/VtEntrySize; i++){ + if(vtentryunpack(&e, buf, i) < 0){ + fprint(2, "warning: could not unpack entry #%d in %V %d\n", i, score, type); + continue; + } + if(!(e.flags & VtEntryActive)) + continue; + walk(e.score, e.type, e.type&VtTypeBaseMask); + vtentrypack(&e, buf, i); + } + break; + + case VtDataType: + break; + + default: /* pointers */ + for(i=0; i<n; i+=VtScoreSize) + if(memcmp(buf+i, vtzeroscore, VtScoreSize) != 0) + walk(buf+i, type-1, base); + break; + } + + if(vtwrite(zdst, score, type, buf, n) < 0){ + /* figure out score for better error message */ + /* can't use input argument 
- might have changed contents */ + n = vtzerotruncate(type, buf, n); + sha1(buf, n, score, nil); + sysfatal("writing block %V (type %d): %r", score, type); + } + free(buf); +} + +void +threadmain(int argc, char *argv[]) +{ + int type, n; + uchar score[VtScoreSize]; + uchar *buf; + char *prefix; + + fmtinstall('F', vtfcallfmt); + fmtinstall('V', vtscorefmt); + + type = -1; + ARGBEGIN{ + case 'f': + fast = 1; + break; + case 'i': + if(rewrite) + usage(); + ignoreerrors = 1; + break; + case 'r': + if(ignoreerrors) + usage(); + rewrite = 1; + break; + case 't': + type = atoi(EARGF(usage())); + break; + default: + usage(); + break; + }ARGEND + + if(argc != 3) + usage(); + + if(vtparsescore(argv[2], &prefix, score) < 0) + sysfatal("could not parse score: %r"); + + buf = vtmallocz(VtMaxLumpSize); + + zsrc = vtdial(argv[0]); + if(zsrc == nil) + sysfatal("could not dial src server: %r"); + if(vtconnect(zsrc) < 0) + sysfatal("vtconnect src: %r"); + + zdst = vtdial(argv[1]); + if(zdst == nil) + sysfatal("could not dial dst server: %r"); + if(vtconnect(zdst) < 0) + sysfatal("vtconnect dst: %r"); + + if(type != -1){ + n = vtread(zsrc, score, type, buf, VtMaxLumpSize); + if(n < 0) + sysfatal("could not read block: %r"); + }else{ + for(type=0; type<VtMaxType; type++){ + n = vtread(zsrc, score, type, buf, VtMaxLumpSize); + if(n >= 0) + break; + } + if(type == VtMaxType) + sysfatal("could not find block %V of any type", score); + } + + walk(score, type, VtDirType); + if(changes) + print("%s:%V (%d pointers rewritten)\n", prefix, score, changes); + + if(vtsync(zdst) < 0) + sysfatal("could not sync dst server: %r"); + + threadexitsall(0); +} diff --git a/src/cmd/venti/devnull.c b/src/cmd/venti/devnull.c new file mode 100644 index 00000000..29cf9ecf --- /dev/null +++ b/src/cmd/venti/devnull.c @@ -0,0 +1,80 @@ +/* Copyright (c) 2004 Russ Cox */ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <thread.h> +#include <libsec.h> + +#ifndef _UNISTD_H_ +#pragma varargck type 
"F" VtFcall* +#pragma varargck type "T" void +#endif + +int verbose; + +enum +{ + STACK = 8192, +}; + +void +usage(void) +{ + fprint(2, "usage: venti/devnull [-v] [-a address]\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char **argv) +{ + VtReq *r; + VtSrv *srv; + char *address; + Packet *p; + + fmtinstall('V', vtscorefmt); + fmtinstall('F', vtfcallfmt); + + address = "tcp!*!venti"; + + ARGBEGIN{ + case 'v': + verbose++; + break; + case 'a': + address = EARGF(usage()); + break; + default: + usage(); + }ARGEND + + srv = vtlisten(address); + if(srv == nil) + sysfatal("vtlisten %s: %r", argv[1]); + + while((r = vtgetreq(srv)) != nil){ + r->rx.msgtype = r->tx.msgtype+1; + if(verbose) + fprint(2, "<- %F\n", &r->tx); + switch(r->tx.msgtype){ + case VtTping: + break; + case VtTgoodbye: + break; + case VtTread: + r->rx.error = vtstrdup("no such block"); + r->rx.msgtype = VtRerror; + break; + case VtTwrite: + packetsha1(r->tx.data, r->rx.score); + break; + case VtTsync: + break; + } + if(verbose) + fprint(2, "-> %F\n", &r->rx); + vtrespond(r); + } + threadexitsall(nil); +} + diff --git a/src/cmd/venti/mkfile b/src/cmd/venti/mkfile index 1a1b652d..bba8d2eb 100644 --- a/src/cmd/venti/mkfile +++ b/src/cmd/venti/mkfile @@ -1,3 +1,13 @@ -%:VQ: - echo venti will return once it is debugged. 
+<$PLAN9/src/mkhdr + +DIRS=srv + +TARG=\ + copy\ + read\ + sync\ + write\ + +<$PLAN9/src/mkmany +<$PLAN9/src/mkdirs diff --git a/src/cmd/venti/mkroot.c b/src/cmd/venti/mkroot.c new file mode 100644 index 00000000..f18cbf35 --- /dev/null +++ b/src/cmd/venti/mkroot.c @@ -0,0 +1,59 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +char *host; + +void +usage(void) +{ + fprint(2, "usage: mkroot [-h host] name type score blocksize prev\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + uchar score[VtScoreSize]; + uchar buf[VtRootSize]; + VtConn *z; + VtRoot root; + + ARGBEGIN{ + case 'h': + host = EARGF(usage()); + break; + default: + usage(); + break; + }ARGEND + + if(argc != 5) + usage(); + + ventifmtinstall(); + + strecpy(root.name, root.name+sizeof root.name, argv[0]); + strecpy(root.type, root.type+sizeof root.type, argv[1]); + if(vtparsescore(argv[2], strlen(argv[2]), nil, root.score) < 0) + sysfatal("bad score '%s'", argv[2]); + root.blocksize = atoi(argv[3]); + if(vtparsescore(argv[4], strlen(argv[4]), nil, root.prev) < 0) + sysfatal("bad score '%s'", argv[4]); + vtrootpack(&root, buf); + + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + if(vtwrite(z, score, VtRootType, buf, VtRootSize) < 0) + sysfatal("vtwrite: %r"); + if(vtsync(z) < 0) + sysfatal("vtsync: %r"); + vthangup(z); + print("%V\n", score); + threadexitsall(0); +} diff --git a/src/cmd/venti/randtest.c b/src/cmd/venti/randtest.c new file mode 100644 index 00000000..b7a09ef8 --- /dev/null +++ b/src/cmd/venti/randtest.c @@ -0,0 +1,334 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <libsec.h> +#include <thread.h> + + +enum { STACK = 32768 }; +void xxxsrand(long); +long xxxlrand(void); + +Channel *cw; +Channel *cr; +char *host; +int blocksize, seed, randpct; +int doread, dowrite, packets, permute; +vlong totalbytes, cur; +VtConn *z; +int multi; +int 
maxpackets; +int sequence; +int doublecheck = 1; +uint *order; + +void +usage(void) +{ + fprint(2, "usage: randtest [-q] [-h host] [-s seed] [-b blocksize] [-p randpct] [-n totalbytes] [-M maxblocks] [-P] [-r] [-w]\n"); + threadexitsall("usage"); +} + +void +wr(char *buf, char *buf2) +{ + uchar score[VtScoreSize], score2[VtScoreSize]; + DigestState ds; + + memset(&ds, 0, sizeof ds); + if(doublecheck) + sha1((uchar*)buf, blocksize, score, &ds); + if(vtwrite(z, score2, VtDataType, (uchar*)buf, blocksize) < 0) + sysfatal("vtwrite %V at %,lld: %r", score, cur); + if(doublecheck && memcmp(score, score2, VtScoreSize) != 0) + sysfatal("score mismatch! %V %V", score, score2); +} + +void +wrthread(void *v) +{ + char *p; + + USED(v); + while((p = recvp(cw)) != nil){ + wr(p, nil); + free(p); + } +} + +void +rd(char *buf, char *buf2) +{ + uchar score[VtScoreSize]; + DigestState ds; + + memset(&ds, 0, sizeof ds); + sha1((uchar*)buf, blocksize, score, &ds); + if(vtread(z, score, VtDataType, (uchar*)buf2, blocksize) < 0) + sysfatal("vtread %V at %,lld: %r", score, cur); + if(memcmp(buf, buf2, blocksize) != 0) + sysfatal("bad data read! 
#define TWID64	((u64int)~(u64int)0)

/*
 * Parse a number with an optional binary size suffix.
 *
 * The number is read by strtoul with base 0, so decimal, 0x hex and
 * leading-0 octal are accepted.  It may be followed by exactly one of
 * k, m, g or t (either case), scaling by 2^10, 2^20, 2^30 or 2^40.
 *
 * Returns the scaled value, or TWID64 (all ones) if s is nil or any
 * other text follows the number and suffix.
 */
u64int
unittoull(char *s)
{
	char *es;
	u64int n;

	if(s == nil)
		return TWID64;
	n = strtoul(s, &es, 0);
	if(*es == 'k' || *es == 'K'){
		n *= 1024;
		es++;
	}else if(*es == 'm' || *es == 'M'){
		n *= 1024*1024;
		es++;
	}else if(*es == 'g' || *es == 'G'){
		n *= 1024*1024*1024;
		es++;
	}else if(*es == 't' || *es == 'T'){
		/* two steps: 1024*1024*1024*1024 would overflow int arithmetic */
		n *= 1024*1024;
		n *= 1024*1024;
		es++;	/* consume the suffix like the other units do */
	}
	if(*es != '\0')
		return TWID64;
	return n;
}
permute = 1; + break; + case 'S': + doublecheck = 0; + ventidoublechecksha1 = 0; + break; + case 's': + seed = atoi(EARGF(usage())); + break; + case 'r': + doread = 1; + break; + case 'w': + dowrite = 1; + break; + case 'V': + chattyventi++; + break; + default: + usage(); + }ARGEND + + if(doread==0 && dowrite==0){ + doread = 1; + dowrite = 1; + } + + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + if(multi){ + cr = chancreate(sizeof(void*), 0); + cw = chancreate(sizeof(void*), 0); + for(i=0; i<multi; i++){ + proccreate(wrthread, nil, STACK); + proccreate(rdthread, nil, STACK); + } + } + + template = vtmalloc(blocksize); + xxxsrand(seed); + max = (256*randpct)/100; + if(max == 0) + max = 1; + for(i=0; i<blocksize; i++) + template[i] = xxxlrand()%max; + if(dowrite){ + t0 = nsec(); + run(wr, cw); + for(i=0; i<multi; i++) + sendp(cw, nil); + t = (nsec() - t0)/1.e9; + print("write: %lld bytes / %.3f seconds = %.6f MB/s\n", + totalbytes, t, (double)totalbytes/1e6/t); + } + if(doread){ + t0 = nsec(); + run(rd, cr); + for(i=0; i<multi; i++) + sendp(cr, nil); + t = (nsec() - t0)/1.e9; + print("read: %lld bytes / %.3f seconds = %.6f MB/s\n", + totalbytes, t, (double)totalbytes/1e6/t); + } + threadexitsall(nil); +} + + +/* + * algorithm by + * D. P. Mitchell & J. A. 
Reeds + */ + +#define LEN 607 +#define TAP 273 +#define MASK 0x7fffffffL +#define A 48271 +#define M 2147483647 +#define Q 44488 +#define R 3399 +#define NORM (1.0/(1.0+MASK)) + +static ulong rng_vec[LEN]; +static ulong* rng_tap = rng_vec; +static ulong* rng_feed = 0; + +static void +isrand(long seed) +{ + long lo, hi, x; + int i; + + rng_tap = rng_vec; + rng_feed = rng_vec+LEN-TAP; + seed = seed%M; + if(seed < 0) + seed += M; + if(seed == 0) + seed = 89482311; + x = seed; + /* + * Initialize by x[n+1] = 48271 * x[n] mod (2**31 - 1) + */ + for(i = -20; i < LEN; i++) { + hi = x / Q; + lo = x % Q; + x = A*lo - R*hi; + if(x < 0) + x += M; + if(i >= 0) + rng_vec[i] = x; + } +} + +void +xxxsrand(long seed) +{ + isrand(seed); +} + +long +xxxlrand(void) +{ + ulong x; + + rng_tap--; + if(rng_tap < rng_vec) { + if(rng_feed == 0) { + isrand(1); + rng_tap--; + } + rng_tap += LEN; + } + rng_feed--; + if(rng_feed < rng_vec) + rng_feed += LEN; + x = (*rng_feed + *rng_tap) & MASK; + *rng_feed = x; + + return x; +} + diff --git a/src/cmd/venti/read.c b/src/cmd/venti/read.c new file mode 100644 index 00000000..3f3441e7 --- /dev/null +++ b/src/cmd/venti/read.c @@ -0,0 +1,75 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <libsec.h> +#include <thread.h> + +void +usage(void) +{ + fprint(2, "usage: read [-h host] [-t type] score\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + int type, n; + uchar score[VtScoreSize]; + uchar *buf; + VtConn *z; + char *host; + + fmtinstall('F', vtfcallfmt); + fmtinstall('V', vtscorefmt); + + host = nil; + type = -1; + ARGBEGIN{ + case 'h': + host = EARGF(usage()); + break; + case 't': + type = atoi(argv[1]); + break; + default: + usage(); + break; + }ARGEND + + if(argc != 1) + usage(); + + if(vtparsescore(argv[0], nil, score) < 0) + sysfatal("could not parse score '%s': %r", argv[0]); + + buf = vtmallocz(VtMaxLumpSize); + + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: 
%r"); + + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + if(type == -1){ + n = -1; + for(type=0; type<VtMaxType; type++){ + n = vtread(z, score, type, buf, VtMaxLumpSize); + if(n >= 0){ + fprint(2, "venti/read%s%s %V %d\n", host ? " -h" : "", host ? host : "", + score, type); + break; + } + } + }else{ + type = atoi(argv[1]); + n = vtread(z, score, type, buf, VtMaxLumpSize); + } + vthangup(z); + if(n < 0) + sysfatal("could not read block: %r"); + if(write(1, buf, n) != n) + sysfatal("write: %r"); + threadexitsall(0); +} diff --git a/src/cmd/venti/readlist.c b/src/cmd/venti/readlist.c new file mode 100644 index 00000000..bb1d9b6b --- /dev/null +++ b/src/cmd/venti/readlist.c @@ -0,0 +1,112 @@ +#include <u.h> +#include <libc.h> +#include <thread.h> +#include <venti.h> +#include <bio.h> + +char *host; +Biobuf b; +VtConn *z; +uchar *buf; +void run(Biobuf*); +int nn; + +void +usage(void) +{ + fprint(2, "usage: readlist [-h host] list\n"); + threadexitsall("usage"); +} + +int +parsescore(uchar *score, char *buf, int n) +{ + int i, c; + + memset(score, 0, VtScoreSize); + + if(n != VtScoreSize*2){ + werrstr("score wrong length %d", n); + return -1; + } + for(i=0; i<VtScoreSize*2; i++) { + if(buf[i] >= '0' && buf[i] <= '9') + c = buf[i] - '0'; + else if(buf[i] >= 'a' && buf[i] <= 'f') + c = buf[i] - 'a' + 10; + else if(buf[i] >= 'A' && buf[i] <= 'F') + c = buf[i] - 'A' + 10; + else { + c = buf[i]; + werrstr("bad score char %d '%c'", c, c); + return -1; + } + + if((i & 1) == 0) + c <<= 4; + + score[i>>1] |= c; + } + return 0; +} + +void +threadmain(int argc, char *argv[]) +{ + int fd, i; + + ARGBEGIN{ + case 'h': + host = EARGF(usage()); + break; + default: + usage(); + break; + }ARGEND + + fmtinstall('V', vtscorefmt); + buf = vtmallocz(VtMaxLumpSize); + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + if(argc == 0){ + Binit(&b, 0, OREAD); + run(&b); + }else{ + for(i=0; i<argc; i++){ 
+ if((fd = open(argv[i], OREAD)) < 0) + sysfatal("open %s: %r", argv[i]); + Binit(&b, fd, OREAD); + run(&b); + } + } + threadexitsall(nil); +} + +void +run(Biobuf *b) +{ + char *p, *f[10]; + int nf; + uchar score[20]; + int type, n; + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + nf = tokenize(p, f, nelem(f)); + if(nf != 2) + sysfatal("syntax error in work list"); + if(parsescore(score, f[0], strlen(f[0])) < 0) + sysfatal("bad score %s in work list", f[0]); + type = atoi(f[1]); + n = vtread(z, score, type, buf, VtMaxLumpSize); + if(n < 0) + sysfatal("could not read %s %s: %r", f[0], f[1]); + // write(1, buf, n); + if(++nn%1000 == 0) + print("%d...", nn); + } +} diff --git a/src/cmd/venti/ro.c b/src/cmd/venti/ro.c new file mode 100644 index 00000000..541dae4e --- /dev/null +++ b/src/cmd/venti/ro.c @@ -0,0 +1,112 @@ +/* Copyright (c) 2004 Russ Cox */ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <thread.h> +#include <libsec.h> + +#ifndef _UNISTD_H_ +#pragma varargck type "F" VtFcall* +#pragma varargck type "T" void +#endif + +VtConn *z; +int verbose; + +enum +{ + STACK = 8192, +}; + +void +usage(void) +{ + fprint(2, "usage: venti/ro [-v] [-a address] [-h address]\n"); + threadexitsall("usage"); +} + +void +readthread(void *v) +{ + char err[ERRMAX]; + VtReq *r; + uchar *buf; + int n; + + r = v; + buf = vtmalloc(r->tx.count); + if((n=vtread(z, r->tx.score, r->tx.blocktype, buf, r->tx.count)) < 0){ + r->rx.msgtype = VtRerror; + rerrstr(err, sizeof err); + r->rx.error = vtstrdup(err); + free(buf); + }else{ + r->rx.data = packetforeign(buf, n, free, buf); + } + if(verbose) + fprint(2, "-> %F\n", &r->rx); + vtrespond(r); +} + +void +threadmain(int argc, char **argv) +{ + VtReq *r; + VtSrv *srv; + char *address, *ventiaddress; + + fmtinstall('F', vtfcallfmt); + fmtinstall('V', vtscorefmt); + + address = "tcp!*!venti"; + ventiaddress = nil; + + ARGBEGIN{ + case 'v': + verbose++; + break; + case 'a': + address = EARGF(usage()); + break; 
+ case 'h': + ventiaddress = EARGF(usage()); + break; + default: + usage(); + }ARGEND + + if((z = vtdial(ventiaddress)) == nil) + sysfatal("vtdial %s: %r", ventiaddress); + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + srv = vtlisten(address); + if(srv == nil) + sysfatal("vtlisten %s: %r", address); + + while((r = vtgetreq(srv)) != nil){ + r->rx.msgtype = r->tx.msgtype+1; + if(verbose) + fprint(2, "<- %F\n", &r->tx); + switch(r->tx.msgtype){ + case VtTping: + break; + case VtTgoodbye: + break; + case VtTread: + threadcreate(readthread, r, 16384); + continue; + case VtTwrite: + r->rx.error = vtstrdup("read-only server"); + r->rx.msgtype = VtRerror; + break; + case VtTsync: + break; + } + if(verbose) + fprint(2, "-> %F\n", &r->rx); + vtrespond(r); + } + threadexitsall(nil); +} + diff --git a/src/cmd/venti/srv/arena.c b/src/cmd/venti/srv/arena.c new file mode 100644 index 00000000..15bf44d2 --- /dev/null +++ b/src/cmd/venti/srv/arena.c @@ -0,0 +1,737 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +typedef struct ASum ASum; + +struct ASum +{ + Arena *arena; + ASum *next; +}; + +static void sealarena(Arena *arena); +static int okarena(Arena *arena); +static int loadarena(Arena *arena); +static CIBlock *getcib(Arena *arena, int clump, int writing, CIBlock *rock); +static void putcib(Arena *arena, CIBlock *cib); +static void sumproc(void *); + +static QLock sumlock; +static Rendez sumwait; +static ASum *sumq; +static uchar zero[8192]; + +int arenasumsleeptime; + +int +initarenasum(void) +{ + sumwait.l = &sumlock; + + if(vtproc(sumproc, nil) < 0){ + seterr(EOk, "can't start arena checksum slave: %r"); + return -1; + } + return 0; +} + +/* + * make an Arena, and initialize it based upon the disk header and trailer. 
+ */ +Arena* +initarena(Part *part, u64int base, u64int size, u32int blocksize) +{ + Arena *arena; + + arena = MKZ(Arena); + arena->part = part; + arena->blocksize = blocksize; + arena->clumpmax = arena->blocksize / ClumpInfoSize; + arena->base = base + blocksize; + arena->size = size - 2 * blocksize; + + if(loadarena(arena) < 0){ + seterr(ECorrupt, "arena header or trailer corrupted"); + freearena(arena); + return nil; + } + if(okarena(arena) < 0){ + freearena(arena); + return nil; + } + + if(arena->diskstats.sealed && scorecmp(zeroscore, arena->score)==0) + backsumarena(arena); + + return arena; +} + +void +freearena(Arena *arena) +{ + if(arena == nil) + return; + free(arena); +} + +Arena* +newarena(Part *part, u32int vers, char *name, u64int base, u64int size, u32int blocksize) +{ + int bsize; + Arena *arena; + + if(nameok(name) < 0){ + seterr(EOk, "illegal arena name", name); + return nil; + } + arena = MKZ(Arena); + arena->part = part; + arena->version = vers; + if(vers == ArenaVersion4) + arena->clumpmagic = _ClumpMagic; + else{ + do + arena->clumpmagic = fastrand(); + while(arena->clumpmagic==_ClumpMagic || arena->clumpmagic==0); + } + arena->blocksize = blocksize; + arena->clumpmax = arena->blocksize / ClumpInfoSize; + arena->base = base + blocksize; + arena->size = size - 2 * blocksize; + + namecp(arena->name, name); + + bsize = sizeof zero; + if(bsize > arena->blocksize) + bsize = arena->blocksize; + + if(wbarena(arena)<0 || wbarenahead(arena)<0 + || writepart(arena->part, arena->base, zero, bsize)<0){ + freearena(arena); + return nil; + } + + return arena; +} + +int +readclumpinfo(Arena *arena, int clump, ClumpInfo *ci) +{ + CIBlock *cib, r; + + cib = getcib(arena, clump, 0, &r); + if(cib == nil) + return -1; + unpackclumpinfo(ci, &cib->data->data[cib->offset]); + putcib(arena, cib); + return 0; +} + +int +readclumpinfos(Arena *arena, int clump, ClumpInfo *cis, int n) +{ + CIBlock *cib, r; + int i; + + for(i = 0; i < n; i++){ + cib = getcib(arena, clump 
/*
 * read a clump of data
 * n is a hint of the size of the data, not including the header
 * make sure it won't run off the end, then return the number of bytes actually read
 *
 * aa is an offset within the arena's data area (arena->base is added
 * below).  The readable area ends where the clump directory for the
 * current memstats.clumps begins; a request that extends past it is
 * shortened, and one that starts past it fails.
 *
 * Failure is reported as -1, which in the u32int return type is the
 * all-ones value.  A request with n == 0 is treated as a failure.
 */
u32int
readarena(Arena *arena, u64int aa, u8int *buf, long n)
{
	DBlock *b;
	u64int a;
	u32int blocksize, off, m;
	long nn;

	if(n == 0)
		return -1;

	/* the lock is held only while computing the end of clump storage */
	qlock(&arena->lock);
	a = arena->size - arenadirsize(arena, arena->memstats.clumps);
	qunlock(&arena->lock);
	if(aa >= a){
		/* NOTE(review): memstats is re-read here after the lock was dropped */
		seterr(EOk, "reading beyond arena clump storage: clumps=%d aa=%lld a=%lld -1 clumps=%lld\n", arena->memstats.clumps, aa, a, arena->size - arenadirsize(arena, arena->memstats.clumps - 1));
		return -1;
	}
	if(aa + n > a)
		n = a - aa;

	/*
	 * split the partition address into a block-aligned address and an
	 * offset within that block
	 * (the mask assumes blocksize is a power of two - TODO confirm)
	 */
	blocksize = arena->blocksize;
	a = arena->base + aa;
	off = a & (blocksize - 1);
	a -= off;
	nn = 0;
	for(;;){
		b = getdblock(arena->part, a, OREAD);
		if(b == nil)
			return -1;
		/* copy what this block holds, limited to what is still wanted */
		m = blocksize - off;
		if(m > n - nn)
			m = n - nn;
		memmove(&buf[nn], &b->data[off], m);
		putdblock(b);
		nn += m;
		if(nn == n)
			break;
		/* later blocks are read from their start */
		off = 0;
		a += blocksize;
	}
	return n;
}
+ */ +u32int +writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n) +{ + DBlock *b; + u64int a; + u32int blocksize, off, m; + long nn; + int ok; + + if(n == 0) + return -1; + + qlock(&arena->lock); + a = arena->size - arenadirsize(arena, arena->memstats.clumps); + if(aa >= a || aa + n > a){ + qunlock(&arena->lock); + seterr(EOk, "writing beyond arena clump storage"); + return -1; + } + + blocksize = arena->blocksize; + a = arena->base + aa; + off = a & (blocksize - 1); + a -= off; + nn = 0; + for(;;){ + b = getdblock(arena->part, a, off != 0 || off + n < blocksize ? ORDWR : OWRITE); + if(b == nil){ + qunlock(&arena->lock); + return -1; + } + dirtydblock(b, DirtyArena); + m = blocksize - off; + if(m > n - nn) + m = n - nn; + memmove(&b->data[off], &clbuf[nn], m); + // ok = writepart(arena->part, a, b->data, blocksize); + ok = 0; + putdblock(b); + if(ok < 0){ + qunlock(&arena->lock); + return -1; + } + nn += m; + if(nn == n) + break; + off = 0; + a += blocksize; + } + qunlock(&arena->lock); + return n; +} + +/* + * allocate space for the clump and write it, + * updating the arena directory +ZZZ question: should this distinguish between an arena +filling up and real errors writing the clump? 
/*
 * allocate space for the clump and write it,
 * updating the arena directory
 *
 * ZZZ question: should this distinguish between an arena
 * filling up and real errors writing the clump?
 *
 * c is packed into clbuf and appended at the arena's current
 * memstats.used offset.  start is added to arena-relative offsets to
 * form the addresses stored in *pa and handed to setdcachestate.
 *
 * Returns the arena-relative offset of the new clump, or TWID64 if the
 * arena is sealed or too full (in which case it is marked sealed in
 * memory), the clump cannot be packed, or a block cannot be obtained.
 * The arena lock is held for the whole call.
 */
u64int
writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64int start, u64int *pa)
{
	DBlock *b;
	u64int a, aa;
	u32int clump, n, nn, m, off, blocksize;
	int ok;
	AState as;

	/* bytes written: the clump data, its header, and one extra U32Size */
	n = c->info.size + ClumpSize + U32Size;
	qlock(&arena->lock);
	aa = arena->memstats.used;
	/* the clump must fit alongside a directory grown by one more entry */
	if(arena->memstats.sealed
	|| aa + n + U32Size + arenadirsize(arena, arena->memstats.clumps + 1) > arena->size){
		if(!arena->memstats.sealed){
			/* first overflow: seal in memory and publish that state */
			trace(0, "seal memstats %s", arena->name);
			arena->memstats.sealed = 1;
			as.arena = arena;
			as.aa = start+aa;
			as.stats = arena->memstats;
			setdcachestate(&as);
		}
		qunlock(&arena->lock);
		return TWID64;
	}
	if(packclump(c, &clbuf[0], arena->clumpmagic) < 0){
		qunlock(&arena->lock);
		return TWID64;
	}

	/*
	 * write the data out one block at a time
	 */
	blocksize = arena->blocksize;
	a = arena->base + aa;
	off = a & (blocksize - 1);
	a -= off;
	nn = 0;
	for(;;){
		/* only a block entered part-way (the first) is opened ORDWR */
		b = getdblock(arena->part, a, off != 0 ? ORDWR : OWRITE);
		if(b == nil){
			qunlock(&arena->lock);
			return TWID64;
		}
		dirtydblock(b, DirtyArena);
		m = blocksize - off;
		if(m > n - nn)
			m = n - nn;
		memmove(&b->data[off], &clbuf[nn], m);
		// ok = writepart(arena->part, a, b->data, blocksize);
		ok = 0;	/* direct write is disabled above, so the check below never fires */
		putdblock(b);
		if(ok < 0){
			qunlock(&arena->lock);
			return TWID64;
		}
		nn += m;
		if(nn == n)
			break;
		off = 0;
		a += blocksize;
	}

	/* used grows by U32Size less than the n bytes written above */
	arena->memstats.used += c->info.size + ClumpSize;
	arena->memstats.uncsize += c->info.uncsize;
	if(c->info.size < c->info.uncsize)
		arena->memstats.cclumps++;

	clump = arena->memstats.clumps++;
	if(arena->memstats.clumps == 0)
		sysfatal("clumps wrapped");
	arena->wtime = now();
	if(arena->ctime == 0)
		arena->ctime = arena->wtime;

	/* NOTE(review): the result of writeclumpinfo is not checked */
	writeclumpinfo(arena, clump, &c->info);

	/* set up for call to setdcachestate */
	as.arena = arena;
	as.aa = start+arena->memstats.used;
	as.stats = arena->memstats;

	/* update this before calling setdcachestate so it cannot be behind dcache.diskstate */
	*pa = start+aa;
	setdcachestate(&as);
	qunlock(&arena->lock);

	return aa;
}
+ */ +static void +sealarena(Arena *arena) +{ + arena->inqueue = 1; + backsumarena(arena); +} + +void +backsumarena(Arena *arena) +{ + ASum *as; + + if(sumwait.l == nil) + return; + + as = MK(ASum); + if(as == nil) + return; + qlock(&sumlock); + as->arena = arena; + as->next = sumq; + sumq = as; + rwakeup(&sumwait); + qunlock(&sumlock); +} + +static void +sumproc(void *unused) +{ + ASum *as; + Arena *arena; + + USED(unused); + + for(;;){ + qlock(&sumlock); + while(sumq == nil) + rsleep(&sumwait); + as = sumq; + sumq = as->next; + qunlock(&sumlock); + arena = as->arena; + free(as); + + sumarena(arena); + } +} + +void +sumarena(Arena *arena) +{ + ZBlock *b; + DigestState s; + u64int a, e; + u32int bs; + u8int score[VtScoreSize]; + + bs = MaxIoSize; + if(bs < arena->blocksize) + bs = arena->blocksize; + + /* + * read & sum all blocks except the last one + */ + memset(&s, 0, sizeof s); + b = alloczblock(bs, 0, arena->part->blocksize); + e = arena->base + arena->size; + for(a = arena->base - arena->blocksize; a + arena->blocksize <= e; a += bs){ + sleep(arenasumsleeptime); + if(a + bs > e) + bs = arena->blocksize; + if(readpart(arena->part, a, b->data, bs) < 0) + goto ReadErr; + addstat(StatSumRead, 1); + addstat(StatSumReadBytes, bs); + sha1(b->data, bs, nil, &s); + } + + /* + * the last one is special, since it may already have the checksum included + */ + bs = arena->blocksize; + if(readpart(arena->part, e, b->data, bs) < 0){ +ReadErr: + logerr(EOk, "sumarena can't sum %s, read at %lld failed: %r", arena->name, a); + freezblock(b); + return; + } + addstat(StatSumRead, 1); + addstat(StatSumReadBytes, bs); + + sha1(b->data, bs-VtScoreSize, nil, &s); + sha1(zeroscore, VtScoreSize, nil, &s); + sha1(nil, 0, score, &s); + + /* + * check for no checksum or the same + * + * the writepart is okay because we flushed the dcache in sealarena + */ + if(scorecmp(score, &b->data[bs - VtScoreSize]) != 0){ + if(scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0) + logerr(EOk, 
"overwriting mismatched checksums for arena=%s, found=%V calculated=%V", + arena->name, &b->data[bs - VtScoreSize], score); + scorecp(&b->data[bs - VtScoreSize], score); + if(writepart(arena->part, e, b->data, bs) < 0) + logerr(EOk, "sumarena can't write sum for %s: %r", arena->name); + } + freezblock(b); + + qlock(&arena->lock); + scorecp(arena->score, score); + qunlock(&arena->lock); +} + +/* + * write the arena trailer block to the partition + */ +int +wbarena(Arena *arena) +{ + DBlock *b; + int bad; + + if((b = getdblock(arena->part, arena->base + arena->size, OWRITE)) == nil){ + logerr(EAdmin, "can't write arena trailer: %r"); + return -1; + } + dirtydblock(b, DirtyArenaTrailer); + bad = okarena(arena)<0 || packarena(arena, b->data)<0; + putdblock(b); + if(bad) + return -1; + return 0; +} + +int +wbarenahead(Arena *arena) +{ + ZBlock *b; + ArenaHead head; + int bad; + + namecp(head.name, arena->name); + head.version = arena->version; + head.size = arena->size + 2 * arena->blocksize; + head.blocksize = arena->blocksize; + head.clumpmagic = arena->clumpmagic; + b = alloczblock(arena->blocksize, 1, arena->part->blocksize); + if(b == nil){ + logerr(EAdmin, "can't write arena header: %r"); +///ZZZ add error message? + return -1; + } + /* + * this writepart is okay because it only happens + * during initialization. 
+ */ + bad = packarenahead(&head, b->data)<0 || + writepart(arena->part, arena->base - arena->blocksize, b->data, arena->blocksize)<0; + freezblock(b); + if(bad) + return -1; + return 0; +} + +/* + * read the arena header and trailer blocks from disk + */ +static int +loadarena(Arena *arena) +{ + ArenaHead head; + ZBlock *b; + + b = alloczblock(arena->blocksize, 0, arena->part->blocksize); + if(b == nil) + return -1; + if(readpart(arena->part, arena->base + arena->size, b->data, arena->blocksize) < 0){ + freezblock(b); + return -1; + } + if(unpackarena(arena, b->data) < 0){ + freezblock(b); + return -1; + } + if(arena->version != ArenaVersion4 && arena->version != ArenaVersion5){ + seterr(EAdmin, "unknown arena version %d", arena->version); + freezblock(b); + return -1; + } + scorecp(arena->score, &b->data[arena->blocksize - VtScoreSize]); + + if(readpart(arena->part, arena->base - arena->blocksize, b->data, arena->blocksize) < 0){ + logerr(EAdmin, "can't read arena header: %r"); + freezblock(b); + return 0; + } + if(unpackarenahead(&head, b->data) < 0) + logerr(ECorrupt, "corrupted arena header: %r"); + else if(namecmp(arena->name, head.name)!=0 + || arena->clumpmagic != head.clumpmagic + || arena->version != head.version + || arena->blocksize != head.blocksize + || arena->size + 2 * arena->blocksize != head.size){ + if(namecmp(arena->name, head.name)!=0) + logerr(ECorrupt, "arena tail name %s head %s", + arena->name, head.name); + else if(arena->clumpmagic != head.clumpmagic) + logerr(ECorrupt, "arena tail clumpmagic 0x%lux head 0x%lux", + (ulong)arena->clumpmagic, (ulong)head.clumpmagic); + else if(arena->version != head.version) + logerr(ECorrupt, "arena tail version %d head version %d", + arena->version, head.version); + else if(arena->blocksize != head.blocksize) + logerr(ECorrupt, "arena tail block size %d head %d", + arena->blocksize, head.blocksize); + else if(arena->size+2*arena->blocksize != head.size) + logerr(ECorrupt, "arena tail size %lud head %lud", 
+ (ulong)arena->size+2*arena->blocksize, head.size); + else + logerr(ECorrupt, "arena header inconsistent with arena data"); + } + freezblock(b); + + return 0; +} + +static int +okarena(Arena *arena) +{ + u64int dsize; + int ok; + + ok = 0; + dsize = arenadirsize(arena, arena->diskstats.clumps); + if(arena->diskstats.used + dsize > arena->size){ + seterr(ECorrupt, "arena used > size"); + ok = -1; + } + + if(arena->diskstats.cclumps > arena->diskstats.clumps) + logerr(ECorrupt, "arena has more compressed clumps than total clumps"); + + if(arena->diskstats.uncsize + arena->diskstats.clumps * ClumpSize + arena->blocksize < arena->diskstats.used) + logerr(ECorrupt, "arena uncompressed size inconsistent with used space %lld %d %lld", arena->diskstats.uncsize, arena->diskstats.clumps, arena->diskstats.used); + + if(arena->ctime > arena->wtime) + logerr(ECorrupt, "arena creation time after last write time"); + + return ok; +} + +static CIBlock* +getcib(Arena *arena, int clump, int writing, CIBlock *rock) +{ + int mode; + CIBlock *cib; + u32int block, off; + + if(clump >= arena->memstats.clumps){ + seterr(EOk, "clump directory access out of range"); + return nil; + } + block = clump / arena->clumpmax; + off = (clump - block * arena->clumpmax) * ClumpInfoSize; + cib = rock; + cib->block = block; + cib->offset = off; + + if(writing){ + if(off == 0 && clump == arena->memstats.clumps-1) + mode = OWRITE; + else + mode = ORDWR; + }else + mode = OREAD; + + cib->data = getdblock(arena->part, + arena->base + arena->size - (block + 1) * arena->blocksize, mode); + if(cib->data == nil) + return nil; + return cib; +} + +static void +putcib(Arena *arena, CIBlock *cib) +{ + putdblock(cib->data); + cib->data = nil; +} diff --git a/src/cmd/venti/srv/arenas.c b/src/cmd/venti/srv/arenas.c new file mode 100644 index 00000000..2ad1bb02 --- /dev/null +++ b/src/cmd/venti/srv/arenas.c @@ -0,0 +1,414 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +typedef struct AHash AHash; + +/* + 
* hash table for finding arena's based on their names. + */ +struct AHash +{ + AHash *next; + Arena *arena; +}; + +enum +{ + AHashSize = 512 +}; + +static AHash *ahash[AHashSize]; + +static u32int +hashstr(char *s) +{ + u32int h; + int c; + + h = 0; + for(; c = *s; s++){ + c ^= c << 6; + h += (c << 11) ^ (c >> 1); + c = *s; + h ^= (c << 14) + (c << 7) + (c << 4) + c; + } + return h; +} + +int +addarena(Arena *arena) +{ + AHash *a; + u32int h; + + h = hashstr(arena->name) & (AHashSize - 1); + a = MK(AHash); + if(a == nil) + return -1; + a->arena = arena; + a->next = ahash[h]; + ahash[h] = a; + return 0; +} + +Arena* +findarena(char *name) +{ + AHash *a; + u32int h; + + h = hashstr(name) & (AHashSize - 1); + for(a = ahash[h]; a != nil; a = a->next) + if(strcmp(a->arena->name, name) == 0) + return a->arena; + return nil; +} + +int +delarena(Arena *arena) +{ + AHash *a, *last; + u32int h; + + h = hashstr(arena->name) & (AHashSize - 1); + last = nil; + for(a = ahash[h]; a != nil; a = a->next){ + if(a->arena == arena){ + if(last != nil) + last->next = a->next; + else + ahash[h] = a->next; + free(a); + return 0; + } + last = a; + } + return -1; +} + +ArenaPart* +initarenapart(Part *part) +{ + AMapN amn; + ArenaPart *ap; + ZBlock *b; + u32int i; + int ok; + + b = alloczblock(HeadSize, 0, 0); + if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){ + seterr(EAdmin, "can't read arena partition header: %r"); + return nil; + } + + ap = MKZ(ArenaPart); + if(ap == nil){ + freezblock(b); + return nil; + } + ap->part = part; + ok = unpackarenapart(ap, b->data); + freezblock(b); + if(ok < 0){ + freearenapart(ap, 0); + return nil; + } + + ap->tabbase = (PartBlank + HeadSize + ap->blocksize - 1) & ~(ap->blocksize - 1); + if(ap->version != ArenaPartVersion){ + seterr(ECorrupt, "unknown arena partition version %d", ap->version); + freearenapart(ap, 0); + return nil; + } + if(ap->blocksize & (ap->blocksize - 1)){ + seterr(ECorrupt, "illegal non-power-of-2 block size %d\n", 
ap->blocksize); + freearenapart(ap, 0); + return nil; + } + if(ap->tabbase >= ap->arenabase){ + seterr(ECorrupt, "arena partition table overlaps with arena storage"); + freearenapart(ap, 0); + return nil; + } + ap->tabsize = ap->arenabase - ap->tabbase; + partblocksize(part, ap->blocksize); + ap->size = ap->part->size & ~(u64int)(ap->blocksize - 1); + + if(readarenamap(&amn, part, ap->tabbase, ap->tabsize) < 0){ + freearenapart(ap, 0); + return nil; + } + ap->narenas = amn.n; + ap->map = amn.map; + if(okamap(ap->map, ap->narenas, ap->arenabase, ap->size, "arena table") < 0){ + freearenapart(ap, 0); + return nil; + } + + ap->arenas = MKNZ(Arena*, ap->narenas); + for(i = 0; i < ap->narenas; i++){ + ap->arenas[i] = initarena(part, ap->map[i].start, ap->map[i].stop - ap->map[i].start, ap->blocksize); + if(ap->arenas[i] == nil){ + seterr(ECorrupt, "%s: %r", ap->map[i].name); + freearenapart(ap, 1); + return nil; + } + if(namecmp(ap->map[i].name, ap->arenas[i]->name) != 0){ + seterr(ECorrupt, "arena name mismatches with expected name: %s vs. 
%s", + ap->map[i].name, ap->arenas[i]->name); + freearenapart(ap, 1); + return nil; + } + if(findarena(ap->arenas[i]->name)){ + seterr(ECorrupt, "duplicate arena name %s in %s", + ap->map[i].name, ap->part->name); + freearenapart(ap, 1); + return nil; + } + } + + for(i = 0; i < ap->narenas; i++) + addarena(ap->arenas[i]); + + return ap; +} + +ArenaPart* +newarenapart(Part *part, u32int blocksize, u32int tabsize) +{ + ArenaPart *ap; + + if(blocksize & (blocksize - 1)){ + seterr(ECorrupt, "illegal non-power-of-2 block size %d\n", blocksize); + return nil; + } + ap = MKZ(ArenaPart); + if(ap == nil) + return nil; + + ap->version = ArenaPartVersion; + ap->part = part; + ap->blocksize = blocksize; + partblocksize(part, blocksize); + ap->size = part->size & ~(u64int)(blocksize - 1); + ap->tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1); + ap->arenabase = (ap->tabbase + tabsize + blocksize - 1) & ~(blocksize - 1); + ap->tabsize = ap->arenabase - ap->tabbase; + ap->narenas = 0; + + if(wbarenapart(ap) < 0){ + freearenapart(ap, 0); + return nil; + } + + return ap; +} + +int +wbarenapart(ArenaPart *ap) +{ + ZBlock *b; + + if(okamap(ap->map, ap->narenas, ap->arenabase, ap->size, "arena table") < 0) + return -1; + b = alloczblock(HeadSize, 1, 0); + if(b == nil) +//ZZZ set error message? 
+ return -1; + + if(packarenapart(ap, b->data) < 0){ + seterr(ECorrupt, "can't make arena partition header: %r"); + freezblock(b); + return -1; + } + if(writepart(ap->part, PartBlank, b->data, HeadSize) < 0){ + seterr(EAdmin, "can't write arena partition header: %r"); + freezblock(b); + return -1; + } + freezblock(b); + + return wbarenamap(ap->map, ap->narenas, ap->part, ap->tabbase, ap->tabsize); +} + +void +freearenapart(ArenaPart *ap, int freearenas) +{ + int i; + + if(ap == nil) + return; + if(freearenas){ + for(i = 0; i < ap->narenas; i++){ + if(ap->arenas[i] == nil) + continue; + delarena(ap->arenas[i]); + freearena(ap->arenas[i]); + } + } + free(ap->map); + free(ap->arenas); + free(ap); +} + +int +okamap(AMap *am, int n, u64int start, u64int stop, char *what) +{ + u64int last; + u32int i; + + last = start; + for(i = 0; i < n; i++){ + if(am[i].start < last){ + if(i == 0) + seterr(ECorrupt, "invalid start address in %s", what); + else + seterr(ECorrupt, "overlapping ranges in %s", what); + return -1; + } + if(am[i].stop < am[i].start){ + seterr(ECorrupt, "invalid range in %s", what); + return -1; + } + last = am[i].stop; + } + if(last > stop){ + seterr(ECorrupt, "invalid ending address in %s", what); + return -1; + } + return 0; +} + +int +maparenas(AMap *am, Arena **arenas, int n, char *what) +{ + u32int i; + + for(i = 0; i < n; i++){ + arenas[i] = findarena(am[i].name); + if(arenas[i] == nil){ + seterr(EAdmin, "can't find arena '%s' for '%s'\n", am[i].name, what); + return -1; + } + } + return 0; +} + +int +readarenamap(AMapN *amn, Part *part, u64int base, u32int size) +{ + IFile f; + u32int ok; + + if(partifile(&f, part, base, size) < 0) + return -1; + ok = parseamap(&f, amn); + freeifile(&f); + return ok; +} + +int +wbarenamap(AMap *am, int n, Part *part, u64int base, u64int size) +{ + Fmt f; + ZBlock *b; + + b = alloczblock(size, 1, part->blocksize); + if(b == nil) + return -1; + + fmtzbinit(&f, b); + + if(outputamap(&f, am, n) < 0){ + seterr(ECorrupt, 
"arena set size too small"); + freezblock(b); + return -1; + } + if(writepart(part, base, b->data, size) < 0){ + seterr(EAdmin, "can't write arena set: %r"); + freezblock(b); + return -1; + } + freezblock(b); + return 0; +} + +/* + * amap: n '\n' amapelem * n + * n: u32int + * amapelem: name '\t' astart '\t' asize '\n' + * astart, asize: u64int + */ +int +parseamap(IFile *f, AMapN *amn) +{ + AMap *am; + u64int v64; + u32int v; + char *s, *t, *flds[4]; + int i, n; + + /* + * arenas + */ + if(ifileu32int(f, &v) < 0){ + seterr(ECorrupt, "syntax error: bad number of elements in %s", f->name); + return -1; + } + n = v; + if(n > MaxAMap){ + seterr(ECorrupt, "illegal number of elements in %s", f->name); + return -1; + } + am = MKNZ(AMap, n); + if(am == nil){ + fprint(2, "out of memory\n"); + return -1; + } + for(i = 0; i < n; i++){ + s = ifileline(f); + if(s) + t = estrdup(s); + else + t = nil; + if(s == nil || getfields(s, flds, 4, 0, "\t") != 3){ + fprint(2, "early eof after %d of %d, %s:#%d: %s\n", i, n, f->name, f->pos, t); + free(t); + return -1; + } + free(t); + if(nameok(flds[0]) < 0) + return -1; + namecp(am[i].name, flds[0]); + if(stru64int(flds[1], &v64) < 0){ + seterr(ECorrupt, "syntax error: bad arena base address in %s", f->name); + free(am); + return -1; + } + am[i].start = v64; + if(stru64int(flds[2], &v64) < 0){ + seterr(ECorrupt, "syntax error: bad arena size in %s", f->name); + free(am); + return -1; + } + am[i].stop = v64; + } + + amn->map = am; + amn->n = n; + return 0; +} + +int +outputamap(Fmt *f, AMap *am, int n) +{ + int i; + + if(fmtprint(f, "%ud\n", n) < 0) + return -1; + for(i = 0; i < n; i++) + if(fmtprint(f, "%s\t%llud\t%llud\n", am[i].name, am[i].start, am[i].stop) < 0) + return -1; + return 0; +} diff --git a/src/cmd/venti/srv/bloom.c b/src/cmd/venti/srv/bloom.c new file mode 100644 index 00000000..5c50a0df --- /dev/null +++ b/src/cmd/venti/srv/bloom.c @@ -0,0 +1,210 @@ +/* + * Bloom filter tracking which scores are present in our arenas + * 
and (more importantly) which are not. + */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int +bloominit(Bloom *b, vlong vsize, u8int *data) +{ + ulong size; + + size = vsize; + if(size != vsize){ /* truncation */ + werrstr("bloom data too big"); + return -1; + } + + b->size = size; + b->nhash = 32; /* will be fixed by caller on initialization */ + if(data != nil) + if(unpackbloomhead(b, data) < 0) + return -1; + +fprint(2, "bloom size %lud nhash %d\n", b->size, b->nhash); + b->mask = b->size-1; + b->data = data; + return 0; +} + +void +wbbloomhead(Bloom *b) +{ + packbloomhead(b, b->data); +} + +Bloom* +readbloom(Part *p) +{ + int i, n; + uint ones; + uchar buf[512]; + uchar *data; + u32int *a; + Bloom *b; + + b = vtmallocz(sizeof *b); + if(readpart(p, 0, buf, sizeof buf) < 0) + return nil; +fprint(2, "header %.16H\n", buf); + if(bloominit(b, 0, buf) < 0){ + vtfree(b); + return nil; + } + data = vtmallocz(b->size); + if(readpart(p, 0, data, b->size) < 0){ + vtfree(b); + vtfree(data); + return nil; + } + b->data = data; + b->part = p; + + a = (u32int*)b->data; + n = b->size/4; + ones = 0; + for(i=0; i<n; i++) + ones += countbits(a[i]); + addstat(StatBloomOnes, ones); + + if(b->size == MaxBloomSize) /* 2^32 overflows ulong */ + addstat(StatBloomBits, b->size*8-1); + else + addstat(StatBloomBits, b->size*8); + + return b; +} + +int +writebloom(Bloom *b) +{ + wbbloomhead(b); + return writepart(b->part, 0, b->data, b->size); +} + +/* + * Derive two random 32-bit quantities a, b from the score + * and then use a+b*i as a sequence of bloom filter indices. + * Michael Mitzenmacher has a recent (2005) paper saying this is okay. + * We reserve the bottom bytes (BloomHeadSize*8 bits) for the header. 
+ */ +static void +gethashes(u8int *score, ulong *h) +{ + int i; + u32int a, b; + + a = 0; + b = 0; + for(i=4; i+8<=VtScoreSize; i+=8){ + a ^= *(u32int*)(score+i); + b ^= *(u32int*)(score+i+4); + } + if(i+4 <= VtScoreSize) /* 20 is not 4-aligned */ + a ^= *(u32int*)(score+i); + for(i=0; i<BloomMaxHash; i++, a+=b) + h[i] = a < BloomHeadSize*8 ? BloomHeadSize*8 : a; +} + +static void +_markbloomfilter(Bloom *b, u8int *score) +{ + int i, nnew; + ulong h[BloomMaxHash]; + u32int x, *y, z, *tab; + + trace("markbloomfilter", "markbloomfilter %V", score); + gethashes(score, h); + nnew = 0; + tab = (u32int*)b->data; + for(i=0; i<b->nhash; i++){ + x = h[i]; + y = &tab[(x&b->mask)>>5]; + z = 1<<(x&31); + if(!(*y&z)){ + nnew++; + *y |= z; + } + } + if(nnew) + addstat(StatBloomOnes, nnew); + + trace("markbloomfilter", "markbloomfilter exit"); +} + +static int +_inbloomfilter(Bloom *b, u8int *score) +{ + int i; + ulong h[BloomMaxHash], x; + u32int *tab; + + gethashes(score, h); + tab = (u32int*)b->data; + for(i=0; i<b->nhash; i++){ + x = h[i]; + if(!(tab[(x&b->mask)>>5] & (1<<(x&31)))) + return 0; + } + return 1; +} + +int +inbloomfilter(Bloom *b, u8int *score) +{ + int r; + uint ms; + + if(b == nil) + return 1; + + ms = msec(); + rlock(&b->lk); + r = _inbloomfilter(b, score); + runlock(&b->lk); + ms = ms - msec(); + addstat2(StatBloomLookup, 1, StatBloomLookupTime, ms); + if(r) + addstat(StatBloomMiss, 1); + else + addstat(StatBloomHit, 1); + return r; +} + +void +markbloomfilter(Bloom *b, u8int *score) +{ + if(b == nil) + return; + + rlock(&b->lk); + qlock(&b->mod); + _markbloomfilter(b, score); + qunlock(&b->mod); + runlock(&b->lk); +} + +static void +bloomwriteproc(void *v) +{ + Bloom *b; + + b = v; + for(;;){ + recv(b->writechan, 0); + if(writebloom(b) < 0) + fprint(2, "oops! 
writing bloom: %r\n"); + send(b->writedonechan, 0); + } +} + +void +startbloomproc(Bloom *b) +{ + b->writechan = chancreate(sizeof(void*), 0); + b->writedonechan = chancreate(sizeof(void*), 0); + vtproc(bloomwriteproc, b); +} diff --git a/src/cmd/venti/srv/buildbuck.c b/src/cmd/venti/srv/buildbuck.c new file mode 100644 index 00000000..240e77d7 --- /dev/null +++ b/src/cmd/venti/srv/buildbuck.c @@ -0,0 +1,132 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +/* + * An IEStream is a sorted list of index entries. + */ +struct IEStream +{ + Part *part; + u64int off; /* read position within part */ + u64int n; /* number of valid ientries left to read */ + u32int size; /* allocated space in buffer */ + u8int *buf; + u8int *pos; /* current place in buffer */ + u8int *epos; /* end of valid buffer contents */ +}; + +IEStream* +initiestream(Part *part, u64int off, u64int clumps, u32int size) +{ + IEStream *ies; + +//ZZZ out of memory? + ies = MKZ(IEStream); + ies->buf = MKN(u8int, size); + ies->epos = ies->buf; + ies->pos = ies->epos; + ies->off = off; + ies->n = clumps; + ies->size = size; + ies->part = part; + return ies; +} + +void +freeiestream(IEStream *ies) +{ + if(ies == nil) + return; + free(ies->buf); + free(ies); +} + +/* + * Return the next IEntry (still packed) in the stream. + */ +static u8int* +peekientry(IEStream *ies) +{ + u32int n, nn; + + n = ies->epos - ies->pos; + if(n < IEntrySize){ + memmove(ies->buf, ies->pos, n); + ies->epos = &ies->buf[n]; + ies->pos = ies->buf; + nn = ies->size; + if(nn > ies->n * IEntrySize) + nn = ies->n * IEntrySize; + nn -= n; + if(nn == 0) + return nil; +//fprint(2, "peek %d from %llud into %p\n", nn, ies->off, ies->epos); + if(readpart(ies->part, ies->off, ies->epos, nn) < 0){ + seterr(EOk, "can't read sorted index entries: %r"); + return nil; + } + ies->epos += nn; + ies->off += nn; + } + return ies->pos; +} + +/* + * Compute the bucket number for the given IEntry. 
/*
 * Fill ib with the next bucket in the stream.
 *
 * Consecutive entries are taken from ies while they hash to the same
 * bucket number as the first one.  ib->data must have room for maxdata
 * bytes of packed entries.
 *
 * Returns the bucket number of the entries placed in ib.
 * Returns TWID32 when the stream has no entries left (ib->n is then 0),
 * and also when reading the stream fails or the bucket would exceed
 * maxdata bytes, so callers cannot tell these cases apart from the
 * return value alone.
 */
u32int
buildbucket(Index *ix, IEStream *ies, IBucket *ib, uint maxdata)
{
	IEntry ie1, ie2;
	u8int *b;
	u32int buck;

	buck = TWID32;
	ib->n = 0;
	while(ies->n){
		/* b points at the next packed entry inside the stream buffer */
		b = peekientry(ies);
		if(b == nil)
			return TWID32;
//fprint(2, "b=%p ies->n=%lld ib.n=%d buck=%d score=%V\n", b, ies->n, ib->n, iebuck(ix, b, ib, ies), b);
		if(ib->n == 0)
			/* the first entry decides which bucket is being built */
			buck = iebuck(ix, b, ib, ies);
		else{
			/* an entry for a different bucket ends this one; it stays in the stream */
			if(buck != iebuck(ix, b, ib, ies))
				break;
			if(ientrycmp(&ib->data[(ib->n - 1)* IEntrySize], b) == 0){
				/*
				 * guess that the larger address is the correct one to use
				 */
				unpackientry(&ie1, &ib->data[(ib->n - 1)* IEntrySize]);
				unpackientry(&ie2, b);
				seterr(EOk, "duplicate index entry for score=%V type=%d", ie1.score, ie1.ia.type);
				/*
				 * Drop the previously stored duplicate.  If it had the
				 * larger address, copy it over the stream entry so the
				 * common store below puts it back.
				 */
				ib->n--;
				if(ie1.ia.addr > ie2.ia.addr)
					memmove(b, &ib->data[ib->n * IEntrySize], IEntrySize);
			}
		}
		/* refuse to write past the space the caller provided */
		if((ib->n+1)*IEntrySize > maxdata){
			seterr(EOk, "bucket overflow");
			return TWID32;
		}
		/* append the entry to the bucket and consume it from the stream */
		memmove(&ib->data[ib->n * IEntrySize], b, IEntrySize);
		ib->n++;
		ies->n--;
		ies->pos += IEntrySize;
	}
	return buck;
}
+ */ +static int +writebucket(Index *ix, u32int buck, IBucket *ib, ZBlock *b) +{ + ISect *is; + + is = ix->sects[indexsect0(ix, buck)]; + if(buck < is->start || buck >= is->stop){ + seterr(EAdmin, "cannot find index section for bucket %lud\n", (ulong)buck); + return -1; + } + buck -= is->start; + +/* + qlock(&stats.lock); + stats.indexwrites++; + qunlock(&stats.lock); +*/ + packibucket(ib, b->data, is->bucketmagic); + return writepart(is->part, is->blockbase + ((u64int)buck << is->blocklog), b->data, is->blocksize); +} + +static int +buildindex(Index *ix, Part *part, u64int off, u64int clumps, int zero) +{ + IEStream *ies; + IBucket ib, zib; + ZBlock *z, *b; + u32int next, buck; + int ok; + uint nbuck; + u64int found = 0; + +//ZZZ make buffer size configurable + b = alloczblock(ix->blocksize, 0, ix->blocksize); + z = alloczblock(ix->blocksize, 1, ix->blocksize); + ies = initiestream(part, off, clumps, 64*1024); + if(b == nil || z == nil || ies == nil){ + ok = 0; + goto breakout; + return -1; + } + ok = 0; + next = 0; + memset(&ib, 0, sizeof ib); + ib.data = b->data + IBucketSize; + zib.data = z->data + IBucketSize; + zib.n = 0; + nbuck = 0; + for(;;){ + buck = buildbucket(ix, ies, &ib, ix->blocksize-IBucketSize); + found += ib.n; + if(zero){ + for(; next != buck; next++){ + if(next == ix->buckets){ + if(buck != TWID32){ + fprint(2, "bucket out of range\n"); + ok = -1; + } + goto breakout; + } + if(writebucket(ix, next, &zib, z) < 0){ + fprint(2, "can't write zero bucket to buck=%d: %r", next); + ok = -1; + } + } + } + if(buck >= ix->buckets){ + if(buck == TWID32) + break; + fprint(2, "bucket out of range\n"); + ok = -1; + goto breakout; + } + if(writebucket(ix, buck, &ib, b) < 0){ + fprint(2, "bad bucket found=%lld: %r\n", found); + ok = -1; + } + next = buck + 1; + if(++nbuck%10000 == 0) + fprint(2, "\t%,d buckets written...\n", nbuck); + } +breakout:; + fprint(2, "wrote index with %lld entries\n", found); + freeiestream(ies); + freezblock(z); + freezblock(b); + 
return ok; +} + +void +usage(void) +{ + fprint(2, "usage: buildindex [-Z] [-B blockcachesize] config tmppart\n"); + threadexitsall(0); +} + +Config conf; + +void +threadmain(int argc, char *argv[]) +{ + Part *part; + u64int clumps, base; + u32int bcmem; + int zero; + + zero = 1; + bcmem = 0; + ARGBEGIN{ + case 'B': + bcmem = unittoull(ARGF()); + break; + case 'Z': + zero = 0; + break; + default: + usage(); + break; + }ARGEND + + if(argc != 2) + usage(); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + fprint(2, "building a new index %s using %s for temporary storage\n", mainindex->name, argv[1]); + + part = initpart(argv[1], ORDWR|ODIRECT); + if(part == nil) + sysfatal("can't initialize temporary partition: %r"); + + clumps = sortrawientries(mainindex, part, &base, mainindex->bloom); + if(clumps == TWID64) + sysfatal("can't build sorted index: %r"); + fprint(2, "found and sorted index entries for clumps=%lld at %lld\n", clumps, base); + + if(buildindex(mainindex, part, base, clumps, zero) < 0) + sysfatal("can't build new index: %r"); + + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/checkarenas.c b/src/cmd/venti/srv/checkarenas.c new file mode 100644 index 00000000..525a634c --- /dev/null +++ b/src/cmd/venti/srv/checkarenas.c @@ -0,0 +1,135 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int verbose; + +static void +checkarena(Arena *arena, int scan, int fix) +{ + ATailStats old; + int err, e; + + if(verbose && arena->memstats.clumps) + printarena(2, arena); + + old = arena->memstats; + + if(scan){ + arena->memstats.used = 0; + arena->memstats.clumps = 0; + arena->memstats.cclumps = 0; + arena->memstats.uncsize = 0; + } + + err = 0; + for(;;){ + e = 
/*
 * Decide whether the arena called name was selected on the command line.
 * An empty argument list selects every arena; otherwise name must match
 * one of the argc strings in argv exactly.
 * Returns 1 if selected, 0 if not.
 */
int
should(char *name, int argc, char **argv)
{
	int left;

	if(argc == 0)
		return 1;
	for(left = argc; left > 0; left--, argv++){
		if(strcmp(name, *argv) == 0)
			return 1;
	}
	return 0;
}
b/src/cmd/venti/srv/checkindex.c new file mode 100644 index 00000000..f7040d12 --- /dev/null +++ b/src/cmd/venti/srv/checkindex.c @@ -0,0 +1,293 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int extra, missing, wrong; + +static void +phdr(DBlock *eb) +{ + static int did; + + if(!did){ + did = 1; + print("# diff actual correct\n"); + } + print("%s block 0x%llux\n", eb->part->name, eb->addr); +} + +static void +pie(IEntry *ie, char c) +{ + print("%c %V %22lld %3d %5d %3d\n", + c, ie->score, ie->ia.addr, ie->ia.type, ie->ia.size, ie->ia.blocks); +} + +static int +checkbucket(Index *ix, u32int buck, IBucket *ib) +{ + ISect *is; + DBlock *eb; + IBucket eib; + IEntry ie, eie; + int i, ei, ok, c, hdr; + + is = ix->sects[indexsect0(ix, buck)]; + if(buck < is->start || buck >= is->stop){ + seterr(EAdmin, "cannot find index section for bucket %lud\n", (ulong)buck); + return -1; + } + buck -= is->start; + eb = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), OREAD); + if(eb == nil) + return -1; + unpackibucket(&eib, eb->data, is->bucketmagic); + + ok = 0; + ei = 0; + hdr = 0; + for(i = 0; i < ib->n; i++){ + while(ei < eib.n){ + c = ientrycmp(&ib->data[i * IEntrySize], &eib.data[ei * IEntrySize]); + if(c == 0){ + unpackientry(&ie, &ib->data[i * IEntrySize]); + unpackientry(&eie, &eib.data[ei * IEntrySize]); + if(iaddrcmp(&ie.ia, &eie.ia) != 0){ + if(!hdr){ + phdr(eb); + hdr = 1; + } + wrong++; + pie(&eie, '<'); + pie(&ie, '>'); + } + ei++; + goto cont; + } + if(c < 0) + break; + if(!hdr){ + phdr(eb); + hdr = 1; + } + unpackientry(&eie, &eib.data[ei*IEntrySize]); + extra++; + pie(&eie, '<'); + ei++; + ok = -1; + } + if(!hdr){ + phdr(eb); + hdr = 1; + } + unpackientry(&ie, &ib->data[i*IEntrySize]); + missing++; + pie(&ie, '>'); + ok = -1; + cont:; + } + for(; ei < eib.n; ei++){ + if(!hdr){ + phdr(eb); + hdr = 1; + } + unpackientry(&eie, &eib.data[ei*IEntrySize]); + pie(&eie, '<'); + ok = -1; + } + putdblock(eb); + return ok; +} + +int 
+checkindex(Index *ix, Part *part, u64int off, u64int clumps, int zero) +{ + IEStream *ies; + IBucket ib, zib; + ZBlock *z, *b; + u32int next, buck; + int ok, bok; +u64int found = 0; + +//ZZZ make buffer size configurable + b = alloczblock(ix->blocksize, 0, ix->blocksize); + z = alloczblock(ix->blocksize, 1, ix->blocksize); + ies = initiestream(part, off, clumps, 64*1024); + if(b == nil || z == nil || ies == nil){ + werrstr("allocating: %r"); + ok = -1; + goto breakout; + return -1; + } + ok = 0; + next = 0; + memset(&ib, 0, sizeof ib); + ib.data = b->data; + zib.data = z->data; + zib.n = 0; + zib.buck = 0; + for(;;){ + buck = buildbucket(ix, ies, &ib, ix->blocksize-IBucketSize); + found += ib.n; + if(zero){ + for(; next != buck; next++){ + if(next == ix->buckets){ + if(buck != TWID32){ + ok = -1; + werrstr("internal error: bucket out of range"); + } + if(ok < 0) + werrstr("%d spurious entries, %d missing, %d wrong", extra, missing, wrong); + goto breakout; + } + bok = checkbucket(ix, next, &zib); + if(bok < 0) + ok = -1; + } + } + if(buck >= ix->buckets){ + if(buck == TWID32) + break; + werrstr("internal error: bucket out of range"); + ok = -1; + goto breakout; + } + bok = checkbucket(ix, buck, &ib); + if(bok < 0) + ok = -1; + next = buck + 1; + } +breakout: + freeiestream(ies); + freezblock(z); + freezblock(b); + return ok; +} + +int +checkbloom(Bloom *b1, Bloom *b2, int fix) +{ + u32int *a1, *a2; + int i, n, extra, missing; + + if(b1==nil && b2==nil) + return 0; + if(b1==nil || b2==nil){ + werrstr("nil/non-nil"); + return -1; + } + wbbloomhead(b1); + wbbloomhead(b2); + if(memcmp(b1->data, b2->data, BloomHeadSize) != 0){ + werrstr("bloom header mismatch"); + return -1; + } + a1 = (u32int*)b1->data; + a2 = (u32int*)b2->data; + n = b1->size/4; + extra = 0; + missing = 0; + for(i=BloomHeadSize/4; i<n; i++){ + if(a1[i] != a2[i]){ +print("%.8ux/%.8ux.", a1[i], a2[i]); + extra += countbits(a1[i] & ~a2[i]); + missing += countbits(a2[i] & ~a1[i]); + } + } + if(extra || 
missing) + fprint(2, "bloom filter: %d spurious bits, %d missing bits\n", extra, missing); + else + fprint(2, "bloom filter: correct\n"); + if(!fix && missing){ + werrstr("missing bits"); + return -1; + } + if(fix && (missing || extra)){ + memmove(b1->data, b2->data, b1->size); + return writebloom(b1); + } + return 0; +} + + +void +usage(void) +{ + fprint(2, "usage: checkindex [-f] [-B blockcachesize] config tmp\n"); + threadexitsall(0); +} + +Config conf; + +void +threadmain(int argc, char *argv[]) +{ + Bloom *oldbloom, *newbloom; + Part *part; + u64int clumps, base; + u32int bcmem; + int fix, skipz, ok; + + fix = 0; + bcmem = 0; + skipz = 0; + ARGBEGIN{ + case 'B': + bcmem = unittoull(ARGF()); + break; + case 'f': + fix++; + break; + case 'Z': + skipz = 1; + break; + default: + usage(); + break; + }ARGEND + + if(argc != 2) + usage(); + + ventifmtinstall(); + + part = initpart(argv[1], ORDWR|ODIRECT); + if(part == nil) + sysfatal("can't initialize temporary partition: %r"); + + if(!fix) + readonly = 1; + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + oldbloom = mainindex->bloom; + newbloom = nil; + if(oldbloom){ + newbloom = vtmallocz(sizeof *newbloom); + bloominit(newbloom, oldbloom->size, nil); + newbloom->data = vtmallocz(oldbloom->size); + } + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + fprint(2, "checkindex: building entry list\n"); + clumps = sortrawientries(mainindex, part, &base, newbloom); + if(clumps == TWID64) + sysfatal("can't build sorted index: %r"); + fprint(2, "checkindex: checking %lld entries at %lld\n", clumps, base); + ok = 0; + if(checkindex(mainindex, part, base, clumps, !skipz) < 0){ + fprint(2, "checkindex: %r\n"); + ok = -1; + } + if(checkbloom(oldbloom, newbloom, fix) < 0){ + fprint(2, "checkbloom: %r\n"); + ok 
= -1; + } + if(ok < 0) + sysfatal("errors found"); + fprint(2, "checkindex: index is correct\n"); + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/clump.c b/src/cmd/venti/srv/clump.c new file mode 100644 index 00000000..88ebdb50 --- /dev/null +++ b/src/cmd/venti/srv/clump.c @@ -0,0 +1,222 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include "whack.h" + +/* + * Write a lump to disk. Updates ia with an index address + * for the newly-written lump. Upon return, the lump will + * have been placed in the disk cache but will likely not be on disk yet. + */ +int +storeclump(Index *ix, ZBlock *zb, u8int *sc, int type, u32int creator, IAddr *ia) +{ + ZBlock *cb; + Clump cl; + u64int a; + u8int bh[VtScoreSize]; + int size, dsize; + + trace(TraceLump, "storeclump enter", sc, type); + size = zb->len; + if(size > VtMaxLumpSize){ + seterr(EStrange, "lump too large"); + return -1; + } + if(vttypevalid(type) < 0){ + seterr(EStrange, "invalid lump type"); + return -1; + } + + if(0){ + scoremem(bh, zb->data, size); + if(scorecmp(sc, bh) != 0){ + seterr(ECorrupt, "storing clump: corrupted; expected=%V got=%V, size=%d", sc, bh, size); + return -1; + } + } + + cb = alloczblock(size + ClumpSize + U32Size, 0, 0); + if(cb == nil) + return -1; + + cl.info.type = type; + cl.info.uncsize = size; + cl.creator = creator; + cl.time = now(); + scorecp(cl.info.score, sc); + + trace(TraceLump, "storeclump whackblock"); + dsize = whackblock(&cb->data[ClumpSize], zb->data, size); + if(dsize > 0 && dsize < size){ + cl.encoding = ClumpECompress; + }else{ + if(dsize > size){ + fprint(2, "whack error: dsize=%d size=%d\n", dsize, size); + abort(); + } + cl.encoding = ClumpENone; + dsize = size; + memmove(&cb->data[ClumpSize], zb->data, size); + } + memset(cb->data+ClumpSize+dsize, 0, 4); + cl.info.size = dsize; + + ia->addr = 0; + ia->type = type; + ia->size = size; + ia->blocks = (dsize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; + + a = writeiclump(ix, &cl, cb->data, 
&ia->addr); + + trace(TraceLump, "storeclump exit %lld", a); + + freezblock(cb); + if(a == TWID64) + return -1; + +/* + qlock(&stats.lock); + stats.clumpwrites++; + stats.clumpbwrites += size; + stats.clumpbcomp += dsize; + qunlock(&stats.lock); +*/ + + return 0; +} + +u32int +clumpmagic(Arena *arena, u64int aa) +{ + u8int buf[U32Size]; + + if(readarena(arena, aa, buf, U32Size) < 0) + return TWID32; + return unpackmagic(buf); +} + +/* + * fetch a block based at addr. + * score is filled in with the block's score. + * blocks is roughly the length of the clump on disk; + * if zero, the length is unknown. + */ +ZBlock* +loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify) +{ + Unwhack uw; + ZBlock *zb, *cb; + u8int bh[VtScoreSize], *buf; + u32int n; + int nunc; + +/* + qlock(&stats.lock); + stats.clumpreads++; + qunlock(&stats.lock); +*/ + + if(blocks <= 0) + blocks = 1; + + trace(TraceLump, "loadclump enter"); + + cb = alloczblock(blocks << ABlockLog, 0, 0); + if(cb == nil) + return nil; + n = readarena(arena, aa, cb->data, blocks << ABlockLog); + if(n < ClumpSize){ + if(n != 0) + seterr(ECorrupt, "loadclump read less than a header"); + freezblock(cb); + return nil; + } + trace(TraceLump, "loadclump unpack"); + if(unpackclump(cl, cb->data, arena->clumpmagic) < 0){ + seterr(ECorrupt, "loadclump %s %llud: %r", arena->name, aa); + freezblock(cb); + return nil; + } + n -= ClumpSize; + if(n < cl->info.size){ + freezblock(cb); + n = cl->info.size; + cb = alloczblock(n, 0, 0); + if(cb == nil) + return nil; + if(readarena(arena, aa + ClumpSize, cb->data, n) != n){ + seterr(ECorrupt, "loadclump read too little data"); + freezblock(cb); + return nil; + } + buf = cb->data; + }else + buf = cb->data + ClumpSize; + + scorecp(score, cl->info.score); + + zb = alloczblock(cl->info.uncsize, 0, 0); + if(zb == nil){ + freezblock(cb); + return nil; + } + switch(cl->encoding){ + case ClumpECompress: + trace(TraceLump, "loadclump decompress"); + 
unwhackinit(&uw); + nunc = unwhack(&uw, zb->data, cl->info.uncsize, buf, cl->info.size); + if(nunc != cl->info.uncsize){ + if(nunc < 0) + seterr(ECorrupt, "decompression of %llud failed: %s", aa, uw.err); + else + seterr(ECorrupt, "decompression of %llud gave partial block: %d/%d\n", aa, nunc, cl->info.uncsize); + freezblock(cb); + freezblock(zb); + return nil; + } + break; + case ClumpENone: + if(cl->info.size != cl->info.uncsize){ + seterr(ECorrupt, "loading clump: bad uncompressed size for uncompressed block %llud", aa); + freezblock(cb); + freezblock(zb); + return nil; + } + scoremem(bh, buf, cl->info.uncsize); + if(scorecmp(cl->info.score, bh) != 0) + seterr(ECorrupt, "pre-copy sha1 wrong at %s %llud: expected=%V got=%V", arena->name, aa, cl->info.score, bh); + memmove(zb->data, buf, cl->info.uncsize); + break; + default: + seterr(ECorrupt, "unknown encoding in loadlump %llud", aa); + freezblock(cb); + freezblock(zb); + return nil; + } + freezblock(cb); + + if(verify){ + trace(TraceLump, "loadclump verify"); + scoremem(bh, zb->data, cl->info.uncsize); + if(scorecmp(cl->info.score, bh) != 0){ + seterr(ECorrupt, "loading clump: corrupted at %s %llud; expected=%V got=%V", arena->name, aa, cl->info.score, bh); + freezblock(zb); + return nil; + } + if(vttypevalid(cl->info.type) < 0){ + seterr(ECorrupt, "loading lump at %s %llud: invalid lump type %d", arena->name, aa, cl->info.type); + freezblock(zb); + return nil; + } + } + + trace(TraceLump, "loadclump exit"); +/* + qlock(&stats.lock); + stats.clumpbreads += cl->info.size; + stats.clumpbuncomp += cl->info.uncsize; + qunlock(&stats.lock); +*/ + return zb; +} diff --git a/src/cmd/venti/srv/clumpstats.c b/src/cmd/venti/srv/clumpstats.c new file mode 100644 index 00000000..d2cfe251 --- /dev/null +++ b/src/cmd/venti/srv/clumpstats.c @@ -0,0 +1,127 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int count[VtMaxLumpSize][VtMaxType]; +Config conf; + +enum +{ + ClumpChunks = 32*1024 +}; + +static int 
+readarenainfo(Arena *arena) +{ + ClumpInfo *ci, *cis; + u32int clump; + int i, n, ok; + + if(arena->memstats.clumps) + fprint(2, "reading directory for arena=%s with %d entries\n", arena->name, arena->memstats.clumps); + + cis = MKN(ClumpInfo, ClumpChunks); + ok = 0; + for(clump = 0; clump < arena->memstats.clumps; clump += n){ + n = ClumpChunks; + + if(n > arena->memstats.clumps - clump) + n = arena->memstats.clumps - clump; + + if((i=readclumpinfos(arena, clump, cis, n)) != n){ + seterr(EOk, "arena directory read failed %d not %d: %r", i, n); + ok = -1; + break; + } + + for(i = 0; i < n; i++){ + ci = &cis[i]; + if(ci->type >= VtMaxType || ci->uncsize >= VtMaxLumpSize) { + fprint(2, "bad clump: %d: type = %d: size = %d\n", clump+i, ci->type, ci->uncsize); + continue; + } + count[ci->uncsize][ci->type]++; + } + } + free(cis); + if(ok < 0) + return TWID32; + return clump; +} + +/* + * Read every arena's clump directory with readarenainfo, which tallies + * count[uncsize][type], then print the clump total followed by one row + * per uncompressed size that occurred: size, total, and a count per type. + * Prints nothing if any directory read fails. + */ +static void +clumpstats(Index *ix) +{ + int ok; + ulong clumps; + u32int n; /* 32 bits wide, so readarenainfo's TWID32 failure value still compares equal where ulong is 64 bits */ + int i, j, t; + + ok = 0; + clumps = 0; + for(i = 0; i < ix->narenas; i++){ + n = readarenainfo(ix->arenas[i]); + if(n == TWID32){ + ok = -1; + break; + } + clumps += n; + } + + if(ok < 0) + return; + + print("clumps = %ld\n", clumps); + for(i=0; i<VtMaxLumpSize; i++) { + t = 0; + for(j=0; j<VtMaxType; j++) + t += count[i][j]; + if(t == 0) + continue; /* no clump of this size: skip the row */ + print("%d\t%d", i, t); + for(j=0; j<VtMaxType; j++) + print("\t%d", count[i][j]); + print("\n"); + } +} + + +/* + * Print the command synopsis and exit with status "usage". + */ +void +usage(void) +{ + fprint(2, "usage: clumpstats [-B blockcachesize] config\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + u32int bcmem; + + bcmem = 0; + + ARGBEGIN{ + case 'B': + bcmem = unittoull(ARGF()); + break; + default: + usage(); + break; + }ARGEND + + readonly = 1; + + if(argc != 1) + usage(); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + 
if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + clumpstats(mainindex); + + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/config.c b/src/cmd/venti/srv/config.c new file mode 100644 index 00000000..e6232d53 --- /dev/null +++ b/src/cmd/venti/srv/config.c @@ -0,0 +1,245 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +Index *mainindex; +int paranoid = 1; /* should verify hashes on disk read */ + +static ArenaPart *configarenas(char *file); +static ISect *configisect(char *file); +static Bloom *configbloom(char *file); + +int +initventi(char *file, Config *conf) +{ + statsinit(); + + if(file == nil){ + seterr(EOk, "no configuration file"); + return -1; + } + if(runconfig(file, conf) < 0){ + seterr(EOk, "can't initialize venti: %r"); + return -1; + } + mainindex = initindex(conf->index, conf->sects, conf->nsects); + if(mainindex == nil) + return -1; + mainindex->bloom = conf->bloom; + return 0; +} + +/* + * Check that s is a number, optionally followed by a single + * K, M, or G (either case) unit letter. + * Returns 0 if s is acceptable, -1 otherwise. + */ +static int +numok(char *s) +{ + char *p; + + strtoull(s, &p, 0); + if(p == s) + return -1; /* no digits at all */ + if(*p == 0) + return 0; /* plain number */ + if(p[1] == 0 && strchr("MmGgKk", *p)) + return 0; /* number plus one unit letter */ + return -1; /* anything else is trailing junk */ +} + +/* + * configs : + * | configs config + * config : "isect" filename + * | "arenas" filename + * | "bloom" filename + * | "index" name + * | "bcmem" num + * | "mem" num + * | "icmem" num + * | "queuewrites" + * | "httpaddr" address + * | "webroot" string + * | "addr" address + * + * '#' and \n delimit comments + */ +enum +{ + MaxArgs = 2 +}; +int +runconfig(char *file, Config *config) +{ + ArenaPart **av; + ISect **sv; + IFile f; + char *s, *line, *flds[MaxArgs + 1]; + int i, ok; + + if(readifile(&f, file) < 0) + return -1; + memset(config, 0, sizeof *config); + config->mem = 0xFFFFFFFFUL; + ok = -1; + line = nil; + for(;;){ + s = ifileline(&f); + if(s == nil){ + ok = 0; + break; + } + line = estrdup(s); + i = getfields(s, flds, MaxArgs + 1, 1, " \t\r"); + if(i == 2 && strcmp(flds[0], "isect") == 0){ + sv = MKN(ISect*, config->nsects + 1); + for(i = 0; i < config->nsects; 
i++) + sv[i] = config->sects[i]; + free(config->sects); + config->sects = sv; + config->sects[config->nsects] = configisect(flds[1]); + if(config->sects[config->nsects] == nil) + break; + config->nsects++; + }else if(i == 2 && strcmp(flds[0], "arenas") == 0){ + av = MKN(ArenaPart*, config->naparts + 1); + for(i = 0; i < config->naparts; i++) + av[i] = config->aparts[i]; + free(config->aparts); + config->aparts = av; + config->aparts[config->naparts] = configarenas(flds[1]); + if(config->aparts[config->naparts] == nil) + break; + config->naparts++; + }else if(i == 2 && strcmp(flds[0], "bloom") == 0){ + if(config->bloom){ + seterr(EAdmin, "duplicate bloom lines in configuration file %s", file); + break; + } + if((config->bloom = configbloom(flds[1])) == nil) + break; + }else if(i == 2 && strcmp(flds[0], "index") == 0){ + if(nameok(flds[1]) < 0){ + seterr(EAdmin, "illegal index name %s in config file %s", flds[1], file); + break; + } + if(config->index != nil){ + seterr(EAdmin, "duplicate indices in config file %s", file); + break; + } + config->index = estrdup(flds[1]); + }else if(i == 2 && strcmp(flds[0], "bcmem") == 0){ + if(numok(flds[1]) < 0){ + seterr(EAdmin, "illegal size %s in config file %s", + flds[1], file); + break; + } + if(config->bcmem != 0){ + seterr(EAdmin, "duplicate bcmem lines in config file %s", file); + break; + } + config->bcmem = unittoull(flds[1]); + }else if(i == 2 && strcmp(flds[0], "mem") == 0){ + if(numok(flds[1]) < 0){ + seterr(EAdmin, "illegal size %s in config file %s", + flds[1], file); + break; + } + if(config->mem != 0xFFFFFFFFUL){ + seterr(EAdmin, "duplicate mem lines in config file %s", file); + break; + } + config->mem = unittoull(flds[1]); + }else if(i == 2 && strcmp(flds[0], "icmem") == 0){ + if(numok(flds[1]) < 0){ + seterr(EAdmin, "illegal size %s in config file %s", + flds[1], file); + break; + } + if(config->icmem != 0){ + seterr(EAdmin, "duplicate icmem lines in config file %s", file); + break; + } + config->icmem = 
unittoull(flds[1]); + }else if(i == 1 && strcmp(flds[0], "queuewrites") == 0){ + config->queuewrites = 1; + }else if(i == 2 && strcmp(flds[0], "httpaddr") == 0){ + if(config->haddr){ + seterr(EAdmin, "duplicate httpaddr lines in configuration file %s", file); + break; + } + config->haddr = estrdup(flds[1]); + }else if(i == 2 && strcmp(flds[0], "webroot") == 0){ + if(config->webroot){ + seterr(EAdmin, "duplicate webroot lines in configuration file %s", file); + break; + } + config->webroot = estrdup(flds[1]); + }else if(i == 2 && strcmp(flds[0], "addr") == 0){ + if(config->vaddr){ + seterr(EAdmin, "duplicate addr lines in configuration file %s", file); + break; + } + config->vaddr = estrdup(flds[1]); + }else{ + seterr(EAdmin, "illegal line '%s' in configuration file %s", line, file); + break; + } + free(line); + line = nil; + } + free(line); + freeifile(&f); + if(ok < 0){ + free(config->sects); + config->sects = nil; + free(config->aparts); + config->aparts = nil; + } + return ok; +} + +static ISect* +configisect(char *file) +{ + Part *part; + ISect *is; + + if(0) fprint(2, "configure index section in %s\n", file); + + part = initpart(file, ORDWR|ODIRECT); + if(part == nil) + return nil; + is = initisect(part); + if(is == nil) + werrstr("%s: %r", file); + return is; +} + +static ArenaPart* +configarenas(char *file) +{ + ArenaPart *ap; + Part *part; + + if(0) fprint(2, "configure arenas in %s\n", file); + part = initpart(file, ORDWR|ODIRECT); + if(part == nil) + return nil; + ap = initarenapart(part); + if(ap == nil) + werrstr("%s: %r", file); + return ap; +} + +static Bloom* +configbloom(char *file) +{ + Bloom *b; + Part *part; + + if(0) fprint(2, "configure bloom in %s\n", file); + part = initpart(file, ORDWR|ODIRECT); + if(part == nil) + return nil; + b = readbloom(part); + if(b == nil) + werrstr("%s: %r", file); + return b; +} + diff --git a/src/cmd/venti/srv/conv.c b/src/cmd/venti/srv/conv.c new file mode 100644 index 00000000..13afc7d2 --- /dev/null +++ 
b/src/cmd/venti/srv/conv.c @@ -0,0 +1,632 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +/* + * disk structure conversion routines + */ +#define U8GET(p) ((p)[0]) +#define U16GET(p) (((p)[0]<<8)|(p)[1]) +#define U32GET(p) ((u32int)(((p)[0]<<24)|((p)[1]<<16)|((p)[2]<<8)|(p)[3])) +#define U64GET(p) (((u64int)U32GET(p)<<32)|(u64int)U32GET((p)+4)) + +#define U8PUT(p,v) (p)[0]=(v)&0xFF +#define U16PUT(p,v) (p)[0]=((v)>>8)&0xFF;(p)[1]=(v)&0xFF +#define U32PUT(p,v) (p)[0]=((v)>>24)&0xFF;(p)[1]=((v)>>16)&0xFF;(p)[2]=((v)>>8)&0xFF;(p)[3]=(v)&0xFF +#define U64PUT(p,v,t32) t32=(v)>>32;U32PUT(p,t32);t32=(v);U32PUT((p)+4,t32) + +static struct { + u32int m; + char *s; +} magics[] = { + ArenaPartMagic, "ArenaPartMagic", + ArenaHeadMagic, "ArenaHeadMagic", + ArenaMagic, "ArenaMagic", + ISectMagic, "ISectMagic", + BloomMagic, "BloomMagic", +}; + +static char* +fmtmagic(char *s, u32int m) +{ + int i; + + for(i=0; i<nelem(magics); i++) + if(magics[i].m == m) + return magics[i].s; + sprint(s, "0x%08ux", m); + return s; +} + +u32int +unpackmagic(u8int *buf) +{ + return U32GET(buf); +} + +void +packmagic(u32int magic, u8int *buf) +{ + U32PUT(buf, magic); +} + +int +unpackarenapart(ArenaPart *ap, u8int *buf) +{ + u8int *p; + u32int m; + char fbuf[20]; + + p = buf; + + m = U32GET(p); + if(m != ArenaPartMagic){ + seterr(ECorrupt, "arena set has wrong magic number: %s expected ArenaPartMagic (%lux)", fmtmagic(fbuf, m), ArenaPartMagic); + return -1; + } + p += U32Size; + ap->version = U32GET(p); + p += U32Size; + ap->blocksize = U32GET(p); + p += U32Size; + ap->arenabase = U32GET(p); + p += U32Size; + + if(buf + ArenaPartSize != p) + sysfatal("unpackarenapart unpacked wrong amount"); + + return 0; +} + +int +packarenapart(ArenaPart *ap, u8int *buf) +{ + u8int *p; + + p = buf; + + U32PUT(p, ArenaPartMagic); + p += U32Size; + U32PUT(p, ap->version); + p += U32Size; + U32PUT(p, ap->blocksize); + p += U32Size; + U32PUT(p, ap->arenabase); + p += U32Size; + + if(buf + ArenaPartSize 
!= p) + sysfatal("packarenapart packed wrong amount"); + + return 0; +} + +int +unpackarena(Arena *arena, u8int *buf) +{ + int sz; + u8int *p; + u32int m; + char fbuf[20]; + + p = buf; + + m = U32GET(p); + if(m != ArenaMagic){ + seterr(ECorrupt, "arena has wrong magic number: %s expected ArenaMagic (%lux)", fmtmagic(fbuf, m), ArenaMagic); + return -1; + } + p += U32Size; + arena->version = U32GET(p); + p += U32Size; + namecp(arena->name, (char*)p); + p += ANameSize; + arena->diskstats.clumps = U32GET(p); + p += U32Size; + arena->diskstats.cclumps = U32GET(p); + p += U32Size; + arena->ctime = U32GET(p); + p += U32Size; + arena->wtime = U32GET(p); + p += U32Size; + if(arena->version == ArenaVersion5){ + arena->clumpmagic = U32GET(p); + p += U32Size; + } + arena->diskstats.used = U64GET(p); + p += U64Size; + arena->diskstats.uncsize = U64GET(p); + p += U64Size; + arena->diskstats.sealed = U8GET(p); + p += U8Size; + + arena->memstats = arena->diskstats; + + switch(arena->version){ + case ArenaVersion4: + sz = ArenaSize4; + arena->clumpmagic = _ClumpMagic; + break; + case ArenaVersion5: + sz = ArenaSize5; + break; + default: + seterr(ECorrupt, "arena has bad version number %d", arena->version); + return -1; + } + if(buf + sz != p) + sysfatal("unpackarena unpacked wrong amount"); + + return 0; +} + +int +packarena(Arena *arena, u8int *buf) +{ + int sz; + u8int *p; + u32int t32; + + switch(arena->version){ + case ArenaVersion4: + sz = ArenaSize4; + if(arena->clumpmagic != _ClumpMagic) + fprint(2, "warning: writing old arena tail loses clump magic 0x%lux != 0x%lux\n", + (ulong)arena->clumpmagic, (ulong)_ClumpMagic); + break; + case ArenaVersion5: + sz = ArenaSize5; + break; + default: + sysfatal("packarena unknown version %d", arena->version); + return -1; + } + + p = buf; + + U32PUT(p, ArenaMagic); + p += U32Size; + U32PUT(p, arena->version); + p += U32Size; + namecp((char*)p, arena->name); + p += ANameSize; + U32PUT(p, arena->diskstats.clumps); + p += U32Size; + 
U32PUT(p, arena->diskstats.cclumps); + p += U32Size; + U32PUT(p, arena->ctime); + p += U32Size; + U32PUT(p, arena->wtime); + p += U32Size; + if(arena->version == ArenaVersion5){ + U32PUT(p, arena->clumpmagic); + p += U32Size; + } + U64PUT(p, arena->diskstats.used, t32); + p += U64Size; + U64PUT(p, arena->diskstats.uncsize, t32); + p += U64Size; + U8PUT(p, arena->diskstats.sealed); + p += U8Size; + + if(buf + sz != p) + sysfatal("packarena packed wrong amount"); + + return 0; +} + +int +unpackarenahead(ArenaHead *head, u8int *buf) +{ + u8int *p; + u32int m; + int sz; + + p = buf; + + m = U32GET(p); + /* XXX check magic! */ + + p += U32Size; + head->version = U32GET(p); + p += U32Size; + namecp(head->name, (char*)p); + p += ANameSize; + head->blocksize = U32GET(p); + p += U32Size; + head->size = U64GET(p); + p += U64Size; + if(head->version == ArenaVersion5){ + head->clumpmagic = U32GET(p); + p += U32Size; + } + + switch(head->version){ + case ArenaVersion4: + sz = ArenaHeadSize4; + head->clumpmagic = _ClumpMagic; + break; + case ArenaVersion5: + sz = ArenaHeadSize5; + break; + default: + seterr(ECorrupt, "arena head has unexpected version %d", head->version); + return -1; + } + + if(buf + sz != p) + sysfatal("unpackarenahead unpacked wrong amount"); + + return 0; +} + +int +packarenahead(ArenaHead *head, u8int *buf) +{ + u8int *p; + int sz; + u32int t32; + + switch(head->version){ + case ArenaVersion4: + sz = ArenaHeadSize4; + if(head->clumpmagic != _ClumpMagic) + fprint(2, "warning: writing old arena header loses clump magic 0x%lux != 0x%lux\n", + (ulong)head->clumpmagic, (ulong)_ClumpMagic); + break; + case ArenaVersion5: + sz = ArenaHeadSize5; + break; + default: + sysfatal("packarenahead unknown version %d", head->version); + return -1; + } + + p = buf; + + U32PUT(p, ArenaHeadMagic); + p += U32Size; + U32PUT(p, head->version); + p += U32Size; + namecp((char*)p, head->name); + p += ANameSize; + U32PUT(p, head->blocksize); + p += U32Size; + U64PUT(p, head->size, 
t32); + p += U64Size; + if(head->version == ArenaVersion5){ + U32PUT(p, head->clumpmagic); + p += U32Size; + } + if(buf + sz != p) + sysfatal("packarenahead packed wrong amount"); + + return 0; +} + +static int +checkclump(Clump *w) +{ + if(w->encoding == ClumpENone){ + if(w->info.size != w->info.uncsize){ + seterr(ECorrupt, "uncompressed wad size mismatch"); + return -1; + } + }else if(w->encoding == ClumpECompress){ + if(w->info.size >= w->info.uncsize){ + seterr(ECorrupt, "compressed lump has inconsistent block sizes %d %d", w->info.size, w->info.uncsize); + return -1; + } + }else{ + seterr(ECorrupt, "clump has illegal encoding"); + return -1; + } + + return 0; +} + +int +unpackclump(Clump *c, u8int *buf, u32int cmagic) +{ + u8int *p; + u32int magic; + + p = buf; + magic = U32GET(p); + if(magic != cmagic){ + seterr(ECorrupt, "clump has bad magic number=%#8.8ux != %#8.8ux", magic, cmagic); + return -1; + } + p += U32Size; + + c->info.type = vtfromdisktype(U8GET(p)); + p += U8Size; + c->info.size = U16GET(p); + p += U16Size; + c->info.uncsize = U16GET(p); + p += U16Size; + scorecp(c->info.score, p); + p += VtScoreSize; + + c->encoding = U8GET(p); + p += U8Size; + c->creator = U32GET(p); + p += U32Size; + c->time = U32GET(p); + p += U32Size; + + if(buf + ClumpSize != p) + sysfatal("unpackclump unpacked wrong amount"); + + return checkclump(c); +} + +int +packclump(Clump *c, u8int *buf, u32int magic) +{ + u8int *p; + + p = buf; + U32PUT(p, magic); + p += U32Size; + + U8PUT(p, vttodisktype(c->info.type)); + p += U8Size; + U16PUT(p, c->info.size); + p += U16Size; + U16PUT(p, c->info.uncsize); + p += U16Size; + scorecp(p, c->info.score); + p += VtScoreSize; + + U8PUT(p, c->encoding); + p += U8Size; + U32PUT(p, c->creator); + p += U32Size; + U32PUT(p, c->time); + p += U32Size; + + if(buf + ClumpSize != p) + sysfatal("packclump packed wrong amount"); + + return checkclump(c); +} + +void +unpackclumpinfo(ClumpInfo *ci, u8int *buf) +{ + u8int *p; + + p = buf; + ci->type = 
vtfromdisktype(U8GET(p)); + p += U8Size; + ci->size = U16GET(p); + p += U16Size; + ci->uncsize = U16GET(p); + p += U16Size; + scorecp(ci->score, p); + p += VtScoreSize; + + if(buf + ClumpInfoSize != p) + sysfatal("unpackclumpinfo unpacked wrong amount"); +} + +void +packclumpinfo(ClumpInfo *ci, u8int *buf) +{ + u8int *p; + + p = buf; + U8PUT(p, vttodisktype(ci->type)); + p += U8Size; + U16PUT(p, ci->size); + p += U16Size; + U16PUT(p, ci->uncsize); + p += U16Size; + scorecp(p, ci->score); + p += VtScoreSize; + + if(buf + ClumpInfoSize != p) + sysfatal("packclumpinfo packed wrong amount"); +} + +int +unpackisect(ISect *is, u8int *buf) +{ + u8int *p; + u32int m; + char fbuf[20]; + + p = buf; + + + m = U32GET(p); + if(m != ISectMagic){ + seterr(ECorrupt, "index section has wrong magic number: %s expected ISectMagic (%lux)", + fmtmagic(fbuf, m), ISectMagic); + return -1; + } + p += U32Size; + is->version = U32GET(p); + p += U32Size; + namecp(is->name, (char*)p); + p += ANameSize; + namecp(is->index, (char*)p); + p += ANameSize; + is->blocksize = U32GET(p); + p += U32Size; + is->blockbase = U32GET(p); + p += U32Size; + is->blocks = U32GET(p); + p += U32Size; + is->start = U32GET(p); + p += U32Size; + is->stop = U32GET(p); + p += U32Size; + if(buf + ISectSize1 != p) + sysfatal("unpackisect unpacked wrong amount"); + is->bucketmagic = 0; + if(is->version == ISectVersion2){ + is->bucketmagic = U32GET(p); + p += U32Size; + if(buf + ISectSize2 != p) + sysfatal("unpackisect unpacked wrong amount"); + } + + return 0; +} + +int +packisect(ISect *is, u8int *buf) +{ + u8int *p; + + p = buf; + + U32PUT(p, ISectMagic); + p += U32Size; + U32PUT(p, is->version); + p += U32Size; + namecp((char*)p, is->name); + p += ANameSize; + namecp((char*)p, is->index); + p += ANameSize; + U32PUT(p, is->blocksize); + p += U32Size; + U32PUT(p, is->blockbase); + p += U32Size; + U32PUT(p, is->blocks); + p += U32Size; + U32PUT(p, is->start); + p += U32Size; + U32PUT(p, is->stop); + p += U32Size; + 
if(buf + ISectSize1 != p) + sysfatal("packisect packed wrong amount"); + if(is->version == ISectVersion2){ + U32PUT(p, is->bucketmagic); + p += U32Size; + if(buf + ISectSize2 != p) + sysfatal("packisect packed wrong amount"); + } + + return 0; +} + +void +unpackientry(IEntry *ie, u8int *buf) +{ + u8int *p; + + p = buf; + + scorecp(ie->score, p); + p += VtScoreSize; + ie->wtime = U32GET(p); + p += U32Size; + ie->train = U16GET(p); + p += U16Size; + ie->ia.addr = U64GET(p); +if(ie->ia.addr>>56) print("%.8H => %llux\n", p, ie->ia.addr); + p += U64Size; + ie->ia.size = U16GET(p); + p += U16Size; + if(p - buf != IEntryTypeOff) + sysfatal("unpackientry bad IEntryTypeOff amount"); + ie->ia.type = vtfromdisktype(U8GET(p)); + p += U8Size; + ie->ia.blocks = U8GET(p); + p += U8Size; + + if(p - buf != IEntrySize) + sysfatal("unpackientry unpacked wrong amount"); +} + +void +packientry(IEntry *ie, u8int *buf) +{ + u32int t32; + u8int *p; + + p = buf; + + scorecp(p, ie->score); + p += VtScoreSize; + U32PUT(p, ie->wtime); + p += U32Size; + U16PUT(p, ie->train); + p += U16Size; + U64PUT(p, ie->ia.addr, t32); + p += U64Size; + U16PUT(p, ie->ia.size); + p += U16Size; + U8PUT(p, vttodisktype(ie->ia.type)); + p += U8Size; + U8PUT(p, ie->ia.blocks); + p += U8Size; + + if(p - buf != IEntrySize) + sysfatal("packientry packed wrong amount"); +} + +void +unpackibucket(IBucket *b, u8int *buf, u32int magic) +{ + b->n = U16GET(buf); + b->data = buf + IBucketSize; + if(magic && magic != U32GET(buf+U16Size)) + b->n = 0; +} + +void +packibucket(IBucket *b, u8int *buf, u32int magic) +{ + U16PUT(buf, b->n); + U32PUT(buf+U16Size, magic); +} + +void +packbloomhead(Bloom *b, u8int *buf) +{ + u8int *p; + + p = buf; + U32PUT(p, BloomMagic); + U32PUT(p+4, BloomVersion); + U32PUT(p+8, b->nhash); + U32PUT(p+12, b->size); +} + +int +unpackbloomhead(Bloom *b, u8int *buf) +{ + u8int *p; + u32int m; + char fbuf[20]; + + p = buf; + + m = U32GET(p); + if(m != BloomMagic){ + seterr(ECorrupt, "bloom filter has 
wrong magic number: %s expected BloomMagic (%lux)", fmtmagic(fbuf, m), (ulong)BloomMagic); + return -1; + } + p += U32Size; + + m = U32GET(p); + if(m != BloomVersion){ + seterr(ECorrupt, "bloom filter has wrong version %ud expected %ud", (uint)m, (uint)BloomVersion); + return -1; + } + p += U32Size; + + b->nhash = U32GET(p); + p += U32Size; + + b->size = U32GET(p); + p += U32Size; + + /* sanity check: the fields consumed above must add up to BloomHeadSize */ + if(buf + BloomHeadSize != p) + sysfatal("unpackbloomhead unpacked wrong amount"); + + return 0; +} diff --git a/src/cmd/venti/srv/dat.h b/src/cmd/venti/srv/dat.h new file mode 100644 index 00000000..5f6d1a3f --- /dev/null +++ b/src/cmd/venti/srv/dat.h @@ -0,0 +1,718 @@ +typedef struct Config Config; +typedef struct AMap AMap; +typedef struct AMapN AMapN; +typedef struct Arena Arena; +typedef struct AState AState; +typedef struct ArenaHead ArenaHead; +typedef struct ArenaPart ArenaPart; +typedef struct ArenaTail ArenaTail; +typedef struct ATailStats ATailStats; +typedef struct CIBlock CIBlock; +typedef struct Clump Clump; +typedef struct ClumpInfo ClumpInfo; +typedef struct Graph Graph; +typedef struct IAddr IAddr; +typedef struct IBucket IBucket; +typedef struct IEStream IEStream; +typedef struct IEntry IEntry; +typedef struct IFile IFile; +typedef struct ISect ISect; +typedef struct Index Index; +typedef struct Lump Lump; +typedef struct DBlock DBlock; +typedef struct Part Part; +typedef struct Statbin Statbin; +typedef struct Statdesc Statdesc; +typedef struct Stats Stats; +typedef struct ZBlock ZBlock; +typedef struct Round Round; +typedef struct Bloom Bloom; + +#define TWID32 ((u32int)~(u32int)0) +#define TWID64 ((u64int)~(u64int)0) +#define TWID8 ((u8int)~(u8int)0) + +enum +{ + ABlockLog = 9, /* log2(512), the quantum for reading arenas */ + ANameSize = 64, + MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */ + MaxIoSize = 64*1024, /* max. 
allowed size for a disk io operation */ + PartBlank = 256*1024, /* untouched section at beginning of partition */ + HeadSize = 512, /* size of a header after PartBlank */ + MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */ + IndexBase = 1024*1024, /* initial address to use in an index */ + MaxIo = 64*1024, /* max size of a single read or write operation */ + ICacheBits = 16, /* default bits for indexing icache */ + ICacheDepth = 4, /* default depth of an icache hash chain */ + MaxAMap = 2*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */ + + /* + * return codes from syncarena + */ + SyncDataErr = 1 << 0, /* problem reading the clump data */ + SyncCIErr = 1 << 1, /* found erroneous clump directory entries */ + SyncCIZero = 1 << 2, /* found unwritten clump directory entries */ + SyncFixErr = 1 << 3, /* error writing fixed data */ + SyncHeader = 1 << 4, /* altered header fields */ + + /* + * error severity + */ + EOk = 0, /* error expected in normal operation */ + EStrange, /* strange error that should be logged */ + ECorrupt, /* corrupted data found in arenas */ + EICorrupt, /* corrupted data found in index */ + EAdmin, /* should be brought to administrators' attention */ + ECrash, /* really bad internal error */ + EBug, /* a limitation which should be fixed */ + EInconsist, /* inconsistencies between index and arena */ + EMax, + + /* + * internal disk formats for the venti archival storage system + */ + /* + * magic numbers on disk + */ + _ClumpMagic = 0xd15cb10c, /* clump header, deprecated */ + ClumpFreeMagic = 0, /* free clump; terminates active clump log */ + + ArenaPartMagic = 0xa9e4a5e7, /* arena partition header */ + ArenaMagic = 0xf2a14ead, /* arena trailer */ + ArenaHeadMagic = 0xd15c4ead, /* arena header */ + + BloomMagic = 0xb1004ead, /* bloom filter header */ + BloomMaxHash = 32, + + ISectMagic = 0xd15c5ec7, /* index header */ + + ArenaPartVersion = 3, + ArenaVersion4 = 4, + ArenaVersion5 = 5, + BloomVersion = 1, + 
IndexVersion = 1, + ISectVersion1 = 1, + ISectVersion2 = 2, + + /* + * encodings of clumps on disk + */ + ClumpEErr = 0, /* can't happen */ + ClumpENone, /* plain */ + ClumpECompress, /* compressed */ + ClumpEMax, + + /* + * sizes in bytes on disk + */ + U8Size = 1, + U16Size = 2, + U32Size = 4, + U64Size = 8, + + ArenaPartSize = 4 * U32Size, + ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size, + ArenaSize5 = ArenaSize4 + U32Size, + ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize, + ArenaHeadSize5 = ArenaHeadSize4 + U32Size, + BloomHeadSize = 4 * U32Size, + ISectSize1 = 7 * U32Size + 2 * ANameSize, + ISectSize2 = ISectSize1 + U32Size, + ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize, + ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size, + MaxBloomSize = 1<<(32-3), /* 2^32 bits */ + MaxBloomHash = 32, /* bits per score */ + /* + * BUG - The various block copies that manipulate entry buckets + * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40, + * so that everything is word-aligned. Buildindex is actually cpu-bound + * by the (byte at a time) copying in qsort. 
+ */ + IBucketSize = U32Size + U16Size, + IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize, + IEntryTypeOff = VtScoreSize + U64Size + U32Size + 2 * U16Size, + + MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog, + + /* + * dirty flags - order controls disk write order + */ + DirtyArena = 1, + DirtyArenaCib, + DirtyArenaTrailer, + DirtyMax, + + VentiZZZZZZZZ +}; + +extern char TraceDisk[]; +extern char TraceLump[]; +extern char TraceBlock[]; +extern char TraceProc[]; +extern char TraceWork[]; +extern char TraceQuiet[]; +extern char TraceRpc[]; + +/* + * results of parsing and initializing a config file + */ +struct Config +{ + char *index; /* name of the index to initialize */ + int naparts; /* arena partitions initialized */ + ArenaPart **aparts; + int nsects; /* index sections initialized */ + ISect **sects; + Bloom *bloom; /* bloom filter */ + u32int bcmem; + u32int mem; + u32int icmem; + int queuewrites; + char* haddr; + char* vaddr; + char* webroot; +}; + +/* + * a Part is the low level interface to files or disks. + * there are two main types of partitions + * arena partitions, which hold some number of arenas, each in a sub-partition. + * index partitions, which have only one subpartition. 
+ */ +struct Part +{ + int fd; /* rock for accessing the disk */ + int mode; + u64int offset; + u64int size; /* size of the partiton */ + u32int blocksize; /* block size for reads and writes */ + u32int fsblocksize; /* minimum file system block size */ + char *name; + char *filename; + Channel *writechan; /* chan[dcache.nblock](DBlock*) */ +}; + +/* + * a cached block from the partition + * yuck -- most of this is internal structure for the cache + * all other routines should only use data + */ +struct DBlock +{ + u8int *data; + + Part *part; /* partition in which cached */ + u64int addr; /* base address on the partition */ + u32int size; /* amount of data available, not amount allocated; should go away */ + u32int mode; + u32int dirty; + u32int dirtying; + DBlock *next; /* doubly linked hash chains */ + DBlock *prev; + u32int heap; /* index in heap table */ + u32int used; /* last reference times */ + u32int used2; + u32int ref; /* reference count */ + RWLock lock; /* for access to data only */ + Channel *writedonechan; + void* chanbuf[1]; /* buffer for the chan! 
*/ +}; + +/* + * a cached block from the partition + * yuck -- most of this is internal structure for the cache + * all other routines should only use data + * double yuck -- this is mostly the same as a DBlock + */ +struct Lump +{ + Packet *data; + + Part *part; /* partition in which cached */ + u8int score[VtScoreSize]; /* score of packet */ + u8int type; /* type of packet */ + u32int size; /* amount of data allocated to hold packet */ + Lump *next; /* doubly linked hash chains */ + Lump *prev; + u32int heap; /* index in heap table */ + u32int used; /* last reference times */ + u32int used2; + u32int ref; /* reference count */ + QLock lock; /* for access to data only */ +}; + +/* + * mapping between names and address ranges + */ +struct AMap +{ + u64int start; + u64int stop; + char name[ANameSize]; +}; + +/* + * an AMap along with a length + */ +struct AMapN +{ + int n; + AMap *map; +}; + +/* + * an ArenaPart is a partition made up of Arenas + * it exists because most os's don't support many partitions, + * and we want to have many different Arenas + */ +struct ArenaPart +{ + Part *part; + u64int size; /* size of underlying partition, rounded down to blocks */ + Arena **arenas; + u32int tabbase; /* base address of arena table on disk */ + u32int tabsize; /* max. bytes in arena table */ + + /* + * fields stored on disk + */ + u32int version; + u32int blocksize; /* "optimal" block size for reads and writes */ + u32int arenabase; /* base address of first arena */ + + /* + * stored in the arena mapping table on disk + */ + AMap *map; + int narenas; +}; + +/* + * info about one block in the clump info cache + */ +struct CIBlock +{ + u32int block; /* blocks in the directory */ + int offset; /* offsets of one clump in the data */ + DBlock *data; +}; + +/* + * Statistics kept in the tail. 
+ */ +struct ATailStats +{ + u32int clumps; /* number of clumps */ + u32int cclumps; /* number of compressed clumps */ + u64int used; + u64int uncsize; + u8int sealed; +}; + +/* + * Arena state - represents a point in the data log + */ +struct AState +{ + Arena *arena; + u64int aa; /* index address */ + ATailStats stats; +}; + +/* + * an Arena is a log of Clumps, preceeded by an ArenaHeader, + * and followed by a Arena, each in one disk block. + * struct on disk is not always up to date, but should be self-consistent. + * to sync after reboot, follow clumps starting at used until ClumpFreeMagic if found. + * <struct name="Arena" type="Arena *"> + * <field name="name" val="s->name" type="AName"/> + * <field name="version" val="s->version" type="U32int"/> + * <field name="partition" val="s->part->name" type="AName"/> + * <field name="blocksize" val="s->blocksize" type="U32int"/> + * <field name="start" val="s->base" type="U64int"/> + * <field name="stop" val="s->base+2*s->blocksize" type="U64int"/> + * <field name="created" val="s->ctime" type="U32int"/> + * <field name="modified" val="s->wtime" type="U32int"/> + * <field name="sealed" val="s->sealed" type="Sealed"/> + * <field name="score" val="s->score" type="Score"/> + * <field name="clumps" val="s->clumps" type="U32int"/> + * <field name="compressedclumps" val="s->cclumps" type="U32int"/> + * <field name="data" val="s->uncsize" type="U64int"/> + * <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/> + * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/> + * </struct> + */ +struct Arena +{ + QLock lock; /* lock for arena fields, writing to disk */ + Part *part; /* partition in which arena lives */ + int blocksize; /* size of block to read or write */ + u64int base; /* base address on disk */ + u64int size; /* total space in the arena */ + u64int limit; /* storage limit for clumps */ + u8int score[VtScoreSize]; /* score of the entire sealed & summed arena 
*/ + + int clumpmax; /* ClumpInfos per block */ + AState mem; + int inqueue; + DigestState sha1; + + /* + * fields stored on disk + */ + u32int version; + char name[ANameSize]; /* text label */ + ATailStats memstats; /* the copy that printarena and findscore read */ + ATailStats diskstats; /* NOTE(review): presumably the statistics as last written to disk -- confirm */ + u32int ctime; /* first time a block was written */ + u32int wtime; /* last time a block was written */ + u32int clumpmagic; +}; + +/* + * redundant storage of some fields at the beginning of each arena + */ +struct ArenaHead +{ + u32int version; + char name[ANameSize]; + u32int blocksize; + u64int size; + u32int clumpmagic; +}; + +/* + * most interesting meta information for a clump. + * stored in each clump's header and in the Arena's directory, + * stored in reverse order just prior to the arena trailer + */ +struct ClumpInfo +{ + u8int type; + u16int size; /* size of disk data, not including header */ + u16int uncsize; /* size of uncompressed data */ + u8int score[VtScoreSize]; /* score of the uncompressed data only */ +}; + +/* + * header for an immutable clump of data + */ +struct Clump +{ + ClumpInfo info; + u8int encoding; + u32int creator; /* initial client which wrote the block */ + u32int time; /* creation at gmt seconds since 1/1/1970 */ +}; + +/* + * index of all clumps according to their score + * this is just a wrapper to tie together the index sections + * <struct name="Index" type="Index *"> + * <field name="name" val="s->name" type="AName"/> + * <field name="version" val="s->version" type="U32int"/> + * <field name="blocksize" val="s->blocksize" type="U32int"/> + * <field name="tabsize" val="s->tabsize" type="U32int"/> + * <field name="buckets" val="s->buckets" type="U32int"/> + * <field name="buckdiv" val="s->div" type="U32int"/> + * <field name="bitblocks" val="s->div" type="U32int"/> + * <field name="maxdepth" val="s->div" type="U32int"/> + * <field name="bitkeylog" val="s->div" type="U32int"/> + * <field name="bitkeymask" val="s->div" type="U32int"/> + * <array name="sect" val="&s->smap[i]"
elems="s->nsects" type="Amap"/> + * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/> + * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/> + * </struct> + * <struct name="Amap" type="AMap *"> + * <field name="name" val="s->name" type="AName"/> + * <field name="start" val="s->start" type="U64int"/> + * <field name="stop" val="s->stop" type="U64int"/> + * </struct> + */ +struct Index +{ + u32int div; /* divisor for mapping score to bucket */ + u32int buckets; /* last bucket used in disk hash table */ + u32int blocksize; + u32int tabsize; /* max. bytes in index config */ + u32int bitblocks; //XXX remove these fields + u32int maxdepth; + u32int bitkeylog; + u32int bitkeymask; + + int mapalloc; /* first arena to check when adding a lump */ + Arena **arenas; /* arenas in the mapping */ + ISect **sects; /* sections which hold the buckets */ + Bloom *bloom; /* bloom filter */ + + /* + * fields stored in config file + */ + u32int version; + char name[ANameSize]; /* text label */ + int nsects; /* number of entries in sects and smap */ + AMap *smap; /* mapping of buckets to index sections */ + int narenas; /* number of entries in arenas and amap */ + AMap *amap; /* mapping from index addresses to arenas */ +}; + +/* + * one part of the bucket storage for an index. + * the index blocks are sequentially allocated + * across all of the sections. + */ +struct ISect +{ + Part *part; + int blocklog; /* log2(blocksize) */ + int buckmax; /* max. entries in an index bucket */ + u32int tabbase; /* base address of index config table on disk */ + u32int tabsize; /* max.
bytes in index config */ + Channel *writechan; + Channel *writedonechan; + + /* + * fields stored on disk + */ + u32int version; + u32int bucketmagic; + char name[ANameSize]; /* text label */ + char index[ANameSize]; /* index owning the section */ + u32int blocksize; /* size of hash buckets in index */ + u32int blockbase; /* address of start of on disk index table */ + u32int blocks; /* total blocks on disk; some may be unused */ + u32int start; /* first bucket in this section */ + u32int stop; /* limit of buckets in this section */ +}; + +/* + * externally interesting part of an IEntry + */ +struct IAddr +{ + u64int addr; + u16int size; /* uncompressed size */ + u8int type; /* type of block */ + u8int blocks; /* arena io quanta for Clump + data */ +}; + +/* + * entries in the index + * kept in IBuckets in the disk index table, + * cached in the memory ICache. + */ +struct IEntry +{ + u8int score[VtScoreSize]; + IEntry *next; /* next in hash chain */ + IEntry *nextdirty; /* next in dirty chain */ + u32int wtime; /* last write time */ + u16int train; /* relative train containing the most recent ref; 0 if no ref, 1 if in same car */ + u8int rac; /* read ahead count */ + u8int dirty; /* is dirty */ + IAddr ia; +}; + +/* + * buckets in the on disk index table + */ +struct IBucket +{ + u16int n; /* number of active indices */ + u32int buck; /* used by buildindex/checkindex only */ + u8int *data; +}; + +/* + * temporary buffers used by individual threads + */ +struct ZBlock +{ + u32int len; + u32int _size; + u8int *data; + u8int *free; +}; + +/* + * simple input buffer for a '\0' terminated text file + */ +struct IFile +{ + char *name; /* name of the file */ + ZBlock *b; /* entire contents of file */ + u32int pos; /* current position in the file */ +}; + +/* + * description of one statistic; + * the statdesc array declared after the enum has one entry per Stat* constant + */ +struct Statdesc +{ + char *name; + ulong max; +}; + +/* keep in sync with stats.c:/statdesc and httpd.c:/graphname*/ +enum +{ + StatRpcTotal, + StatRpcRead, + StatRpcReadOk, + StatRpcReadFail, + StatRpcReadBytes, +
StatRpcReadTime, + StatRpcReadCached, + StatRpcReadCachedTime, + StatRpcReadUncached, + StatRpcReadUncachedTime, + StatRpcWrite, + StatRpcWriteNew, + StatRpcWriteOld, + StatRpcWriteFail, + StatRpcWriteBytes, + StatRpcWriteTime, + StatRpcWriteNewTime, + StatRpcWriteOldTime, + + StatLcacheHit, + StatLcacheMiss, + StatLcacheRead, + StatLcacheWrite, + StatLcacheSize, + StatLcacheStall, + StatLcacheReadTime, + + StatDcacheHit, /* disk cache counters, maintained in dcache.c */ + StatDcacheMiss, + StatDcacheLookup, + StatDcacheRead, + StatDcacheWrite, + StatDcacheDirty, + StatDcacheSize, + StatDcacheFlush, + StatDcacheStall, + StatDcacheLookupTime, + + StatDblockStall, + StatLumpStall, + + StatIcacheHit, + StatIcacheMiss, + StatIcacheRead, + StatIcacheWrite, + StatIcacheFill, + StatIcachePrefetch, + StatIcacheDirty, + StatIcacheSize, + StatIcacheFlush, + StatIcacheStall, + StatIcacheReadTime, + + StatBloomHit, + StatBloomMiss, + StatBloomFalseMiss, + StatBloomLookup, + StatBloomOnes, + StatBloomBits, + StatBloomLookupTime, + + StatApartRead, + StatApartReadBytes, + StatApartWrite, + StatApartWriteBytes, + + StatIsectRead, + StatIsectReadBytes, + StatIsectWrite, + StatIsectWriteBytes, + + StatSumRead, + StatSumReadBytes, + + NStat /* number of statistics; must stay last, it sizes statdesc[] and Stats.n[] */ +}; + +extern Statdesc statdesc[NStat]; + +/* + * statistics about the operation of the server + * mainly for performance monitoring and profiling.
+ */ +struct Stats +{ + ulong now; + ulong n[NStat]; /* one value per Stat* constant */ +}; + +struct Statbin +{ + uint nsamp; + uint min; + uint max; + uint avg; +}; + +struct Graph +{ + long (*fn)(Stats*, Stats*, void*); + void *arg; + long t0; + long t1; + long min; + long max; + long wid; + long ht; + int fill; +}; + +/* + * for kicking background processes that run one round after another after another + */ +struct Round +{ + QLock lock; + Rendez start; + Rendez finish; + Rendez delaywait; + int delaytime; + int delaykick; + char* name; + int last; + int current; + int next; + int doanother; +}; + +/* + * Bloom filter of stored block hashes + */ +struct Bloom +{ + RWLock lk; /* protects nhash, nbits, tab, mb -- NOTE(review): only nhash matches a field below (the others are size, mask, data); this list looks stale */ + QLock mod; /* one marker at a time, protects nb -- NOTE(review): there is no field named nb here */ + int nhash; + ulong size; /* bytes in tab */ + ulong mask; /* to produce index */ + u8int *data; + Part *part; + Channel *writechan; + Channel *writedonechan; +}; + +extern Index *mainindex; +extern u32int maxblocksize; /* max. block size used by any partition */ +extern int paranoid; /* should verify hashes on disk read */ +extern int queuewrites; /* put all lump writes on a queue and finish later */ +extern int readonly; /* only allowed to read the disk data */ +extern Stats stats; +extern u8int zeroscore[VtScoreSize]; +extern int compressblocks; +extern int writestodevnull; /* dangerous - for performance debugging */ +extern int collectstats; +extern QLock memdrawlock; +extern int icachesleeptime; +extern int arenasumsleeptime; + +#ifndef PLAN9PORT +#pragma varargck type "V" uchar* +#define ODIRECT 0 +#endif diff --git a/src/cmd/venti/srv/dcache.c b/src/cmd/venti/srv/dcache.c new file mode 100644 index 00000000..72aaafaf --- /dev/null +++ b/src/cmd/venti/srv/dcache.c @@ -0,0 +1,816 @@ +/* + * Disk cache. + * + * Caches raw disk blocks. Getdblock() gets a block, putdblock puts it back.
+ * Getdblock has a mode parameter that determines i/o and access to a block: + * if mode is OREAD or ORDWR, it is read from disk if not already in memory. + * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned. + * It is *not* marked dirty -- once changes have been made, they should be noted + * by using dirtydblock() before putdblock(). + * + * There is a global cache lock as well as a lock on each block. + * Within a thread, the cache lock can be acquired while holding a block lock, + * but not vice versa; and a block cannot be locked if you already hold the lock + * on another block. + * + * The flush proc writes out dirty blocks in batches, one batch per dirty tag. + * For example, the DirtyArena blocks are all written to disk before any of the + * DirtyArenaCib blocks. + * + * This code used to be in charge of flushing the dirty index blocks out to + * disk, but updating the index turned out to benefit from extra care. + * Now cached index blocks are never marked dirty. The index.c code takes + * care of updating them behind our back, and uses _getdblock to update any + * cached copies of the blocks as it changes them on disk. + */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +typedef struct DCache DCache; + +enum +{ + HashLog = 9, /* log2 of the number of hash chains */ + HashSize = 1<<HashLog, /* number of chains in dcache.heads */ + HashMask = HashSize - 1, /* used by pbhash to reduce a hash to a chain index */ +}; + +struct DCache +{ + QLock lock; + RWLock dirtylock; /* must be held to inspect or set b->dirty -- NOTE(review): nothing in this file takes this lock */ + Rendez full; /* slept on in _getdblock when no block can be reclaimed; woken by putdblock and flushproc */ + Round round; /* kicks flushproc */ + DBlock *free; /* list of available blocks */ + u32int now; /* ticks for usage timestamps */ + int size; /* max.
size of any block; allocated to each block */ + DBlock **heads; /* hash table for finding address */ + int nheap; /* number of available victims */ + DBlock **heap; /* heap for locating victims */ + int nblocks; /* number of blocks allocated */ + DBlock *blocks; /* array of block descriptors */ + DBlock **write; /* array of block pointers to be written */ + u8int *mem; /* memory for all block data, plus the read-ahead buffer */ + int ndirty; /* number of dirty blocks */ + int maxdirty; /* max. number of dirty blocks */ + Channel *ra; /* read-ahead requests (Ra structs) consumed by raproc */ + u8int *rabuf; /* read-ahead buffer */ + u32int ramax; /* capacity of rabuf in bytes */ + u32int rasize; /* bytes of rabuf currently valid */ + u64int raaddr; /* partition address corresponding to rabuf[0] */ + Part *rapart; /* partition rabuf was read from; nil if rabuf holds nothing */ + + AState diskstate; /* state recorded by the last completed flush */ + AState state; /* most recent state passed to setdcachestate */ +}; + +typedef struct Ra Ra; +struct Ra /* one read-ahead request: fetch the block at addr in part */ +{ + Part *part; + u64int addr; +}; + +static DCache dcache; + +static int downheap(int i, DBlock *b); +static int upheap(int i, DBlock *b); +static DBlock *bumpdblock(void); +static void delheap(DBlock *db); +static void fixheap(int i, DBlock *b); +static void flushproc(void*); +static void writeproc(void*); +static void raproc(void*); + +/* + * set up the disk cache with mem bytes of block storage and start + * the flush, delayed-kick and read-ahead procs. + * maxblocksize must already be set; every cache block is that large. + */ +void +initdcache(u32int mem) +{ + DBlock *b, *last; + u32int nblocks, blocksize; + int i; + u8int *p; + + if(mem < maxblocksize * 2) + sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2); + if(maxblocksize == 0) + sysfatal("no max.
block size given for disk cache"); + blocksize = maxblocksize; + nblocks = mem / blocksize; + dcache.full.l = &dcache.lock; /* rsleep on full is done with dcache.lock held */ + dcache.nblocks = nblocks; + dcache.maxdirty = (nblocks * 2) / 3; + trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n", + nblocks, blocksize, dcache.maxdirty); + dcache.size = blocksize; + dcache.heads = MKNZ(DBlock*, HashSize); + dcache.heap = MKNZ(DBlock*, nblocks); + dcache.blocks = MKNZ(DBlock, nblocks); + dcache.write = MKNZ(DBlock*, nblocks); + dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize); /* +1 block of slack for the alignment below, +128 blocks for the read-ahead buffer */ + dcache.ra = chancreate(sizeof(Ra), 0); + + last = nil; + p = (u8int*)(((ulong)dcache.mem+blocksize-1)&~(ulong)(blocksize-1)); /* round up to a multiple of blocksize; the mask assumes blocksize is a power of two */ + for(i = 0; i < nblocks; i++){ + b = &dcache.blocks[i]; + b->data = &p[i * blocksize]; + b->heap = TWID32; /* not in the victim heap */ + b->writedonechan = chancreate(sizeof(void*), 1); + b->next = last; /* thread every block onto the free list */ + last = b; + } + dcache.rabuf = &p[i*blocksize]; /* i == nblocks here: the read-ahead buffer follows the last block */ + dcache.ramax = 128*blocksize; + dcache.raaddr = 0; + dcache.rapart = nil; + + dcache.free = last; + dcache.nheap = 0; + setstat(StatDcacheSize, nblocks); + initround(&dcache.round, "dcache", 120*1000); + + vtproc(flushproc, nil); + vtproc(delaykickroundproc, &dcache.round); + vtproc(raproc, nil); +} + +void +setdcachestate(AState *a) +{ + trace(TraceBlock, "setdcachestate %s 0x%llux clumps %d", a->arena ?
a->arena->name : nil, a->aa, a->stats.clumps); + qlock(&dcache.lock); + dcache.state = *a; /* flushproc copies this to diskstate once a flush completes */ + qunlock(&dcache.lock); +} + +AState +diskstate(void) +{ + AState a; + + qlock(&dcache.lock); + a = dcache.diskstate; + qunlock(&dcache.lock); + return a; +} + +static void +raproc(void *v) +{ + Ra ra; + DBlock *b; + + USED(v); + while(recv(dcache.ra, &ra) == 1){ + if(ra.part->size <= ra.addr) + continue; + b = _getdblock(ra.part, ra.addr, OREAD, 2); /* load == 2 marks a read-ahead fetch; may return nil */ + putdblock(b); /* putdblock ignores nil */ + } +} + +void +dreadahead(Part *part, u64int addr, int miss) +{ + Ra ra; + static struct { + Part *part; + u64int addr; + } lastmiss; + static struct { + Part *part; + u64int addr; + int dir; + } lastra; + +return; /* NOTE(review): unconditional return -- read-ahead is switched off and everything below is unreachable */ + if(miss){ + if(lastmiss.part==part && lastmiss.addr==addr-dcache.size){ + XRa: + lastra.part = part; + lastra.dir = addr-lastmiss.addr; + lastra.addr = addr+lastra.dir; + ra.part = part; + ra.addr = lastra.addr; + nbsend(dcache.ra, &ra); + }else if(lastmiss.part==part && lastmiss.addr==addr+dcache.size){ + addr -= dcache.size; + goto XRa; + } + }else{ + if(lastra.part==part && lastra.addr==addr){ + lastra.addr += lastra.dir; + ra.part = part; + ra.addr = lastra.addr; + nbsend(dcache.ra, &ra); + } + } + + if(miss){ + lastmiss.part = part; + lastmiss.addr = addr; + } + +// fprint(2, "%s %llx %s\n", part->name, addr, miss ?
"miss" : "hit"); +} + +/* + * read n bytes at addr in part into buf, going through the read-ahead buffer. + * a request lying entirely inside the buffered window is copied from memory. + * otherwise, unless load is 2 (a read-ahead fetch), it is passed to readpart. + * for load == 2 the window is refilled starting at addr and buf is copied from it. + * returns 0 on a window hit or a successful refill, -1 if the refill fails, + * and readpart's result in the pass-through cases. + */ +int +rareadpart(Part *part, u64int addr, u8int *buf, uint n, int load) +{ + uint nn; + static RWLock ralock; + + rlock(&ralock); + if(dcache.rapart==part && dcache.raaddr <= addr && addr+n <= dcache.raaddr+dcache.rasize){ + memmove(buf, dcache.rabuf+(addr-dcache.raaddr), n); + runlock(&ralock); + return 0; + } + if(load != 2 || addr >= part->size){ /* addr >= part->size: let readpart do the error */ + runlock(&ralock); + return readpart(part, addr, buf, n); + } + + runlock(&ralock); + wlock(&ralock); +fprint(2, "raread %s %llx\n", part->name, addr); + nn = dcache.ramax; + if(addr+nn > part->size) + nn = part->size - addr; + if(nn < n){ /* the clamped window cannot hold the request: let readpart do the read or the error */ + wunlock(&ralock); + return readpart(part, addr, buf, n); + } + dcache.rapart = nil; /* rabuf is about to be overwritten; a failed read must not leave the old window looking valid */ + if(readpart(part, addr, dcache.rabuf, nn) < 0){ + wunlock(&ralock); + return -1; + } + memmove(buf, dcache.rabuf, n); + dcache.rapart = part; + dcache.rasize = nn; + dcache.raaddr = addr; + wunlock(&ralock); + + addstat(StatApartReadBytes, nn-n); /* the caller accounts for the n bytes it asked for */ + return 0; +} + +static u32int +pbhash(u64int addr) +{ + u32int h; + +#define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask) + h = (addr >> 32) ^ addr; + return hashit(h); +} + +DBlock* +getdblock(Part *part, u64int addr, int mode) +{ + DBlock *b; + uint ms; + + ms = msec(); + b = _getdblock(part, addr, mode, 1); + if(mode == OREAD || mode == ORDWR) + addstat(StatDcacheRead, 1); + if(mode == OWRITE || mode == ORDWR) + addstat(StatDcacheWrite, 1); + ms = msec() - ms; + addstat2(StatDcacheLookup, 1, StatDcacheLookupTime, ms); + return b; +} + +DBlock* +_getdblock(Part *part, u64int addr, int mode, int load) +{ + DBlock *b; + u32int h, size; + + trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr); + size = part->blocksize; + if(size > dcache.size){ + seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size); + return nil; + } + h = pbhash(addr); + + /* + * look for the block in the cache + */ +//checkdcache(); + qlock(&dcache.lock); +again: + for(b = dcache.heads[h]; b != nil; b = b->next){ + if(b->part == part && b->addr ==
addr){ + /* + qlock(&stats.lock); + stats.pchit++; + qunlock(&stats.lock); + */ + if(load){ + addstat(StatDcacheHit, 1); + if(load != 2 && mode != OWRITE) + dreadahead(part, b->addr, 0); + } + goto found; + } + } + + /* + * missed: locate the block with the oldest second to last use. + * remove it from the heap, and fix up the heap. + */ + if(!load){ /* load == 0: the caller only wants a block that is already cached */ + qunlock(&dcache.lock); + return nil; + } + + addstat(StatDcacheMiss, 1); + + b = bumpdblock(); + if(b == nil){ + trace(TraceBlock, "all disk cache blocks in use"); + addstat(StatDcacheStall, 1); + rsleep(&dcache.full); /* full.l is dcache.lock (set in initdcache); putdblock and flushproc do the wakeup */ + addstat(StatDcacheStall, -1); + goto again; /* search again: the block may have been cached while we slept */ + } + + assert(!b->dirty); + + /* + * the new block has no last use, so assume it happens sometime in the middle +ZZZ this is not reasonable + */ + b->used = (b->used2 + dcache.now) / 2; + + /* + * rechain the block on the correct hash chain + */ + b->next = dcache.heads[h]; + dcache.heads[h] = b; + if(b->next != nil) + b->next->prev = b; + b->prev = nil; + + b->addr = addr; + b->part = part; + b->size = 0; /* nothing valid yet; the data is read or zeroed below under the block lock */ + if(load != 2 && mode != OWRITE) + dreadahead(part, b->addr, 1); + +found: + b->ref++; + b->used2 = b->used; + b->used = dcache.now++; + if(b->heap != TWID32) /* TWID32 means the block is not in the victim heap */ + fixheap(b->heap, b); + + qunlock(&dcache.lock); +//checkdcache(); + + trace(TraceBlock, "getdblock lock"); + addstat(StatDblockStall, 1); + if(mode == OREAD) + rlock(&b->lock); + else + wlock(&b->lock); + addstat(StatDblockStall, -1); + trace(TraceBlock, "getdblock locked"); + + if(b->size != size){ + if(mode == OREAD){ /* filling the block needs the write lock; b->size is re-tested below once it is held */ + addstat(StatDblockStall, 1); + runlock(&b->lock); + wlock(&b->lock); + addstat(StatDblockStall, -1); + } + if(b->size < size){ + if(mode == OWRITE) + memset(&b->data[b->size], 0, size - b->size); + else{ + trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr); + if(rareadpart(part, addr + b->size, &b->data[b->size], size - b->size, load) < 0){ + b->mode = ORDWR; /* so putdblock wunlocks */ + putdblock(b); + return nil; + } + trace(TraceBlock, "getdblock
readpartdone"); + addstat(StatApartRead, 1); + addstat(StatApartReadBytes, size-b->size); + } + } + b->size = size; + if(mode == OREAD){ /* drop back from the write lock to the read lock the caller asked for */ + addstat(StatDblockStall, 1); + wunlock(&b->lock); + rlock(&b->lock); + addstat(StatDblockStall, -1); + } + } + + b->mode = mode; /* remembered so putdblock knows which unlock to use */ + trace(TraceBlock, "getdblock exit"); + return b; +} + +void +putdblock(DBlock *b) +{ + if(b == nil) + return; + + trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr); + + if(b->mode == OREAD) + runlock(&b->lock); + else + wunlock(&b->lock); + +//checkdcache(); + qlock(&dcache.lock); + if(--b->ref == 0 && !b->dirty){ /* unreferenced and clean: a candidate for reuse */ + if(b->heap == TWID32) + upheap(dcache.nheap++, b); + rwakeupall(&dcache.full); /* wake anyone in _getdblock waiting for a reclaimable block */ + } + qunlock(&dcache.lock); +//checkdcache(); +} + +void +dirtydblock(DBlock *b, int dirty) +{ + int odirty; + Part *p; + + + trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux", b->part->name, b->addr, dirty, getcallerpc(&b)); + assert(b->ref != 0); + assert(b->mode==ORDWR || b->mode==OWRITE); + + odirty = b->dirty; + if(b->dirty) + assert(b->dirty == dirty); /* an already dirty block must keep the same dirty tag */ + else + b->dirty = dirty; + + p = b->part; + if(p->writechan == nil){ /* first dirty block for this partition: start its writeproc; NOTE(review): tested and set without holding dcache.lock */ + trace(TraceBlock, "dirtydblock allocwriteproc %s", p->name); + /* XXX hope this doesn't fail!
*/ + p->writechan = chancreate(sizeof(DBlock*), dcache.nblocks); + vtproc(writeproc, p); + } + qlock(&dcache.lock); + if(!odirty){ /* count the block only on its clean-to-dirty transition */ + dcache.ndirty++; + setstat(StatDcacheDirty, dcache.ndirty); + if(dcache.ndirty >= dcache.maxdirty) + kickround(&dcache.round, 0); + else + delaykickround(&dcache.round); + } + qunlock(&dcache.lock); +} + +/* + * remove some block from use and update the free list and counters + */ +static DBlock* +bumpdblock(void) +{ + DBlock *b; + ulong h; + + trace(TraceBlock, "bumpdblock enter"); + b = dcache.free; + if(b != nil){ /* never-used blocks go first; they are on no hash chain */ + dcache.free = b->next; + return b; + } + + if(dcache.ndirty >= dcache.maxdirty) + kickdcache(); + + /* + * remove blocks until we find one that is unused + * referenced blocks are left in the heap even though + * they can't be scavenged; this is simply a speed optimization + */ + for(;;){ + if(dcache.nheap == 0){ + kickdcache(); + trace(TraceBlock, "bumpdblock gotnothing"); + return nil; + } + b = dcache.heap[0]; + delheap(b); + if(!b->ref && !b->dirty) + break; + } + + trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr); + + /* + * unchain the block + */ + if(b->prev == nil){ + h = pbhash(b->addr); + if(dcache.heads[h] != b) + sysfatal("bad hash chains in disk cache"); + dcache.heads[h] = b->next; + }else + b->prev->next = b->next; + if(b->next != nil) + b->next->prev = b->prev; + + return b; +} + +/* + * delete an arbitrary block from the heap + */ +static void +delheap(DBlock *db) +{ + if(db->heap == TWID32) + return; + fixheap(db->heap, dcache.heap[--dcache.nheap]); /* move the last element into the vacated slot and restore heap order */ + db->heap = TWID32; +} + +/* + * push an element up or down to its correct new location + */ +static void +fixheap(int i, DBlock *b) +{ + if(upheap(i, b) == i) + downheap(i, b); +} + +static int +upheap(int i, DBlock *b) +{ + DBlock *bb; + u32int now; + int p; + + now = dcache.now; + for(; i != 0; i = p){ + p = (i - 1) >> 1; + bb = dcache.heap[p]; + if(b->used2 - now >= bb->used2 - now) /* keys are compared as unsigned offsets from now */ + break; + dcache.heap[i] = bb; + bb->heap
= i; + } + + dcache.heap[i] = b; + b->heap = i; + return i; +} + +static int +downheap(int i, DBlock *b) +{ + DBlock *bb; + u32int now; + int k; + + now = dcache.now; + for(; ; i = k){ + k = (i << 1) + 1; + if(k >= dcache.nheap) + break; + if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now) + k++; /* descend toward the smaller of the two children */ + bb = dcache.heap[k]; + if(b->used2 - now <= bb->used2 - now) + break; + dcache.heap[i] = bb; + bb->heap = i; + } + + dcache.heap[i] = b; + b->heap = i; + return i; +} + +static void +findblock(DBlock *bb) +{ + DBlock *b, *last; + int h; + + last = nil; + h = pbhash(bb->addr); + for(b = dcache.heads[h]; b != nil; b = b->next){ + if(last != b->prev) + sysfatal("bad prev link"); + if(b == bb) + return; + last = b; + } + sysfatal("block missing from hash table"); +} + +/* + * consistency check of the cache: heap indices and ordering, + * block data pointers, hash chains and the free list. + * calls sysfatal on the first inconsistency found. + */ +void +checkdcache(void) +{ + DBlock *b; + u32int size, now; + int i, k, refed, nfree; + u8int *p; + + qlock(&dcache.lock); + size = dcache.size; + now = dcache.now; + for(i = 0; i < dcache.nheap; i++){ + if(dcache.heap[i]->heap != i) + sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap); + if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now) + sysfatal("dc: bad heap ordering"); + k = (i << 1) + 1; + if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now) + sysfatal("dc: bad heap ordering"); + k++; + if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now) + sysfatal("dc: bad heap ordering"); + } + + /* initdcache lays the blocks out from dcache.mem rounded up to a block boundary, so compare against that base */ + p = (u8int*)(((ulong)dcache.mem+size-1)&~(ulong)(size-1)); + refed = 0; + for(i = 0; i < dcache.nblocks; i++){ + b = &dcache.blocks[i]; + if(b->data != &p[i * size]) + sysfatal("dc: mis-blocked at %d", i); + if(b->ref && b->heap == TWID32) + refed++; + if(b->addr) + findblock(b); + if(b->heap != TWID32 + && dcache.heap[b->heap] != b) + sysfatal("dc: spurious heap value"); + } + + nfree = 0; + for(b = dcache.free; b != nil; b = b->next){ + if(b->addr != 0 || b->heap != TWID32) + sysfatal("dc: bad free list"); + nfree++; + } + + if(dcache.nheap + nfree
+ refed != dcache.nblocks) + sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks); /* NOTE(review): unreferenced dirty blocks are in none of the three sets (putdblock keeps them out of the heap), so this may fire spuriously -- confirm */ + qunlock(&dcache.lock); +} + +void +flushdcache(void) +{ + trace(TraceProc, "flushdcache enter"); + kickround(&dcache.round, 1); + trace(TraceProc, "flushdcache exit"); +} + +void +kickdcache(void) +{ + kickround(&dcache.round, 0); +} + +/* + * hand the leading run of blocks in [b, eb) whose dirty tag equals dirty + * to the writeprocs of their partitions, wait for every one of them + * to be acknowledged, and return the length of that run. + */ +static int +parallelwrites(DBlock **b, DBlock **eb, int dirty) +{ + DBlock **p, **q; + for(p=b; p<eb && (*p)->dirty == dirty; p++){ + assert(b<=p && p<eb); + sendp((*p)->part->writechan, *p); + } + q = p; + for(p=b; p<q; p++){ + assert(b<=p && p<eb); + recvp((*p)->writedonechan); + } + + return p-b; +} + +/* + * Sort first by dirty flag, then by partition, then by address in partition. + * Returns 0 for elements that compare equal, as qsort's comparison contract requires. + */ +static int +writeblockcmp(const void *va, const void *vb) +{ + DBlock *a, *b; + + a = *(DBlock**)va; + b = *(DBlock**)vb; + + if(a->dirty != b->dirty) + return a->dirty - b->dirty; + if(a->part != b->part){ + if(a->part < b->part) + return -1; + if(a->part > b->part) + return 1; + } + if(a->addr < b->addr) + return -1; + return a->addr > b->addr; /* 1 if greater, 0 if equal */ +} + +static void +flushproc(void *v) +{ + int i, j, n; + ulong t0; + DBlock *b, **write; + AState as; + + USED(v); + threadsetname("flushproc"); + for(;;){ + waitforkick(&dcache.round); + + trace(TraceWork, "start"); + qlock(&dcache.lock); + as = dcache.state; /* becomes diskstate once everything dirty right now has been written */ + qunlock(&dcache.lock); + + t0 = nsec()/1000; + + trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0); + write = dcache.write; + n = 0; + for(i=0; i<dcache.nblocks; i++){ + b = &dcache.blocks[i]; + if(b->dirty) + write[n++] = b; + } + + qsort(write, n, sizeof(write[0]), writeblockcmp); + + /* Write each stage of blocks out.
*/ + trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0); + i = 0; + for(j=1; j<DirtyMax; j++){ /* one stage per dirty tag, in increasing tag order */ + trace(TraceProc, "writeblocks.%d t=%lud", j, (ulong)(nsec()/1000)-t0); + i += parallelwrites(write+i, write+n, j); + } + if(i != n){ /* some block did not fall into any of the stages above */ + fprint(2, "in flushproc i=%d n=%d\n", i, n); + for(i=0; i<n; i++) + fprint(2, "\tblock %d: dirty=%d\n", i, write[i]->dirty); + abort(); + } + +/* XXX +* the locking here is suspect. what if a block is redirtied +* after the write happens? we'll still decrement dcache.ndirty here. +*/ + trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0); + qlock(&dcache.lock); + dcache.diskstate = as; /* the state captured before the writes began */ + for(i=0; i<n; i++){ + b = write[i]; + --dcache.ndirty; + if(b->ref == 0 && b->heap == TWID32){ /* unreferenced and out of the heap: make it reclaimable again */ + upheap(dcache.nheap++, b); + rwakeupall(&dcache.full); + } + } + setstat(StatDcacheDirty, dcache.ndirty); + qunlock(&dcache.lock); + addstat(StatDcacheFlush, 1); + trace(TraceWork, "finish"); + } +} + +static void +writeproc(void *v) +{ + DBlock *b; + Part *p; + + p = v; + + threadsetname("writeproc:%s", p->name); + for(;;){ + b = recvp(p->writechan); /* blocks are queued here by parallelwrites */ + trace(TraceWork, "start"); + assert(b->part == p); + trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr); + wlock(&b->lock); + trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr); + if(writepart(p, b->addr, b->data, b->size) < 0) + fprint(2, "write error: %r\n"); /* XXX details!
*/ + addstat(StatApartWrite, 1); + addstat(StatApartWriteBytes, b->size); + b->dirty = 0; + wunlock(&b->lock); + trace(TraceProc, "finish %s 0x%llux", p->name, b->addr); + trace(TraceWork, "finish"); + sendp(b->writedonechan, b); + } +} diff --git a/src/cmd/venti/srv/dump.c b/src/cmd/venti/srv/dump.c new file mode 100644 index 00000000..fa2bfb7d --- /dev/null +++ b/src/cmd/venti/srv/dump.c @@ -0,0 +1,47 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +printindex(int fd, Index *ix) +{ + int i; + + fprint(fd, "index=%s version=%d blocksize=%d tabsize=%d\n", + ix->name, ix->version, ix->blocksize, ix->tabsize); + fprint(fd, "\tbuckets=%d div=%d\n", ix->buckets, ix->div); + for(i = 0; i < ix->nsects; i++) + fprint(fd, "\tsect=%s for buckets [%lld,%lld)\n", ix->smap[i].name, ix->smap[i].start, ix->smap[i].stop); + for(i = 0; i < ix->narenas; i++) + fprint(fd, "\tarena=%s at [%lld,%lld)\n", ix->amap[i].name, ix->amap[i].start, ix->amap[i].stop); +} + +void +printarenapart(int fd, ArenaPart *ap) +{ + int i; + + fprint(fd, "arena partition=%s\n\tversion=%d blocksize=%d arenas=%d\n\tsetbase=%d setsize=%d\n", + ap->part->name, ap->version, ap->blocksize, ap->narenas, ap->tabbase, ap->tabsize); + for(i = 0; i < ap->narenas; i++) + fprint(fd, "\tarena=%s at [%lld,%lld)\n", ap->map[i].name, ap->map[i].start, ap->map[i].stop); +} + +/* + * print a description of arena on fd: name, address range, version, times, + * whether it is sealed, its score if it is not the zero score, and clump statistics. + * every piece goes to fd; the seal marker, the line end and the score + * used to be written to fd 2 no matter which fd was passed in. + */ +void +printarena(int fd, Arena *arena) +{ + fprint(fd, "arena='%s' [%lld,%lld)\n\tversion=%d created=%d modified=%d", + arena->name, arena->base, arena->base + arena->size + 2 * arena->blocksize, + arena->version, arena->ctime, arena->wtime); + if(arena->memstats.sealed) + fprint(fd, " sealed\n"); + else + fprint(fd, "\n"); + if(scorecmp(zeroscore, arena->score) != 0) + fprint(fd, "\tscore=%V\n", arena->score); + + fprint(fd, "\tclumps=%,d compressed clumps=%,d data=%,lld compressed data=%,lld disk storage=%,lld\n", + arena->memstats.clumps, arena->memstats.cclumps, arena->memstats.uncsize, + arena->memstats.used -
arena->memstats.clumps * ClumpSize, + arena->memstats.used + arena->memstats.clumps * ClumpInfoSize); +} diff --git a/src/cmd/venti/srv/findscore.c b/src/cmd/venti/srv/findscore.c new file mode 100644 index 00000000..226d97ae --- /dev/null +++ b/src/cmd/venti/srv/findscore.c @@ -0,0 +1,121 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +enum +{ + ClumpChunks = 32*1024 /* directory entries read per readclumpinfos call */ +}; + +static int verbose; + +int +clumpinfoeq(ClumpInfo *c, ClumpInfo *d) +{ + return c->type == d->type + && c->size == d->size + && c->uncsize == d->uncsize + && scorecmp(c->score, d->score)==0; +} + +/* + * scan the clump directory of arena for entries whose score equals score. + * each match is reported on fd 2; the return value is the number of matches. + * a directory read failure ends the scan early and the matches found so far are returned. + */ +int +findscore(Arena *arena, uchar *score) +{ + IEntry ie; + ClumpInfo *ci, *cis; + u64int a; + u32int clump; + int i, n, found; + +//ZZZ remove fprint? + if(arena->memstats.clumps) + fprint(2, "reading directory for arena=%s with %d entries\n", arena->name, arena->memstats.clumps); + + cis = MKN(ClumpInfo, ClumpChunks); + found = 0; + a = 0; + memset(&ie, 0, sizeof(IEntry)); + for(clump = 0; clump < arena->memstats.clumps; clump += n){ + n = ClumpChunks; + if(n > arena->memstats.clumps - clump) + n = arena->memstats.clumps - clump; + if(readclumpinfos(arena, clump, cis, n) != n){ + seterr(EOk, "arena directory read failed: %r"); + break; + } + + for(i = 0; i < n; i++){ + ci = &cis[i]; + if(scorecmp(score, ci->score)==0){ + fprint(2, "found at clump=%d with type=%d size=%d csize=%d position=%lld\n", + clump + i, ci->type, ci->uncsize, ci->size, a); + found++; + } + a += ci->size + ClumpSize; /* running position of the next clump: data plus header */ + } + } + free(cis); + return found; +} + +/* + * print the usage message and exit with a failure status. + */ +void +usage(void) +{ + fprint(2, "usage: findscore [-v] arenafile score\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + ArenaPart *ap; + Part *part; + char *file; + u8int score[VtScoreSize]; + int i, found; + + ventifmtinstall(); + + ARGBEGIN{ + case 'v': + verbose++; + break; + default: + usage(); + break; + }ARGEND + + readonly = 1; + + if(argc != 2) + usage(); + + file = argv[0]; + if(strscore(argv[1], score) < 0) +
sysfatal("bad score %s", argv[1]); + + part = initpart(file, OREAD|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + ap = initarenapart(part); + if(ap == nil) + sysfatal("can't initialize arena partition in %s: %r", file); + + if(verbose > 1){ + printarenapart(2, ap); + fprint(2, "\n"); + } + + initdcache(8 * MaxDiskBlock); + + found = 0; + for(i = 0; i < ap->narenas; i++) + found += findscore(ap->arenas[i], score); + + print("found %d occurrences of %V\n", found, score); + + if(verbose > 1) + printstats(); + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/fmtarenas.c b/src/cmd/venti/srv/fmtarenas.c new file mode 100644 index 00000000..44c975ec --- /dev/null +++ b/src/cmd/venti/srv/fmtarenas.c @@ -0,0 +1,135 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#ifndef ODIRECT +#define ODIRECT 0 +#endif + +/* + * print the usage message and exit with a failure status, + * like the "block size too large" exit in threadmain. + */ +void +usage(void) +{ + fprint(2, "usage: fmtarenas [-Z] [-b blocksize] [-a arenasize] name file\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + int vers; + ArenaPart *ap; + Part *part; + Arena *arena; + u64int addr, limit, asize, apsize; + char *file, *name, aname[ANameSize]; + int i, n, blocksize, tabsize, zero; + + ventifmtinstall(); + statsinit(); + + blocksize = 8 * 1024; + asize = 512 * 1024 *1024; + tabsize = 512 * 1024; /* BUG: should be determined from number of arenas */ + zero = -1; /* -1: not chosen on the command line; resolved from the arena version after ARGEND */ + vers = ArenaVersion5; + ARGBEGIN{ + case 'D': + settrace(EARGF(usage())); + break; + case 'a': + asize = unittoull(ARGF()); + if(asize == TWID64 || asize == 0) /* 0 would divide by zero when the arenas are counted */ + usage(); + break; + case 'b': + blocksize = unittoull(ARGF()); + if(blocksize == ~0) + usage(); + if(blocksize > MaxDiskBlock){ + fprint(2, "block size too large, max %d\n", MaxDiskBlock); + threadexitsall("usage"); + } + break; + case '4': + vers = ArenaVersion4; + break; + case 'Z': + zero = 0; + break; + default: + usage(); + break; + }ARGEND + + if(zero == -1){ + if(vers == ArenaVersion4) + zero = 1; + else + zero = 0; + } + + if(argc != 2) + usage();
+ + name = argv[0]; + file = argv[1]; + + if(nameok(name) < 0) + sysfatal("illegal name template %s", name); + + part = initpart(file, ORDWR|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + if(zero) + zeropart(part, blocksize); + + maxblocksize = blocksize; + initdcache(20*blocksize); + + ap = newarenapart(part, blocksize, tabsize); + if(ap == nil) + sysfatal("can't initialize arena: %r"); + + apsize = ap->size - ap->arenabase; + n = apsize / asize; + if(apsize - (n * asize) >= MinArenaSize) /* the leftover is big enough to be an arena of its own */ + n++; + + fprint(2, "fmtarenas %s: %,d arenas, %,lld bytes storage, %,d bytes for index map\n", + file, n, apsize, ap->tabsize); + + ap->narenas = n; + ap->map = MKNZ(AMap, n); + ap->arenas = MKNZ(Arena*, n); + + addr = ap->arenabase; + for(i = 0; i < n; i++){ + limit = addr + asize; + if(limit >= ap->size || ap->size - limit < MinArenaSize){ /* last arena: take whatever is left rather than leave a runt behind */ + limit = ap->size; + if(limit - addr < MinArenaSize) + sysfatal("bad arena set math: runt arena at %lld,%lld %lld", addr, limit, ap->size); + } + + snprint(aname, ANameSize, "%s%d", name, i); + + if(0) fprint(2, "adding arena %s at [%lld,%lld)\n", aname, addr, limit); + + arena = newarena(part, vers, aname, addr, limit - addr, blocksize); + if(!arena) + fprint(2, "can't make new arena %s: %r\n", aname); /* best effort: the map entry below is still recorded */ + freearena(arena); + + ap->map[i].start = addr; + ap->map[i].stop = limit; + namecp(ap->map[i].name, aname); + + addr = limit; + } + + if(wbarenapart(ap) < 0) + fprint(2, "can't write back arena partition header for %s: %r\n", file); + + flushdcache(); + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/fmtbloom.c b/src/cmd/venti/srv/fmtbloom.c new file mode 100644 index 00000000..3c50d82f --- /dev/null +++ b/src/cmd/venti/srv/fmtbloom.c @@ -0,0 +1,115 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +Bloom b; + +/* + * print the usage message and exit with a failure status. + */ +void +usage(void) +{ + fprint(2, "usage: fmtbloom [-s size] [-n nblocks | -N nhash] file\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + Part *part; +
char *file; + vlong bits, size, size2; + int nhash; + vlong nblocks; + + ventifmtinstall(); + statsinit(); + + size = 0; + nhash = nblocks = 0; + ARGBEGIN{ + case 'n': + if(nhash || nblocks) + usage(); + nblocks = unittoull(EARGF(usage())); + break; + case 'N': + if(nhash || nblocks) + usage(); + nhash = unittoull(EARGF(usage())); + if(nhash > BloomMaxHash){ + fprint(2, "maximum possible is -N %d", BloomMaxHash); + usage(); + } + break; + case 's': + size = unittoull(ARGF()); + if(size == ~0) + usage(); + break; + default: + usage(); + break; + }ARGEND + + if(argc != 1) + usage(); + + file = argv[0]; + + part = initpart(file, ORDWR|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + if(size == 0) + size = part->size; + + if(size < 1024*1024) + sysfatal("bloom filter too small"); + + if(size > MaxBloomSize){ + fprint(2, "warning: not using entire %,lld bytes; using only %,lld bytes\n", + size, MaxBloomSize); + size = MaxBloomSize; + } + if(size&(size-1)){ + for(size2=1; size2<size; size2*=2) + ; + size = size2/2; + fprint(2, "warning: size not a power of 2; only using %lldMB\n", size/1024/1024); + } + + if(nblocks){ + /* + * no use for more than 32 bits per block + * shoot for less than 64 bits per block + */ + size2 = size; + while(size2*8 >= nblocks*64) + size2 >>= 1; + if(size2 != size){ + size = size2; + fprint(2, "warning: using only %lldMB - not enough blocks to warrant more\n", + size/1024/1024); + } + + /* + * optimal is to use ln 2 times as many hash functions as we have bits per blocks. 
+ */ + bits = (8*size)/nblocks; + nhash = bits*7/10; + if(nhash > BloomMaxHash) + nhash = BloomMaxHash; + } + if(!nhash) + nhash = BloomMaxHash; + if(bloominit(&b, size, nil) < 0) + sysfatal("bloominit: %r"); + b.nhash = nhash; + bits = nhash*10/7; + nblocks = (8*size)/bits; + fprint(2, "fmtbloom: using %lldMB, %d hashes/score, best up to %,lld blocks\n", size, nhash, nblocks); + b.data = vtmallocz(size); + b.part = part; + if(writebloom(&b) < 0) + sysfatal("writing %s: %r", file); + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/fmtindex.c b/src/cmd/venti/srv/fmtindex.c new file mode 100644 index 00000000..a0eb6b16 --- /dev/null +++ b/src/cmd/venti/srv/fmtindex.c @@ -0,0 +1,120 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +usage(void) +{ + fprint(2, "usage: fmtindex [-a] config\n"); + threadexitsall(0); +} + +void +threadmain(int argc, char *argv[]) +{ + Config conf; + Index *ix; + ArenaPart *ap; + Arena **arenas; + AMap *amap; + u64int addr; + char *file; + u32int i, j, n, narenas; + int add; + + ventifmtinstall(); + statsinit(); + + add = 0; + ARGBEGIN{ + case 'a': + add = 1; + break; + default: + usage(); + break; + }ARGEND + + if(argc != 1) + usage(); + + file = argv[0]; + + if(runconfig(file, &conf) < 0) + sysfatal("can't initialize config %s: %r", file); + if(conf.index == nil) + sysfatal("no index specified in %s", file); + if(nameok(conf.index) < 0) + sysfatal("illegal index name %s", conf.index); + + narenas = 0; + for(i = 0; i < conf.naparts; i++){ + ap = conf.aparts[i]; + narenas += ap->narenas; + } + + if(add){ + ix = initindex(conf.index, conf.sects, conf.nsects); + if(ix == nil) + sysfatal("can't initialize index %s: %r", conf.index); + }else{ + ix = newindex(conf.index, conf.sects, conf.nsects); + if(ix == nil) + sysfatal("can't create new index %s: %r", conf.index); + + n = 0; + for(i = 0; i < ix->nsects; i++) + n += ix->sects[i]->blocks; + + if(0) fprint(2, "using %ud buckets of %ud; div=%d\n", ix->buckets, n, 
ix->div); + } + amap = MKNZ(AMap, narenas); + arenas = MKNZ(Arena*, narenas); + + addr = IndexBase; + n = 0; + for(i = 0; i < conf.naparts; i++){ + ap = conf.aparts[i]; + for(j = 0; j < ap->narenas; j++){ + if(n >= narenas) + sysfatal("too few slots in index's arena set"); + + arenas[n] = ap->arenas[j]; + if(n < ix->narenas){ + if(arenas[n] != ix->arenas[n]) + sysfatal("mismatched arenas %s and %s at slot %d\n", + arenas[n]->name, ix->arenas[n]->name, n); + amap[n] = ix->amap[n]; + if(amap[n].start != addr) + sysfatal("mis-located arena %s in index %s\n", arenas[n]->name, ix->name); + addr = amap[n].stop; + }else{ + amap[n].start = addr; + addr += ap->arenas[j]->size; + amap[n].stop = addr; + namecp(amap[n].name, ap->arenas[j]->name); + if(0) fprint(2, "add arena %s at [%lld,%lld)\n", + amap[n].name, amap[n].start, amap[n].stop); + } + + n++; + } + } + if(0){ + fprint(2, "configured index=%s with arenas=%d and storage=%lld\n", + ix->name, n, addr - IndexBase); + fprint(2, "\tbitblocks=%d maxdepth=%d buckets=%d\n", + ix->bitblocks, ix->maxdepth, ix->buckets); + } + fprint(2, "fmtindex: %,d arenas, %,d index buckets, %,lld bytes storage\n", + n, ix->buckets, addr-IndexBase); + + ix->amap = amap; + ix->arenas = arenas; + ix->narenas = narenas; + + if(wbindex(ix) < 0) + fprint(2, "can't write back arena partition header for %s: %r\n", file); + + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/fmtisect.c b/src/cmd/venti/srv/fmtisect.c new file mode 100644 index 00000000..28b88de6 --- /dev/null +++ b/src/cmd/venti/srv/fmtisect.c @@ -0,0 +1,83 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +usage(void) +{ + fprint(2, "usage: fmtisect [-Z] [-b blocksize] name file\n"); + threadexitsall(0); +} + +void +threadmain(int argc, char *argv[]) +{ + int vers; + ISect *is; + Part *part; + char *file, *name; + int blocksize, setsize, zero; + + ventifmtinstall(); + statsinit(); + + blocksize = 8 * 1024; + setsize = 512 * 1024; + zero = -1; + vers = 
ISectVersion2; + ARGBEGIN{ + case 'b': + blocksize = unittoull(ARGF()); + if(blocksize == ~0) + usage(); + if(blocksize > MaxDiskBlock){ + fprint(2, "block size too large, max %d\n", MaxDiskBlock); + threadexitsall("usage"); + } + break; + case '1': + vers = ISectVersion1; + break; + case 'Z': + zero = 0; + break; + default: + usage(); + break; + }ARGEND + + if(zero == -1){ + if(vers == ISectVersion1) + zero = 1; + else + zero = 0; + } + + if(argc != 2) + usage(); + + name = argv[0]; + file = argv[1]; + + if(nameok(name) < 0) + sysfatal("illegal name %s", name); + + part = initpart(file, ORDWR|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + if(zero) + zeropart(part, blocksize); + + is = newisect(part, vers, name, blocksize, setsize); + if(is == nil) + sysfatal("can't initialize new index: %r"); + + fprint(2, "fmtisect %s: %,d buckets of %,d entries, %,d bytes for index map\n", + file, is->blocks, is->buckmax, setsize); + + if(wbisect(is) < 0) + fprint(2, "can't write back index section header for %s: %r\n", file); + + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/fns.h b/src/cmd/venti/srv/fns.h new file mode 100644 index 00000000..f35580ed --- /dev/null +++ b/src/cmd/venti/srv/fns.h @@ -0,0 +1,206 @@ +/* + * sorted by 4,/^$/|sort -bd +1 + */ +int addarena(Arena *name); +void addstat(int, int); +void addstat2(int, int, int, int); +ZBlock *alloczblock(u32int size, int zeroed, uint alignment); +Arena *amapitoa(Index *index, u64int a, u64int *aa); +u64int arenadirsize(Arena *arena, u32int clumps); +void arenaupdate(Arena *arena, u32int size, u8int *score); +void backsumarena(Arena *arena); +void binstats(long (*fn)(Stats *s0, Stats *s1, void*), void *arg, long t0, long t1, Statbin *bin, int nbin); +int bloominit(Bloom*, vlong, uchar*); +int bucklook(u8int*, int, u8int*, int); +u32int buildbucket(Index *ix, IEStream *ies, IBucket *ib, uint); +void checkdcache(void); +void checklumpcache(void); +int clumpinfoeq(ClumpInfo *c, 
ClumpInfo *d); +int clumpinfoeq(ClumpInfo *c, ClumpInfo *d); +u32int clumpmagic(Arena *arena, u64int aa); +uint countbits(uint n); +int delarena(Arena *arena); +void delaykickicache(void); +void delaykickround(Round*); +void delaykickroundproc(void*); +void dirtydblock(DBlock*, int); +AState diskstate(void); +void *emalloc(ulong); +void *erealloc(void *, ulong); +char *estrdup(char*); +void *ezmalloc(ulong); +Arena *findarena(char *name); +int flushciblocks(Arena *arena); +void flushdcache(void); +void flushicache(void); +void flushqueue(void); +void fmtzbinit(Fmt *f, ZBlock *b); +void freearena(Arena *arena); +void freearenapart(ArenaPart *ap, int freearenas); +void freeiestream(IEStream *ies); +void freeifile(IFile *f); +void freeisect(ISect *is); +void freeindex(Index *index); +void freepart(Part *part); +void freezblock(ZBlock *b); +DBlock *_getdblock(Part *part, u64int addr, int mode, int load); +DBlock *getdblock(Part *part, u64int addr, int mode); +u32int hashbits(u8int *score, int nbits); +int httpdinit(char *address, char *webroot); +int iaddrcmp(IAddr *ia1, IAddr *ia2); +IEntry* icachedirty(u32int, u32int, u64int); +void icacheclean(IEntry*); +int ientrycmp(const void *vie1, const void *vie2); +char *ifileline(IFile *f); +int ifilename(IFile *f, char *dst); +int ifileu32int(IFile *f, u32int *r); +int inbloomfilter(Bloom*, u8int*); +int indexsect(Index *ix, u8int *score); +int indexsect0(Index *ix, u32int buck); +Arena *initarena(Part *part, u64int base, u64int size, u32int blocksize); +ArenaPart *initarenapart(Part *part); +int initarenasum(void); +void initbloomfilter(Index*); +void initdcache(u32int mem); +void initicache(int bits, int depth); +void initicachewrite(void); +IEStream *initiestream(Part *part, u64int off, u64int clumps, u32int size); +ISect *initisect(Part *part); +Index *initindex(char *name, ISect **sects, int n); +void initlumpcache(u32int size, u32int nblocks); +int initlumpqueues(int nq); +Part* initpart(char *name, int mode); +void 
initround(Round*, char*, int); +int initventi(char *config, Config *conf); +void insertlump(Lump *lump, Packet *p); +int insertscore(u8int *score, IAddr *ia, int write); +void kickdcache(void); +void kickicache(void); +void kickround(Round*, int wait); +ZBlock *loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify); +DBlock *loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucket *ib); +int loadientry(Index *index, u8int *score, int type, IEntry *ie); +void logerr(int severity, char *fmt, ...); +Lump *lookuplump(u8int *score, int type); +int lookupscore(u8int *score, int type, IAddr *ia, int *rac); +int maparenas(AMap *am, Arena **arenas, int n, char *what); +void markbloomfilter(Bloom*, u8int*); +uint msec(void); +int namecmp(char *s, char *t); +void namecp(char *dst, char *src); +int nameok(char *name); +Arena *newarena(Part *part, u32int, char *name, u64int base, u64int size, u32int blocksize); +ArenaPart *newarenapart(Part *part, u32int blocksize, u32int tabsize); +ISect *newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize); +Index *newindex(char *name, ISect **sects, int n); +u32int now(void); +int okamap(AMap *am, int n, u64int start, u64int stop, char *what); +int okibucket(IBucket*, ISect*); +int outputamap(Fmt *f, AMap *am, int n); +int outputindex(Fmt *f, Index *ix); +int packarena(Arena *arena, u8int *buf); +int packarenahead(ArenaHead *head, u8int *buf); +int packarenapart(ArenaPart *as, u8int *buf); +void packbloomhead(Bloom*, u8int*); +int packclump(Clump *c, u8int *buf, u32int); +void packclumpinfo(ClumpInfo *ci, u8int *buf); +void packibucket(IBucket *b, u8int *buf, u32int magic); +void packientry(IEntry *i, u8int *buf); +int packisect(ISect *is, u8int *buf); +void packmagic(u32int magic, u8int *buf); +ZBlock *packet2zblock(Packet *p, u32int size); +int parseamap(IFile *f, AMapN *amn); +int parseindex(IFile *f, Index *ix); +void partblocksize(Part *part, u32int blocksize); 
+int partifile(IFile *f, Part *part, u64int start, u32int size); +void printarenapart(int fd, ArenaPart *ap); +void printarena(int fd, Arena *arena); +void printindex(int fd, Index *ix); +void printstats(void); +void putdblock(DBlock *b); +void putlump(Lump *b); +int queuewrite(Lump *b, Packet *p, int creator, uint ms); +u32int readarena(Arena *arena, u64int aa, u8int *buf, long n); +int readarenamap(AMapN *amn, Part *part, u64int base, u32int size); +Bloom *readbloom(Part*); +int readclumpinfo(Arena *arena, int clump, ClumpInfo *ci); +int readclumpinfos(Arena *arena, int clump, ClumpInfo *cis, int n); +ZBlock *readfile(char *name); +int readifile(IFile *f, char *name); +Packet *readlump(u8int *score, int type, u32int size, int *cached); +int readpart(Part *part, u64int addr, u8int *buf, u32int n); +int runconfig(char *config, Config*); +int scorecmp(u8int *, u8int *); +void scoremem(u8int *score, u8int *buf, int size); +void setatailstate(AState*); +void setdcachestate(AState*); +void seterr(int severity, char *fmt, ...); +void setstat(int, long); +void settrace(char *type); +u64int sortrawientries(Index *ix, Part *tmp, u64int *tmpoff, Bloom *bloom); +void startbloomproc(Bloom*); +Memimage* statgraph(Graph *g); +void statsinit(void); +int storeclump(Index *index, ZBlock *b, u8int *score, int type, u32int creator, IAddr *ia); +int storeientry(Index *index, IEntry *m); +int strscore(char *s, u8int *score); +int stru32int(char *s, u32int *r); +int stru64int(char *s, u64int *r); +void sumarena(Arena *arena); +int syncarena(Arena *arena, u64int start, u32int n, int zok, int fix); +int syncarenaindex(Index *ix, Arena *arena, u32int clump, u64int a, int fix, int *pflush, int check); +int syncindex(Index *ix, int fix, int mustflushicache, int check); +void trace(char *type, char*, ...); +void traceinit(void); +int u64log2(u64int v); +u64int unittoull(char *s); +int unpackarena(Arena *arena, u8int *buf); +int unpackarenahead(ArenaHead *head, u8int *buf); +int 
unpackarenapart(ArenaPart *as, u8int *buf); +int unpackbloomhead(Bloom*, u8int*); +int unpackclump(Clump *c, u8int *buf, u32int); +void unpackclumpinfo(ClumpInfo *ci, u8int *buf); +void unpackibucket(IBucket *b, u8int *buf, u32int magic); +void unpackientry(IEntry *i, u8int *buf); +int unpackisect(ISect *is, u8int *buf); +u32int unpackmagic(u8int *buf); +void ventifmtinstall(void); +void vtloghdump(Hio*, VtLog*); +void vtloghlist(Hio*); +int vtproc(void(*)(void*), void*); +int vttypevalid(int type); +void waitforkick(Round*); +int wbarena(Arena *arena); +int wbarenahead(Arena *arena); +int wbarenamap(AMap *am, int n, Part *part, u64int base, u64int size); +int wbarenapart(ArenaPart *ap); +void wbbloomhead(Bloom*); +int wbisect(ISect *is); +int wbindex(Index *ix); +int whackblock(u8int *dst, u8int *src, int ssize); +u64int writeaclump(Arena *a, Clump *c, u8int *clbuf, u64int, u64int*); +u32int writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n); +int writebloom(Bloom*); +int writeclumpinfo(Arena *arean, int clump, ClumpInfo *ci); +int writepng(Hio*, Memimage*); +u64int writeiclump(Index *ix, Clump *c, u8int *clbuf, u64int*); +int writelump(Packet *p, u8int *score, int type, u32int creator, uint ms); +int writepart(Part *part, u64int addr, u8int *buf, u32int n); +int writeqlump(Lump *u, Packet *p, int creator, uint ms); +Packet *zblock2packet(ZBlock *zb, u32int size); +void zeropart(Part *part, int blocksize); + +/* +#pragma varargck argpos sysfatal 1 +#pragma varargck argpos logerr 2 +#pragma varargck argpos SetErr 2 +*/ + +#define scorecmp(h1,h2) memcmp((h1),(h2),VtScoreSize) +#define scorecp(h1,h2) memmove((h1),(h2),VtScoreSize) + +#define MK(t) ((t*)emalloc(sizeof(t))) +#define MKZ(t) ((t*)ezmalloc(sizeof(t))) +#define MKN(t,n) ((t*)emalloc((n)*sizeof(t))) +#define MKNZ(t,n) ((t*)ezmalloc((n)*sizeof(t))) +#define MKNA(t,at,n) ((t*)emalloc(sizeof(t) + (n)*sizeof(at))) diff --git a/src/cmd/venti/srv/graph.c b/src/cmd/venti/srv/graph.c new file mode 100644 
index 00000000..157b82db --- /dev/null +++ b/src/cmd/venti/srv/graph.c @@ -0,0 +1,202 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +enum +{ + Top = 1, + Bottom = 1, + Left = 40, + Right = 0, + MinWidth = Left+Right+2, + MinHeight = Top+Bottom+2, + DefaultWidth = Left+Right+500, + DefaultHeight = Top+Bottom+40 +}; + +QLock memdrawlock; +static Memsubfont *smallfont; +static Memimage *black; +static Memimage *blue; +static Memimage *red; +static Memimage *lofill[6]; +static Memimage *hifill[6]; +static Memimage *grid; + +static ulong fill[] = { + 0xFFAAAAFF, 0xBB5D5DFF, /* peach */ + DPalegreygreen, DPurpleblue, /* aqua */ + DDarkyellow, DYellowgreen, /* yellow */ + DMedgreen, DDarkgreen, /* green */ + 0x00AAFFFF, 0x0088CCFF, /* blue */ + 0xCCCCCCFF, 0x888888FF, /* grey */ +}; + +Memimage* +allocrepl(ulong color) +{ + Memimage *m; + + m = allocmemimage(Rect(0,0,1,1), RGB24); + memfillcolor(m, color); + m->flags |= Frepl; + m->clipr = Rect(-1000000, -1000000, 1000000, 1000000); + return m; +} + +static void +ginit(void) +{ + static int first = 1; + int i; + + if(!first) + return; + + first = 0; + memimageinit(); + smallfont = openmemsubfont(unsharp("#9/font/lucidasans/lstr.10")); + black = memblack; + blue = allocrepl(DBlue); + red = allocrepl(DRed); + grid = allocrepl(0x77777777); + for(i=0; i<nelem(fill)/2 && i<nelem(lofill) && i<nelem(hifill); i++){ + lofill[i] = allocrepl(fill[2*i]); + hifill[i] = allocrepl(fill[2*i+1]); + } +} + +static void +mklabel(char *str, int v) +{ + if(v < 0){ + v = -v; + *str++ = '-'; + } + if(v < 10000) + sprint(str, "%d", v); + else if(v < 10000000) + sprint(str, "%dk", v/1000); + else + sprint(str, "%dM", v/1000000); +} + +static void +drawlabel(Memimage *m, Point p, int n) +{ + char buf[30]; + Point w; + + mklabel(buf, n); + w = memsubfontwidth(smallfont, buf); + memimagestring(m, Pt(p.x-5-w.x, p.y), memblack, ZP, smallfont, buf); +} + +static int +scalept(int val, int valmin, int valmax, int ptmin, int ptmax) +{ + 
if(val <= valmin) + val = valmin; + if(val >= valmax) + val = valmax; + if(valmax == valmin) + valmax++; + return ptmin + (vlong)(val-valmin)*(ptmax-ptmin)/(valmax-valmin); +} + +Memimage* +statgraph(Graph *g) +{ + int i, lastlo, nbin, x, lo, hi, min, max, first; + Memimage *m; + Rectangle r; + Statbin *b, bin[2000]; /* 32 kB, but whack is worse */ + + needstack(8192); /* double check that bin didn't kill us */ + + if(g->wid <= MinWidth) + g->wid = DefaultWidth; + if(g->ht <= MinHeight) + g->ht = DefaultHeight; + if(g->wid > nelem(bin)) + g->wid = nelem(bin); + if(g->fill < 0) + g->fill = ((uint)g->arg>>8)%nelem(lofill); + if(g->fill > nelem(lofill)) + g->fill %= nelem(lofill); + + nbin = g->wid - (Left+Right); + binstats(g->fn, g->arg, g->t0, g->t1, bin, nbin); + + /* + * compute bounds + */ + min = g->min; + max = g->max; + if(min < 0 || max <= min){ + min = max = 0; + first = 1; + for(i=0; i<nbin; i++){ + b = &bin[i]; + if(b->nsamp == 0) + continue; + if(first || b->min < min) + min = b->min; + if(first || b->max > max) + max = b->max; + first = 0; + } + } + + qlock(&memdrawlock); + ginit(); + if(smallfont==nil || black==nil || blue==nil || red==nil || hifill==nil || lofill==nil){ + werrstr("graphics initialization failed"); + qunlock(&memdrawlock); + return nil; + } + + /* fresh image */ + m = allocmemimage(Rect(0,0,g->wid,g->ht), ABGR32); + if(m == nil){ + qunlock(&memdrawlock); + return nil; + } + r = Rect(Left, Top, g->wid-Right, g->ht-Bottom); + memfillcolor(m, DTransparent); + + /* x axis */ + memimagedraw(m, Rect(r.min.x, r.max.y, r.max.x, r.max.y+1), black, ZP, memopaque, ZP, S); + + /* y labels */ + drawlabel(m, r.min, max); + if(min != 0) + drawlabel(m, Pt(r.min.x, r.max.y-smallfont->height), min); + + /* actual data */ + lastlo = -1; + for(i=0; i<nbin; i++){ + b = &bin[i]; + if(b->nsamp == 0) + continue; + lo = scalept(b->min, min, max, r.max.y, r.min.y); + hi = scalept(b->max, min, max, r.max.y, r.min.y); + x = r.min.x+i; + hi-=2; + if(0) + if(lastlo 
!= -1){ + if(lastlo < lo) + memimagedraw(m, Rect(x-1, lastlo, x, lo), hifill[g->fill], ZP, memopaque, ZP, S); + else if(lastlo > lo) + memimagedraw(m, Rect(x-1, lo, x, lastlo), hifill[g->fill], ZP, memopaque, ZP, S); + } + memimagedraw(m, Rect(x, hi, x+1,lo), hifill[g->fill], ZP, memopaque, ZP, S); + memimagedraw(m, Rect(x, lo, x+1, r.max.y), lofill[g->fill], ZP, memopaque, ZP, S); + lastlo = lo; + } + + if(bin[nbin-1].nsamp) + drawlabel(m, Pt(r.max.x, r.min.y+(Dy(r)-smallfont->height)/2), bin[nbin-1].avg); + qunlock(&memdrawlock); + return m; +} diff --git a/src/cmd/venti/srv/httpd.c b/src/cmd/venti/srv/httpd.c new file mode 100644 index 00000000..5f1a00e1 --- /dev/null +++ b/src/cmd/venti/srv/httpd.c @@ -0,0 +1,988 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include "xml.h" + +typedef struct HttpObj HttpObj; +extern QLock memdrawlock; + +enum +{ + ObjNameSize = 64, + MaxObjs = 16 +}; + +struct HttpObj +{ + char name[ObjNameSize]; + int (*f)(HConnect*); +}; + +static HttpObj objs[MaxObjs]; + +static char *webroot; + +static void listenproc(void*); +static int estats(HConnect *c); +static int dindex(HConnect *c); +static int xindex(HConnect *c); +static int xlog(HConnect *c); +static int sindex(HConnect *c); +static int hicacheflush(HConnect *c); +static int hdcacheflush(HConnect *c); +static int notfound(HConnect *c); +static int httpdobj(char *name, int (*f)(HConnect*)); +static int xgraph(HConnect *c); +static int xset(HConnect *c); +static int fromwebdir(HConnect *c); + +int +httpdinit(char *address, char *dir) +{ + fmtinstall('D', hdatefmt); +/* fmtinstall('H', httpfmt); */ + fmtinstall('U', hurlfmt); + + if(address == nil) + address = "tcp!*!http"; + webroot = dir; + + httpdobj("/stats", estats); + httpdobj("/index", dindex); + httpdobj("/storage", sindex); + httpdobj("/xindex", xindex); + httpdobj("/flushicache", hicacheflush); + httpdobj("/flushdcache", hdcacheflush); + httpdobj("/graph/", xgraph); + httpdobj("/set/", xset); + 
httpdobj("/log", xlog); + httpdobj("/log/", xlog); + + if(vtproc(listenproc, address) < 0) + return -1; + return 0; +} + +static int +httpdobj(char *name, int (*f)(HConnect*)) +{ + int i; + + if(name == nil || strlen(name) >= ObjNameSize) + return -1; + for(i = 0; i < MaxObjs; i++){ + if(objs[i].name[0] == '\0'){ + strcpy(objs[i].name, name); + objs[i].f = f; + return 0; + } + if(strcmp(objs[i].name, name) == 0) + return -1; + } + return -1; +} + +static HConnect* +mkconnect(void) +{ + HConnect *c; + + c = mallocz(sizeof(HConnect), 1); + if(c == nil) + sysfatal("out of memory"); + c->replog = nil; + c->hpos = c->header; + c->hstop = c->header; + return c; +} + +void httpproc(void*); + +static void +listenproc(void *vaddress) +{ + HConnect *c; + char *address, ndir[NETPATHLEN], dir[NETPATHLEN]; + int ctl, nctl, data; + +//sleep(1000); /* let strace find us */ + + address = vaddress; + ctl = announce(address, dir); + if(ctl < 0){ + fprint(2, "venti: httpd can't announce on %s: %r\n", address); + return; + } + + if(0) print("announce ctl %d dir %s\n", ctl, dir); + for(;;){ + /* + * wait for a call (or an error) + */ + nctl = listen(dir, ndir); + if(0) print("httpd listen %d %s...\n", nctl, ndir); + if(nctl < 0){ + fprint(2, "venti: httpd can't listen on %s: %r\n", address); + return; + } + + data = accept(ctl, ndir); + if(0) print("httpd accept %d...\n", data); + if(data < 0){ + fprint(2, "venti: httpd accept: %r\n"); + close(nctl); + continue; + } + if(0) print("httpd close nctl %d\n", nctl); + close(nctl); + c = mkconnect(); + hinit(&c->hin, data, Hread); + hinit(&c->hout, data, Hwrite); + vtproc(httpproc, c); + } +} + +void +httpproc(void *v) +{ + HConnect *c; + int ok, i, n; + +//sleep(1000); /* let strace find us */ + c = v; + + for(;;){ + /* + * No timeout because the signal appears to hit every + * proc, not just us. 
+ */ + if(hparsereq(c, 0) < 0) + break; + + ok = -1; + for(i = 0; i < MaxObjs && objs[i].name[0]; i++){ + n = strlen(objs[i].name); + if((objs[i].name[n-1] == '/' && strncmp(c->req.uri, objs[i].name, n) == 0) + || (objs[i].name[n-1] != '/' && strcmp(c->req.uri, objs[i].name) == 0)){ + ok = (*objs[i].f)(c); + goto found; + } + } + ok = fromwebdir(c); + found: + if(c->head.closeit) + ok = -1; + hreqcleanup(c); + + if(ok < 0) + break; + } + hreqcleanup(c); + close(c->hin.fd); + free(c); +} + +static int +percent(long v, long total) +{ + if(total == 0) + total = 1; + if(v < 1000*1000) + return (v * 100) / total; + total /= 100; + if(total == 0) + total = 1; + return v / total; +} + +static int +preq(HConnect *c) +{ + if(hparseheaders(c, 0) < 0) + return -1; + if(strcmp(c->req.meth, "GET") != 0 + && strcmp(c->req.meth, "HEAD") != 0) + return hunallowed(c, "GET, HEAD"); + if(c->head.expectother || c->head.expectcont) + return hfail(c, HExpectFail, nil); + return 0; +} + +static int +preqtype(HConnect *c, char *type) +{ + Hio *hout; + int r; + + r = preq(c); + if(r < 0) + return r; + + hout = &c->hout; + if(c->req.vermaj){ + hokheaders(c); + hprint(hout, "Content-type: %s\r\n", type); + if(http11(c)) + hprint(hout, "Transfer-Encoding: chunked\r\n"); + hprint(hout, "\r\n"); + } + + if(http11(c)) + hxferenc(hout, 1); + else + c->head.closeit = 1; + return 0; +} + +static int +preqtext(HConnect *c) +{ + return preqtype(c, "text/plain"); +} + +static int +notfound(HConnect *c) +{ + int r; + + r = preq(c); + if(r < 0) + return r; + return hfail(c, HNotFound, c->req.uri); +} + +struct { + char *ext; + char *type; +} exttab[] = { + ".html", "text/html", + ".txt", "text/plain", + ".xml", "text/xml", + ".png", "image/png", + ".gif", "image/gif", + 0 +}; + +static int +fromwebdir(HConnect *c) +{ + char buf[4096], *p, *ext, *type; + int i, fd, n, defaulted; + Dir *d; + + if(webroot == nil || strstr(c->req.uri, "..")) + return notfound(c); + snprint(buf, sizeof buf-20, "%s/%s", 
webroot, c->req.uri+1); + defaulted = 0; +reopen: + if((fd = open(buf, OREAD)) < 0) + return notfound(c); + d = dirfstat(fd); + if(d == nil){ + close(fd); + return notfound(c); + } + if(d->mode&DMDIR){ + if(!defaulted){ + defaulted = 1; + strcat(buf, "/index.html"); + free(d); + close(fd); + goto reopen; + } + free(d); + return notfound(c); + } + free(d); + p = buf+strlen(buf); + type = "application/octet-stream"; + for(i=0; exttab[i].ext; i++){ + ext = exttab[i].ext; + if(p-strlen(ext) >= buf && strcmp(p-strlen(ext), ext) == 0){ + type = exttab[i].type; + break; + } + } + if(preqtype(c, type) < 0){ + close(fd); + return 0; + } + while((n = read(fd, buf, sizeof buf)) > 0) + if(hwrite(&c->hout, buf, n) < 0) + break; + close(fd); + hflush(&c->hout); + return 0; +} + +static struct +{ + char *name; + int *p; +} namedints[] = +{ + "compress", &compressblocks, + "devnull", &writestodevnull, + "logging", &ventilogging, + "stats", &collectstats, + "icachesleeptime", &icachesleeptime, + "arenasumsleeptime", &arenasumsleeptime, + 0 +}; + +static int +xset(HConnect *c) +{ + int i, nf, r; + char *f[10], *s; + + s = estrdup(c->req.uri); + nf = getfields(s+strlen("/set/"), f, nelem(f), 1, "/"); + + if(nf < 1) + return notfound(c); + for(i=0; namedints[i].name; i++){ + if(strcmp(f[0], namedints[i].name) == 0){ + if(nf >= 2) + *namedints[i].p = atoi(f[1]); + r = preqtext(c); + if(r < 0) + return r; + hprint(&c->hout, "%s = %d\n", f[0], *namedints[i].p); + hflush(&c->hout); + return 0; + } + } + return notfound(c); +} + +static int +estats(HConnect *c) +{ + Hio *hout; + int r; + + r = preqtext(c); + if(r < 0) + return r; + + + hout = &c->hout; +/* + hprint(hout, "lump writes=%,ld\n", stats.lumpwrites); + hprint(hout, "lump reads=%,ld\n", stats.lumpreads); + hprint(hout, "lump cache read hits=%,ld\n", stats.lumphit); + hprint(hout, "lump cache read misses=%,ld\n", stats.lumpmiss); + + hprint(hout, "clump disk writes=%,ld\n", stats.clumpwrites); + hprint(hout, "clump disk bytes 
written=%,lld\n", stats.clumpbwrites); + hprint(hout, "clump disk bytes compressed=%,lld\n", stats.clumpbcomp); + hprint(hout, "clump disk reads=%,ld\n", stats.clumpreads); + hprint(hout, "clump disk bytes read=%,lld\n", stats.clumpbreads); + hprint(hout, "clump disk bytes uncompressed=%,lld\n", stats.clumpbuncomp); + + hprint(hout, "clump directory disk writes=%,ld\n", stats.ciwrites); + hprint(hout, "clump directory disk reads=%,ld\n", stats.cireads); + + hprint(hout, "index disk writes=%,ld\n", stats.indexwrites); + hprint(hout, "index disk reads=%,ld\n", stats.indexreads); + hprint(hout, "index disk bloom filter hits=%,ld %d%% falsemisses=%,ld %d%%\n", + stats.indexbloomhits, + percent(stats.indexbloomhits, stats.indexreads), + stats.indexbloomfalsemisses, + percent(stats.indexbloomfalsemisses, stats.indexreads)); + hprint(hout, "bloom filter bits=%,ld of %,ld %d%%\n", + stats.bloomones, stats.bloombits, percent(stats.bloomones, stats.bloombits)); + hprint(hout, "index disk reads for modify=%,ld\n", stats.indexwreads); + hprint(hout, "index disk reads for allocation=%,ld\n", stats.indexareads); + hprint(hout, "index block splits=%,ld\n", stats.indexsplits); + + hprint(hout, "index cache lookups=%,ld\n", stats.iclookups); + hprint(hout, "index cache hits=%,ld %d%%\n", stats.ichits, + percent(stats.ichits, stats.iclookups)); + hprint(hout, "index cache fills=%,ld %d%%\n", stats.icfills, + percent(stats.icfills, stats.iclookups)); + hprint(hout, "index cache inserts=%,ld\n", stats.icinserts); + + hprint(hout, "disk cache hits=%,ld\n", stats.pchit); + hprint(hout, "disk cache misses=%,ld\n", stats.pcmiss); + hprint(hout, "disk cache reads=%,ld\n", stats.pcreads); + hprint(hout, "disk cache bytes read=%,lld\n", stats.pcbreads); + + hprint(hout, "disk cache writes=%,ld\n", stats.dirtydblocks); + hprint(hout, "disk cache writes absorbed=%,ld %d%%\n", stats.absorbedwrites, + percent(stats.absorbedwrites, stats.dirtydblocks)); + + hprint(hout, "disk cache 
flushes=%,ld\n", stats.dcacheflushes); + hprint(hout, "disk cache flush writes=%,ld (%,ld per flush)\n", + stats.dcacheflushwrites, + stats.dcacheflushwrites/(stats.dcacheflushes ? stats.dcacheflushes : 1)); + + hprint(hout, "disk writes=%,ld\n", stats.diskwrites); + hprint(hout, "disk bytes written=%,lld\n", stats.diskbwrites); + hprint(hout, "disk reads=%,ld\n", stats.diskreads); + hprint(hout, "disk bytes read=%,lld\n", stats.diskbreads); +*/ + + hflush(hout); + return 0; +} + +static int +sindex(HConnect *c) +{ + Hio *hout; + Index *ix; + Arena *arena; + vlong clumps, cclumps, uncsize, used, size; + int i, r, active; + + r = preqtext(c); + if(r < 0) + return r; + hout = &c->hout; + + ix = mainindex; + + hprint(hout, "index=%s\n", ix->name); + + active = 0; + clumps = 0; + cclumps = 0; + uncsize = 0; + used = 0; + size = 0; + for(i = 0; i < ix->narenas; i++){ + arena = ix->arenas[i]; + if(arena != nil && arena->memstats.clumps != 0){ + active++; + clumps += arena->memstats.clumps; + cclumps += arena->memstats.cclumps; + uncsize += arena->memstats.uncsize; + used += arena->memstats.used; + } + size += arena->size; + } + hprint(hout, "total arenas=%,d active=%,d\n", ix->narenas, active); + hprint(hout, "total space=%,lld used=%,lld\n", size, used + clumps * ClumpInfoSize); + hprint(hout, "clumps=%,lld compressed clumps=%,lld data=%,lld compressed data=%,lld\n", + clumps, cclumps, uncsize, used - clumps * ClumpSize); + hflush(hout); + return 0; +} + +static void +darena(Hio *hout, Arena *arena) +{ + hprint(hout, "arena='%s' on %s at [%lld,%lld)\n\tversion=%d created=%d modified=%d", + arena->name, arena->part->name, arena->base, arena->base + arena->size + 2 * arena->blocksize, + arena->version, arena->ctime, arena->wtime); + if(arena->memstats.sealed) + hprint(hout, " mem=sealed"); + if(arena->diskstats.sealed) + hprint(hout, " disk=sealed"); + hprint(hout, "\n"); + if(scorecmp(zeroscore, arena->score) != 0) + hprint(hout, "\tscore=%V\n", arena->score); + + 
hprint(hout, "\tmem: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n", + arena->memstats.clumps, arena->memstats.cclumps, arena->memstats.uncsize, + arena->memstats.used - arena->memstats.clumps * ClumpSize, + arena->memstats.used + arena->memstats.clumps * ClumpInfoSize); + hprint(hout, "\tdisk: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n", + arena->diskstats.clumps, arena->diskstats.cclumps, arena->diskstats.uncsize, + arena->diskstats.used - arena->diskstats.clumps * ClumpSize, + arena->diskstats.used + arena->diskstats.clumps * ClumpInfoSize); +} + +static int +hicacheflush(HConnect *c) +{ + Hio *hout; + int r; + + r = preqtext(c); + if(r < 0) + return r; + hout = &c->hout; + + flushicache(); + hprint(hout, "flushed icache\n"); + hflush(hout); + return 0; +} + +static int +hdcacheflush(HConnect *c) +{ + Hio *hout; + int r; + + r = preqtext(c); + if(r < 0) + return r; + hout = &c->hout; + + flushdcache(); + hprint(hout, "flushed dcache\n"); + hflush(hout); + return 0; +} + +static int +dindex(HConnect *c) +{ + Hio *hout; + Index *ix; + int i, r; + + r = preqtext(c); + if(r < 0) + return r; + hout = &c->hout; + + + ix = mainindex; + hprint(hout, "index=%s version=%d blocksize=%d tabsize=%d\n", + ix->name, ix->version, ix->blocksize, ix->tabsize); + hprint(hout, "\tbuckets=%d div=%d\n", ix->buckets, ix->div); + for(i = 0; i < ix->nsects; i++) + hprint(hout, "\tsect=%s for buckets [%lld,%lld) buckmax=%d\n", ix->smap[i].name, ix->smap[i].start, ix->smap[i].stop, ix->sects[i]->buckmax); + for(i = 0; i < ix->narenas; i++){ + if(ix->arenas[i] != nil && ix->arenas[i]->memstats.clumps != 0){ + hprint(hout, "arena=%s at index [%lld,%lld)\n\t", ix->amap[i].name, ix->amap[i].start, ix->amap[i].stop); + darena(hout, ix->arenas[i]); + } + } + hflush(hout); + return 0; +} + +typedef struct Arg Arg; +struct Arg +{ + int index; + int index2; +}; + +static long +rawgraph(Stats *s, Stats *t, void *va) +{ + Arg *a; 
+ + a = va; + return t->n[a->index]; +} + +static long +diffgraph(Stats *s, Stats *t, void *va) +{ + Arg *a; + + a = va; + return t->n[a->index] - s->n[a->index]; +} + +static long +pctgraph(Stats *s, Stats *t, void *va) +{ + Arg *a; + + a = va; + return percent(t->n[a->index], t->n[a->index2]); +} + +static long +pctdiffgraph(Stats *s, Stats *t, void *va) +{ + Arg *a; + + a = va; + return percent(t->n[a->index]-s->n[a->index], t->n[a->index2]-s->n[a->index2]); +} + +static long +netbw(Stats *s) +{ + ulong *n; + + n = s->n; + return n[StatRpcReadBytes]+n[StatRpcWriteBytes]; /* not exactly right */ +} + +static long +diskbw(Stats *s) +{ + ulong *n; + + n = s->n; + return n[StatApartReadBytes]+n[StatApartWriteBytes] + + n[StatIsectReadBytes]+n[StatIsectWriteBytes] + + n[StatSumReadBytes]; +} + +static long +iobw(Stats *s) +{ + return netbw(s)+diskbw(s); +} + +static long +diskgraph(Stats *s, Stats *t, void *va) +{ + USED(va); + return diskbw(t)-diskbw(s); +} + +static long +netgraph(Stats *s, Stats *t, void *va) +{ + USED(va); + return netbw(t)-netbw(s); +} + +static long +iograph(Stats *s, Stats *t, void *va) +{ + USED(va); + return iobw(t)-iobw(s); +} + + +static char* graphname[] = +{ + "rpctotal", + "rpcread", + "rpcreadok", + "rpcreadfail", + "rpcreadbyte", + "rpcreadtime", + "rpcreadcached", + "rpcreadcachedtime", + "rpcreaduncached", + "rpcreaduncachedtime", + "rpcwrite", + "rpcwritenew", + "rpcwriteold", + "rpcwritefail", + "rpcwritebyte", + "rpcwritetime", + "rpcwritenewtime", + "rpcwriteoldtime", + + "lcachehit", + "lcachemiss", + "lcachelookup", + "lcachewrite", + "lcachesize", + "lcachestall", + "lcachelookuptime", + + "dcachehit", + "dcachemiss", + "dcachelookup", + "dcacheread", + "dcachewrite", + "dcachedirty", + "dcachesize", + "dcacheflush", + "dcachestall", + "dcachelookuptime", + + "dblockstall", + "lumpstall", + + "icachehit", + "icachemiss", + "icachelookup", + "icachewrite", + "icachefill", + "icacheprefetch", + "icachedirty", + "icachesize", + 
"icacheflush", + "icachestall", + "icachelookuptime", + + "bloomhit", + "bloommiss", + "bloomfalsemiss", + "bloomlookup", + "bloomones", + "bloombits", + "bloomlookuptime", + + "apartread", + "apartreadbyte", + "apartwrite", + "apartwritebyte", + + "isectread", + "isectreadbyte", + "isectwrite", + "isectwritebyte", + + "sumread", + "sumreadbyte", +}; + +static int +findname(char *s) +{ + int i; + + for(i=0; i<nelem(graphname); i++) + if(strcmp(graphname[i], s) == 0) + return i; +fprint(2, "no name '%s'\n", s); + return -1; +} + +static void +dotextbin(Hio *io, Graph *g) +{ + int i, nbin; + Statbin *b, bin[2000]; /* 32 kB, but whack is worse */ + + needstack(8192); /* double check that bin didn't kill us */ + nbin = 100; + binstats(g->fn, g->arg, g->t0, g->t1, bin, nbin); + + hprint(io, "stats\n\n"); + for(i=0; i<nbin; i++){ + b = &bin[i]; + hprint(io, "%d: nsamp=%d min=%d max=%d avg=%d\n", + i, b->nsamp, b->min, b->max, b->avg); + } +} + +static int +xgraph(HConnect *c) +{ + char *f[20], *s; + Hio *hout; + Memimage *m; + int i, nf, dotext; + Graph g; + Arg arg; + + s = estrdup(c->req.uri); +if(0) fprint(2, "graph %s\n" ,s); + memset(&g, 0, sizeof g); + nf = getfields(s+strlen("/graph/"), f, nelem(f), 1, "/"); + if(nf < 1) + goto notfound; + if((arg.index = findname(f[0])) == -1 && strcmp(f[0], "*") != 0) + goto notfound; + g.arg = &arg; + g.t0 = -120; + g.t1 = 0; + g.min = -1; + g.max = -1; + g.fn = rawgraph; + g.wid = -1; + g.ht = -1; + dotext = 0; + g.fill = -1; + for(i=1; i<nf; i++){ + if(strncmp(f[i], "t0=", 3) == 0) + g.t0 = atoi(f[i]+3); + else if(strncmp(f[i], "t1=", 3) == 0) + g.t1 = atoi(f[i]+3); + else if(strncmp(f[i], "min=", 4) == 0) + g.min = atoi(f[i]+4); + else if(strncmp(f[i], "max=", 4) == 0) + g.max = atoi(f[i]+4); + else if(strncmp(f[i], "pct=", 4) == 0){ + if((arg.index2 = findname(f[i]+4)) == -1) + goto notfound; + g.fn = pctgraph; + g.min = 0; + g.max = 100; + }else if(strncmp(f[i], "pctdiff=", 8) == 0){ + if((arg.index2 = findname(f[i]+8)) == 
-1) + goto notfound; + g.fn = pctdiffgraph; + g.min = 0; + g.max = 100; + }else if(strcmp(f[i], "diff") == 0) + g.fn = diffgraph; + else if(strcmp(f[i], "text") == 0) + dotext = 1; + else if(strncmp(f[i], "wid=", 4) == 0) + g.wid = atoi(f[i]+4); + else if(strncmp(f[i], "ht=", 3) == 0) + g.ht = atoi(f[i]+3); + else if(strncmp(f[i], "fill=", 5) == 0) + g.fill = atoi(f[i]+5); + else if(strcmp(f[i], "diskbw") == 0) + g.fn = diskgraph; + else if(strcmp(f[i], "iobw") == 0) + g.fn = iograph; + else if(strcmp(f[i], "netbw") == 0) + g.fn = netgraph; + } + if(dotext){ + preqtype(c, "text/plain"); + dotextbin(&c->hout, &g); + hflush(&c->hout); + return 0; + } + + m = statgraph(&g); + if(m == nil) + goto notfound; + + if(preqtype(c, "image/png") < 0) + return -1; + hout = &c->hout; + writepng(hout, m); + qlock(&memdrawlock); + freememimage(m); + qunlock(&memdrawlock); + hflush(hout); + free(s); + return 0; + +notfound: + free(s); + return notfound(c); +} + +static int +xloglist(HConnect *c) +{ + if(preqtype(c, "text/html") < 0) + return -1; + vtloghlist(&c->hout); + hflush(&c->hout); + return 0; +} + +static int +xlog(HConnect *c) +{ + char *name; + VtLog *l; + + if(strcmp(c->req.uri, "/log") == 0 || strcmp(c->req.uri, "/log/") == 0) + return xloglist(c); + if(strncmp(c->req.uri, "/log/", 5) != 0) + return notfound(c); + name = c->req.uri + strlen("/log/"); + l = vtlogopen(name, 0); + if(l == nil) + return notfound(c); + if(preqtype(c, "text/html") < 0){ + vtlogclose(l); + return -1; + } + vtloghdump(&c->hout, l); + vtlogclose(l); + hflush(&c->hout); + return 0; +} + +static int +xindex(HConnect *c) +{ + if(preqtype(c, "text/xml") < 0) + return -1; + xmlindex(&c->hout, mainindex, "index", 0); + hflush(&c->hout); + return 0; +} + +void +xmlindent(Hio *hout, int indent) +{ + int i; + + for(i = 0; i < indent; i++) + hputc(hout, '\t'); +} + +void +xmlaname(Hio *hout, char *v, char *tag) +{ + hprint(hout, " %s=\"%s\"", tag, v); +} + +void +xmlscore(Hio *hout, u8int *v, char *tag) 
+{ + if(scorecmp(zeroscore, v) == 0) + return; + hprint(hout, " %s=\"%V\"", tag, v); +} + +void +xmlsealed(Hio *hout, int v, char *tag) +{ + if(!v) + return; + hprint(hout, " %s=\"yes\"", tag); +} + +void +xmlu32int(Hio *hout, u32int v, char *tag) +{ + hprint(hout, " %s=\"%ud\"", tag, v); +} + +void +xmlu64int(Hio *hout, u64int v, char *tag) +{ + hprint(hout, " %s=\"%llud\"", tag, v); +} + +void +vtloghdump(Hio *h, VtLog *l) +{ + int i; + VtLogChunk *c; + char *name; + + name = l ? l->name : "<nil>"; + +fprint(2, "hdump xfer %d\n", h->xferenc); + hprint(h, "<html><head>\n"); + hprint(h, "<title>Venti Server Log: %s</title>\n", name); + hprint(h, "</head><body>\n"); + hprint(h, "<b>Venti Server Log: %s</b>\n<p>\n", name); + + if(l){ + c = l->w; + for(i=0; i<l->nchunk; i++){ + if(++c == l->chunk+l->nchunk) + c = l->chunk; + hwrite(h, c->p, c->wp-c->p); + } + } + hprint(h, "</body></html>\n"); +} + +static int +strpcmp(const void *va, const void *vb) +{ + return strcmp(*(char**)va, *(char**)vb); +} + +void +vtloghlist(Hio *h) +{ + char **p; + int i, n; + + hprint(h, "<html><head>\n"); + hprint(h, "<title>Venti Server Logs</title>\n"); + hprint(h, "</head><body>\n"); + hprint(h, "<b>Venti Server Logs</b>\n<p>\n"); + + p = vtlognames(&n); + qsort(p, n, sizeof(p[0]), strpcmp); + for(i=0; i<n; i++) + hprint(h, "<a href=\"/log/%s\">%s</a><br>\n", p[i], p[i]); + vtfree(p); + hprint(h, "</body></html>\n"); +} diff --git a/src/cmd/venti/srv/icache.c b/src/cmd/venti/srv/icache.c new file mode 100644 index 00000000..46d411e5 --- /dev/null +++ b/src/cmd/venti/srv/icache.c @@ -0,0 +1,348 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +typedef struct ICache ICache; +struct ICache +{ + QLock lock; /* locks hash table & all associated data */ + Rendez full; + IEntry **heads; /* heads of all the hash chains */ + int bits; /* bits to use for indexing heads */ + u32int size; /* number of heads; == 1 << bits, should be < entries */ + IEntry *base; /* all allocated hash 
/*
 * Return the top `bits' bits of the big-endian 32-bit value formed
 * by the first four bytes of score sc.
 *
 * bits >= 32 yields the whole 32-bit value; bits <= 0 yields 0.
 * Each byte is widened to u32int before shifting so a first byte
 * >= 0x80 is not shifted into the sign bit of a promoted int, and
 * the shift count is always in the range 1..31.
 */
u32int
hashbits(u8int *sc, int bits)
{
	u32int v;

	if(bits <= 0)
		return 0;
	v = ((u32int)sc[0] << 24) | ((u32int)sc[1] << 16)
		| ((u32int)sc[2] << 8) | (u32int)sc[3];
	if(bits < 32)
		v >>= (32 - bits);
	return v;
}
/*
 * Look up (score, type) in the in-memory index cache; on a miss,
 * load the entry from the on-disk index via loadientry and insert it.
 *
 * On success stores the index address in *ia and the entry's rac
 * flag in *rac and returns 0.  Returns -1 if loadientry fails.
 * Either way one StatIcacheRead sample with the elapsed time is
 * recorded.
 *
 * must be called with the lump for this score locked
 */
int
lookupscore(u8int *score, int type, IAddr *ia, int *rac)
{
	IEntry d, *ie, *last;
	u32int h;
	u64int aa;
	Arena *load;
	int i;
	uint ms;

	load = nil;
	aa = 0;
	ms = msec();

	trace(TraceLump, "lookupscore %V.%d", score, type);

	qlock(&icache.lock);
	h = hashbits(score, icache.bits);
	last = nil;
	/* search the hash chain; a hit is unlinked here and re-linked at the head below */
	for(ie = icache.heads[h]; ie != nil; ie = ie->next){
		if(ie->ia.type == type && scorecmp(ie->score, score)==0){
			if(last != nil)
				last->next = ie->next;
			else
				icache.heads[h] = ie->next;
			addstat(StatIcacheHit, 1);
			ie->rac = 1;
			trace(TraceLump, "lookupscore incache");
			goto found;	/* icache.lock is still held */
		}
		last = ie;
	}
	addstat(StatIcacheMiss, 1);
	/* drop the lock for the duration of the index read */
	qunlock(&icache.lock);

	if(loadientry(mainindex, score, type, &d) < 0){
		ms = msec() - ms;
		addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);
		return -1;
	}

	addstat(StatIcacheFill, 1);

	trace(TraceLump, "lookupscore loaded");

	/*
	 * no one else can load an entry for this score,
	 * since we have the overall score lock.
	 */
	qlock(&icache.lock);

	/*
	 * If we notice that all the hits are coming from one arena,
	 * load the table of contents for that arena into the cache.
	 */
	ie = icachealloc(&d.ia, score);
	/* remember the arena of each of the last nelem(icache.last) fills */
	icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa);
	aa = ie->ia.addr - aa;	/* compute base addr of arena */
	for(i=0; i<nelem(icache.last); i++)
		if(icache.last[i] != icache.last[0])
			break;
	/* all recent fills agree, and that arena was not the last one preloaded */
	if(i==nelem(icache.last) && icache.lastload != icache.last[0]){
		load = icache.last[0];
		icache.lastload = load;
	}

found:
	/* (re)insert at the head of the chain */
	ie->next = icache.heads[h];
	icache.heads[h] = ie;

	*ia = ie->ia;
	*rac = ie->rac;

	qunlock(&icache.lock);

	/* preload outside the lock: loadarenaclumps calls insertscore, which takes it */
	if(load){
		trace(TraceProc, "preload 0x%llux", aa);
		loadarenaclumps(load, aa);
	}
	ms = msec() - ms;
	addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);

	return 0;
}
+ */ +int +insertscore(u8int *score, IAddr *ia, int write) +{ + IEntry *ie, se; + u32int h; + + trace(TraceLump, "insertscore enter"); + if(write) + addstat(StatIcacheWrite, 1); + else + addstat(StatIcachePrefetch, 1); + + qlock(&icache.lock); + h = hashbits(score, icache.bits); + + ie = icachealloc(ia, score); + if(write){ + icache.ndirty++; + setstat(StatIcacheDirty, icache.ndirty); + delaykickicache(); + ie->dirty = 1; + } + ie->next = icache.heads[h]; + icache.heads[h] = ie; + + se = *ie; + qunlock(&icache.lock); + + if(write && icache.ndirty >= icache.maxdirty) + kickicache(); + + /* + * It's okay not to do this under icache.lock. + * Calling insertscore only happens when we hold + * the lump, meaning any searches for this block + * will hit in the lump cache until after we return. + */ + markbloomfilter(mainindex->bloom, score); + + return 0; +} + +/* + * allocate a index cache entry which hasn't been used in a while. + * must be called with icache.lock locked + * if the score is already in the table, update the entry. + */ +static IEntry * +icachealloc(IAddr *ia, u8int *score) +{ + int i; + IEntry *ie, *last, *clean, *lastclean; + u32int h; + + h = hashbits(score, icache.bits); + last = nil; + for(ie = icache.heads[h]; ie != nil; ie = ie->next){ + if(ie->ia.type == ia->type && scorecmp(ie->score, score)==0){ + if(last != nil) + last->next = ie->next; + else + icache.heads[h] = ie->next; + trace(TraceLump, "icachealloc hit"); + ie->rac = 1; + return ie; + } + last = ie; + } + + h = icache.unused; + if(h < icache.entries){ + ie = &icache.base[h++]; + icache.unused = h; + trace(TraceLump, "icachealloc unused"); + goto Found; + } + + h = icache.stolen; + for(i=0;; i++){ + h++; + if(h >= icache.size) + h = 0; + if(i == icache.size){ + trace(TraceLump, "icachealloc sleep"); + addstat(StatIcacheStall, 1); + while(icache.ndirty == icache.entries){ + /* + * This is a bit suspect. 
Kickicache will wake up the + * icachewritecoord, but if all the index entries are for + * unflushed disk blocks, icachewritecoord won't be + * able to do much. It always rewakes everyone when + * it thinks it is done, though, so at least we'll go around + * the while loop again. Also, if icachewritecoord sees + * that the disk state hasn't change at all since the last + * time around, it kicks the disk. This needs to be + * rethought, but it shouldn't deadlock anymore. + */ + kickicache(); + rsleep(&icache.full); + } + addstat(StatIcacheStall, -1); + i = 0; + } + lastclean = nil; + clean = nil; + last = nil; + for(ie=icache.heads[h]; ie; last=ie, ie=ie->next){ + if(!ie->dirty){ + clean = ie; + lastclean = last; + } + } + if(clean){ + if(lastclean) + lastclean->next = clean->next; + else + icache.heads[h] = clean->next; + clean->next = nil; + icache.stolen = h; + ie = clean; + trace(TraceLump, "icachealloc steal"); + goto Found; + } + } + +Found: + ie->ia = *ia; + scorecp(ie->score, score); + ie->rac = 0; + return ie; +} + +IEntry* +icachedirty(u32int lo, u32int hi, u64int limit) +{ + int i; + u32int h; + IEntry *ie, *dirty; + + dirty = nil; + trace(TraceProc, "icachedirty enter"); + qlock(&icache.lock); + for(i=0; i<icache.size; i++) + for(ie = icache.heads[i]; ie; ie=ie->next) + if(ie->dirty && ie->ia.addr != 0 && ie->ia.addr < limit){ + h = hashbits(ie->score, 32); + if(lo <= h && h <= hi){ + ie->nextdirty = dirty; + dirty = ie; + } + } + qunlock(&icache.lock); + trace(TraceProc, "icachedirty exit"); + if(dirty == nil) + flushdcache(); + return dirty; +} + +void +icacheclean(IEntry *ie) +{ + trace(TraceProc, "icachedirty enter"); + qlock(&icache.lock); + for(; ie; ie=ie->nextdirty){ + icache.ndirty--; + ie->dirty = 0; + } + setstat(StatIcacheDirty, icache.ndirty); + rwakeupall(&icache.full); + qunlock(&icache.lock); + trace(TraceProc, "icachedirty exit"); +} + diff --git a/src/cmd/venti/srv/icachewrite.c b/src/cmd/venti/srv/icachewrite.c new file mode 100644 
index 00000000..900af871 --- /dev/null +++ b/src/cmd/venti/srv/icachewrite.c @@ -0,0 +1,318 @@ +/* + * Write the dirty icache entries to disk. Random seeks are + * so expensive that it makes sense to wait until we have + * a lot and then just make a sequential pass over the disk. + */ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static void icachewriteproc(void*); +static void icachewritecoord(void*); +static IEntry *iesort(IEntry*); + +int icachesleeptime = 1000; /* milliseconds */ + +enum +{ + Bufsize = 8*1024*1024 +}; + +typedef struct IWrite IWrite; +struct IWrite +{ + Round round; + AState as; +}; + +static IWrite iwrite; + +void +initicachewrite(void) +{ + int i; + Index *ix; + + initround(&iwrite.round, "icache", 120*60*1000); + ix = mainindex; + for(i=0; i<ix->nsects; i++){ + ix->sects[i]->writechan = chancreate(sizeof(ulong), 1); + ix->sects[i]->writedonechan = chancreate(sizeof(ulong), 1); + vtproc(icachewriteproc, ix->sects[i]); + } + vtproc(icachewritecoord, nil); + vtproc(delaykickroundproc, &iwrite.round); +} + +static IEntry* +nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *paddr, uint *pnbuf) +{ + u64int addr, naddr; + uint nbuf; + int bsize; + IEntry *iefirst, *ie, **l; + + bsize = 1<<is->blocklog; + iefirst = *pie; + addr = is->blockbase + ((u64int)(hashbits(iefirst->score, 32) / ix->div - is->start) << is->blocklog); + nbuf = 0; + for(l=&iefirst->nextdirty; (ie=*l)!=nil; l=&(*l)->nextdirty){ + naddr = is->blockbase + ((u64int)(hashbits(ie->score, 32) / ix->div - is->start) << is->blocklog); + if(naddr - addr >= Bufsize) + break; + nbuf = naddr-addr; + } + nbuf += bsize; + + *l = nil; + *pie = ie; + *paddr = addr; + *pnbuf = nbuf; + return iefirst; +} + +static int +icachewritesect(Index *ix, ISect *is, u8int *buf) +{ + int err, h, bsize; + u32int lo, hi; + u64int addr, naddr; + uint nbuf, off; + DBlock *b; + IBucket ib; + IEntry *ie, *iedirty, **l, *chunk; + + lo = is->start * ix->div; + if(TWID32/ix->div < is->stop) + hi = 
TWID32; + else + hi = is->stop * ix->div - 1; + + trace(TraceProc, "icachewritesect enter %ud %ud %llud", lo, hi, iwrite.as.aa); + + iedirty = icachedirty(lo, hi, iwrite.as.aa); + iedirty = iesort(iedirty); + bsize = 1<<is->blocklog; + err = 0; + + while(iedirty){ + sleep(icachesleeptime); + trace(TraceProc, "icachewritesect nextchunk"); + chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf); + + trace(TraceProc, "icachewritesect readpart 0x%llux+0x%ux", addr, nbuf); + if(readpart(is->part, addr, buf, nbuf) < 0){ + // XXX + fprint(2, "icachewriteproc readpart: %r\n"); + err = -1; + continue; + } + trace(TraceProc, "icachewritesect updatebuf"); + addstat(StatIsectReadBytes, nbuf); + addstat(StatIsectRead, 1); + + for(l=&chunk; (ie=*l)!=nil; l=&ie->nextdirty){ + again: + naddr = is->blockbase + ((u64int)(hashbits(ie->score, 32) / ix->div - is->start) << is->blocklog); + off = naddr - addr; + if(off+bsize > nbuf){ + fprint(2, "whoops! addr=0x%llux nbuf=%ud addr+nbuf=0x%llux naddr=0x%llux\n", + addr, nbuf, addr+nbuf, naddr); + assert(off+bsize <= nbuf); + } + unpackibucket(&ib, buf+off, is->bucketmagic); + if(okibucket(&ib, is) < 0){ + fprint(2, "bad bucket XXX\n"); + goto skipit; + } + trace(TraceProc, "icachewritesect add %V at 0x%llux", ie->score, naddr); + h = bucklook(ie->score, ie->ia.type, ib.data, ib.n); + if(h & 1){ + h ^= 1; + packientry(ie, &ib.data[h]); + }else if(ib.n < is->buckmax){ + memmove(&ib.data[h+IEntrySize], &ib.data[h], ib.n*IEntrySize - h); + ib.n++; + packientry(ie, &ib.data[h]); + }else{ + fprint(2, "bucket overflow XXX\n"); + skipit: + err = -1; + *l = ie->nextdirty; + ie = *l; + if(ie) + goto again; + else + break; + } + packibucket(&ib, buf+off, is->bucketmagic); + if((b = _getdblock(is->part, naddr, ORDWR, 0)) != nil){ + memmove(b->data, buf+off, bsize); + putdblock(b); + } + } + + trace(TraceProc, "icachewritesect writepart", addr, nbuf); + if(writepart(is->part, addr, buf, nbuf) < 0){ + // XXX + fprint(2, "icachewriteproc writepart: %r\n"); 
+ err = -1; + continue; + } + addstat(StatIsectWriteBytes, nbuf); + addstat(StatIsectWrite, 1); + icacheclean(chunk); + } + + trace(TraceProc, "icachewritesect done"); + return err; +} + +static void +icachewriteproc(void *v) +{ + uint bsize; + ISect *is; + Index *ix; + u8int *buf; + + ix = mainindex; + is = v; + threadsetname("icachewriteproc:%s", is->part->name); + + bsize = 1<<is->blocklog; + buf = emalloc(Bufsize+bsize); + buf = (u8int*)(((ulong)buf+bsize-1)&~(ulong)(bsize-1)); + + for(;;){ + trace(TraceProc, "icachewriteproc recv"); + recv(is->writechan, 0); + trace(TraceWork, "start"); + icachewritesect(ix, is, buf); + trace(TraceProc, "icachewriteproc send"); + trace(TraceWork, "finish"); + send(is->writedonechan, 0); + } +} + +static void +icachewritecoord(void *v) +{ + int i; + Index *ix; + AState as; + + USED(v); + + threadsetname("icachewritecoord"); + + ix = mainindex; + iwrite.as = diskstate(); + + for(;;){ + trace(TraceProc, "icachewritecoord sleep"); + waitforkick(&iwrite.round); + trace(TraceWork, "start"); + as = diskstate(); + if(as.arena==iwrite.as.arena && as.aa==iwrite.as.aa){ + /* will not be able to do anything more than last flush - kick disk */ + trace(TraceProc, "icachewritecoord flush dcache"); + kickdcache(); + trace(TraceProc, "icachewritecoord flushed dcache"); + } + iwrite.as = as; + + trace(TraceProc, "icachewritecoord start flush"); + if(iwrite.as.arena){ + for(i=0; i<ix->nsects; i++) + send(ix->sects[i]->writechan, 0); + if(ix->bloom) + send(ix->bloom->writechan, 0); + + for(i=0; i<ix->nsects; i++) + recv(ix->sects[i]->writedonechan, 0); + if(ix->bloom) + recv(ix->bloom->writedonechan, 0); + + trace(TraceProc, "icachewritecoord donewrite"); + setatailstate(&iwrite.as); + } + icacheclean(nil); /* wake up anyone waiting */ + trace(TraceWork, "finish"); + addstat(StatIcacheFlush, 1); + } +} + +void +flushicache(void) +{ + trace(TraceProc, "flushicache enter"); + kickround(&iwrite.round, 1); + trace(TraceProc, "flushicache exit"); +} + 
+void +kickicache(void) +{ + kickround(&iwrite.round, 0); +} + +void +delaykickicache(void) +{ + delaykickround(&iwrite.round); +} + +static IEntry* +iesort(IEntry *ie) +{ + int cmp; + IEntry **l; + IEntry *ie1, *ie2, *sorted; + + if(ie == nil || ie->nextdirty == nil) + return ie; + + /* split the lists */ + ie1 = ie; + ie2 = ie; + if(ie2) + ie2 = ie2->nextdirty; + if(ie2) + ie2 = ie2->nextdirty; + while(ie1 && ie2){ + ie1 = ie1->nextdirty; + ie2 = ie2->nextdirty; + if(ie2) + ie2 = ie2->nextdirty; + } + if(ie1){ + ie2 = ie1->nextdirty; + ie1->nextdirty = nil; + } + + /* sort the lists */ + ie1 = iesort(ie); + ie2 = iesort(ie2); + + /* merge the lists */ + sorted = nil; + l = &sorted; + cmp = 0; + while(ie1 || ie2){ + if(ie1 && ie2) + cmp = scorecmp(ie1->score, ie2->score); + if(ie1==nil || (ie2 && cmp > 0)){ + *l = ie2; + l = &ie2->nextdirty; + ie2 = ie2->nextdirty; + }else{ + *l = ie1; + l = &ie1->nextdirty; + ie1 = ie1->nextdirty; + } + } + *l = nil; + return sorted; +} + diff --git a/src/cmd/venti/srv/ifile.c b/src/cmd/venti/srv/ifile.c new file mode 100644 index 00000000..fc784c9b --- /dev/null +++ b/src/cmd/venti/srv/ifile.c @@ -0,0 +1,93 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int +readifile(IFile *f, char *name) +{ + ZBlock *b; + + b = readfile(name); + if(b == nil) + return -1; + f->name = name; + f->b = b; + f->pos = 0; + return 0; +} + +void +freeifile(IFile *f) +{ + freezblock(f->b); + f->b = nil; + f->pos = 0; +} + +int +partifile(IFile *f, Part *part, u64int start, u32int size) +{ + ZBlock *b; + + b = alloczblock(size, 0, part->blocksize); + if(b == nil) + return -1; + if(readpart(part, start, b->data, size) < 0){ + seterr(EAdmin, "can't read %s: %r", part->name); + freezblock(b); + return -1; + } + f->name = part->name; + f->b = b; + f->pos = 0; + return 0; +} + +/* + * return the next non-blank input line, + * stripped of leading white space and with # comments eliminated + */ +char* +ifileline(IFile *f) +{ + char *s, *e, *t; + 
int c; + + for(;;){ + s = (char*)&f->b->data[f->pos]; + e = memchr(s, '\n', f->b->len - f->pos); + if(e == nil) + return nil; + *e++ = '\0'; + f->pos = e - (char*)f->b->data; + t = strchr(s, '#'); + if(t != nil) + *t = '\0'; + for(; c = *s; s++) + if(c != ' ' && c != '\t' && c != '\r') + return s; + } +} + +int +ifilename(IFile *f, char *dst) +{ + char *s; + + s = ifileline(f); + if(s == nil || strlen(s) >= ANameSize) + return -1; + namecp(dst, s); + return 0; +} + +int +ifileu32int(IFile *f, u32int *r) +{ + char *s; + + s = ifileline(f); + if(s == nil) + return -1; + return stru32int(s, r); +} diff --git a/src/cmd/venti/srv/index.c b/src/cmd/venti/srv/index.c new file mode 100644 index 00000000..46bf91e2 --- /dev/null +++ b/src/cmd/venti/srv/index.c @@ -0,0 +1,819 @@ +/* + * Index, mapping scores to log positions. + * + * The index is made up of some number of index sections, each of + * which is typically stored on a different disk. The blocks in all the + * index sections are logically numbered, with each index section + * responsible for a range of blocks. Blocks are typically 8kB. + * + * The N index blocks are treated as a giant hash table. The top 32 bits + * of score are used as the key for a lookup. Each index block holds + * one hash bucket, which is responsible for ceil(2^32 / N) of the key space. + * + * The index is sized so that a particular bucket is extraordinarily + * unlikely to overflow: assuming compressed data blocks are 4kB + * on disk, and assuming each block has a 40 byte index entry, + * the index data will be 1% of the total data. Since scores are essentially + * random, all buckets should be about the same fullness. + * A factor of 5 gives us a wide comfort boundary to account for + * random variation. So the index disk space should be 5% of the arena disk space. 
+ */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +//static int bucklook(u8int *score, int type, u8int *data, int n); +//static int writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b); +//static int okibucket(IBucket *ib, ISect *is); +static int initindex1(Index*); +static ISect *initisect1(ISect *is); +//static int splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib); + +#define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0) + +//static QLock indexlock; //ZZZ + +static char IndexMagic[] = "venti index configuration"; + +Index* +initindex(char *name, ISect **sects, int n) +{ + IFile f; + Index *ix; + ISect *is; + u32int last, blocksize, tabsize; + int i; + + if(n <= 0){ +fprint(2, "bad n\n"); + seterr(EOk, "no index sections to initialize index"); + return nil; + } + ix = MKZ(Index); + if(ix == nil){ +fprint(2, "no mem\n"); + seterr(EOk, "can't initialize index: out of memory"); + freeindex(ix); + return nil; + } + + tabsize = sects[0]->tabsize; + if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0) + return nil; + if(parseindex(&f, ix) < 0){ + freeifile(&f); + freeindex(ix); + return nil; + } + freeifile(&f); + if(namecmp(ix->name, name) != 0){ + seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name); + return nil; + } + if(ix->nsects != n){ + seterr(ECorrupt, "mismatched number index sections: found %d expected %d", n, ix->nsects); + freeindex(ix); + return nil; + } + ix->sects = sects; + last = 0; + blocksize = ix->blocksize; + for(i = 0; i < ix->nsects; i++){ + is = sects[i]; + if(namecmp(ix->name, is->index) != 0 + || is->blocksize != blocksize + || is->tabsize != tabsize + || namecmp(is->name, ix->smap[i].name) != 0 + || is->start != ix->smap[i].start + || is->stop != ix->smap[i].stop + || last != is->start + || is->start > is->stop){ + seterr(ECorrupt, "inconsistent index sections in %s", ix->name); + freeindex(ix); + return nil; + } + last = is->stop; + } + ix->tabsize = tabsize; + 
ix->buckets = last; + + if(initindex1(ix) < 0){ + freeindex(ix); + return nil; + } + + ix->arenas = MKNZ(Arena*, ix->narenas); + if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){ + freeindex(ix); + return nil; + } + + return ix; +} + +static int +initindex1(Index *ix) +{ + u32int buckets; + + ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets; + buckets = (((u64int)1 << 32) - 1) / ix->div + 1; + if(buckets != ix->buckets){ + seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name); + return -1; + } + + return 0; +} + +int +wbindex(Index *ix) +{ + Fmt f; + ZBlock *b; + int i; + + if(ix->nsects == 0){ + seterr(EOk, "no sections in index %s", ix->name); + return -1; + } + b = alloczblock(ix->tabsize, 1, ix->blocksize); + if(b == nil){ + seterr(EOk, "can't write index configuration: out of memory"); + return -1; + } + fmtzbinit(&f, b); + if(outputindex(&f, ix) < 0){ + seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize); + freezblock(b); + return -1; + } + for(i = 0; i < ix->nsects; i++){ + if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0){ + seterr(EOk, "can't write index: %r"); + freezblock(b); + return -1; + } + } + freezblock(b); + + for(i = 0; i < ix->nsects; i++) + if(wbisect(ix->sects[i]) < 0) + return -1; + + return 0; +} + +/* + * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas + * version, blocksize: u32int + * name: max. 
ANameSize string + * sections, arenas: AMap + */ +int +outputindex(Fmt *f, Index *ix) +{ + if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0 + || outputamap(f, ix->smap, ix->nsects) < 0 + || outputamap(f, ix->amap, ix->narenas) < 0) + return -1; + return 0; +} + +int +parseindex(IFile *f, Index *ix) +{ + AMapN amn; + u32int v; + char *s; + + /* + * magic + */ + s = ifileline(f); + if(s == nil || strcmp(s, IndexMagic) != 0){ + seterr(ECorrupt, "bad index magic for %s", f->name); + return -1; + } + + /* + * version + */ + if(ifileu32int(f, &v) < 0){ + seterr(ECorrupt, "syntax error: bad version number in %s", f->name); + return -1; + } + ix->version = v; + if(ix->version != IndexVersion){ + seterr(ECorrupt, "bad version number in %s", f->name); + return -1; + } + + /* + * name + */ + if(ifilename(f, ix->name) < 0){ + seterr(ECorrupt, "syntax error: bad index name in %s", f->name); + return -1; + } + + /* + * block size + */ + if(ifileu32int(f, &v) < 0){ + seterr(ECorrupt, "syntax error: bad block size number in %s", f->name); + return -1; + } + ix->blocksize = v; + + if(parseamap(f, &amn) < 0) + return -1; + ix->nsects = amn.n; + ix->smap = amn.map; + + if(parseamap(f, &amn) < 0) + return -1; + ix->narenas = amn.n; + ix->amap = amn.map; + + return 0; +} + +/* + * initialize an entirely new index + */ +Index * +newindex(char *name, ISect **sects, int n) +{ + Index *ix; + AMap *smap; + u64int nb; + u32int div, ub, xb, fb, start, stop, blocksize, tabsize; + int i, j; + + if(n < 1){ + seterr(EOk, "creating index with no index sections"); + return nil; + } + + /* + * compute the total buckets available in the index, + * and the total buckets which are used. 
+ */ + nb = 0; + blocksize = sects[0]->blocksize; + tabsize = sects[0]->tabsize; + for(i = 0; i < n; i++){ + if(sects[i]->start != 0 || sects[i]->stop != 0 + || sects[i]->index[0] != '\0'){ + seterr(EOk, "creating new index using non-empty section %s", sects[i]->name); + return nil; + } + if(blocksize != sects[i]->blocksize){ + seterr(EOk, "mismatched block sizes in index sections"); + return nil; + } + if(tabsize != sects[i]->tabsize){ + seterr(EOk, "mismatched config table sizes in index sections"); + return nil; + } + nb += sects[i]->blocks; + } + + /* + * check for duplicate names + */ + for(i = 0; i < n; i++){ + for(j = i + 1; j < n; j++){ + if(namecmp(sects[i]->name, sects[j]->name) == 0){ + seterr(EOk, "duplicate section name %s for index %s", sects[i]->name, name); + return nil; + } + } + } + + if(nb >= ((u64int)1 << 32)){ + seterr(EBug, "index too large"); + return nil; + } + + fb = 0; + div = (((u64int)1 << 32) + nb - 1) / nb; + ub = (((u64int)1 << 32) - 1) / div + 1; + if(div < 100){ + seterr(EBug, "index divisor too coarse"); + return nil; + } + if(ub > nb){ + seterr(EBug, "index initialization math wrong"); + return nil; + } + xb = nb - ub; + + /* + * initialize each of the index sections + * and the section map table + */ + smap = MKNZ(AMap, n); + if(smap == nil){ + seterr(EOk, "can't create new index: out of memory"); + return nil; + } + start = 0; + for(i = 0; i < n; i++){ + stop = start + sects[i]->blocks - xb / n; + if(i == n - 1) + stop = ub; + sects[i]->start = start; + sects[i]->stop = stop; + namecp(sects[i]->index, name); + + smap[i].start = start; + smap[i].stop = stop; + namecp(smap[i].name, sects[i]->name); + start = stop; + } + + /* + * initialize the index itself + */ + ix = MKZ(Index); + if(ix == nil){ + seterr(EOk, "can't create new index: out of memory"); + free(smap); + return nil; + } + ix->version = IndexVersion; + namecp(ix->name, name); + ix->sects = sects; + ix->smap = smap; + ix->nsects = n; + ix->blocksize = blocksize; + 
ix->buckets = ub; + ix->tabsize = tabsize; + ix->div = div; + ix->bitblocks = fb; + + if(initindex1(ix) < 0){ + free(smap); + return nil; + } + + return ix; +} + +ISect* +initisect(Part *part) +{ + ISect *is; + ZBlock *b; + int ok; + + b = alloczblock(HeadSize, 0, 0); + if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){ + seterr(EAdmin, "can't read index section header: %r"); + return nil; + } + + is = MKZ(ISect); + if(is == nil){ + freezblock(b); + return nil; + } + is->part = part; + ok = unpackisect(is, b->data); + freezblock(b); + if(ok < 0){ + seterr(ECorrupt, "corrupted index section header: %r"); + freeisect(is); + return nil; + } + + if(is->version != ISectVersion1 && is->version != ISectVersion2){ + seterr(EAdmin, "unknown index section version %d", is->version); + freeisect(is); + return nil; + } + + return initisect1(is); +} + +ISect* +newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize) +{ + ISect *is; + u32int tabbase; + + is = MKZ(ISect); + if(is == nil) + return nil; + + namecp(is->name, name); + is->version = vers; + is->part = part; + is->blocksize = blocksize; + is->start = 0; + is->stop = 0; + tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1); + is->blockbase = (tabbase + tabsize + blocksize - 1) & ~(blocksize - 1); + is->blocks = is->part->size / blocksize - is->blockbase / blocksize; + is->bucketmagic = 0; + if(is->version == ISectVersion2){ + do{ + is->bucketmagic = fastrand(); + }while(is->bucketmagic==0); + } + is = initisect1(is); + if(is == nil) + return nil; + + return is; +} + +/* + * initialize the computed parameters for an index + */ +static ISect* +initisect1(ISect *is) +{ + u64int v; + + is->buckmax = (is->blocksize - IBucketSize) / IEntrySize; + is->blocklog = u64log2(is->blocksize); + if(is->blocksize != (1 << is->blocklog)){ + seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize); + freeisect(is); + return nil; + } + partblocksize(is->part, 
is->blocksize); + is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1); + if(is->tabbase >= is->blockbase){ + seterr(ECorrupt, "index section config table overlaps bucket storage"); + freeisect(is); + return nil; + } + is->tabsize = is->blockbase - is->tabbase; + v = is->part->size & ~(u64int)(is->blocksize - 1); + if(is->blockbase + (u64int)is->blocks * is->blocksize != v){ + seterr(ECorrupt, "invalid blocks in index section %s", is->name); +//ZZZZZZZZZ +// freeisect(is); +// return nil; + } + + if(is->stop - is->start > is->blocks){ + seterr(ECorrupt, "index section overflows available space"); + freeisect(is); + return nil; + } + if(is->start > is->stop){ + seterr(ECorrupt, "invalid index section range"); + freeisect(is); + return nil; + } + + return is; +} + +int +wbisect(ISect *is) +{ + ZBlock *b; + + b = alloczblock(HeadSize, 1, 0); + if(b == nil) +//ZZZ set error? + return -1; + + if(packisect(is, b->data) < 0){ + seterr(ECorrupt, "can't make index section header: %r"); + freezblock(b); + return -1; + } + if(writepart(is->part, PartBlank, b->data, HeadSize) < 0){ + seterr(EAdmin, "can't write index section header: %r"); + freezblock(b); + return -1; + } + freezblock(b); + + return 0; +} + +void +freeisect(ISect *is) +{ + if(is == nil) + return; + free(is); +} + +void +freeindex(Index *ix) +{ + int i; + + if(ix == nil) + return; + free(ix->amap); + free(ix->arenas); + if(ix->sects) + for(i = 0; i < ix->nsects; i++) + freeisect(ix->sects[i]); + free(ix->sects); + free(ix->smap); + free(ix); +} + +/* + * write a clump to an available arena in the index + * and return the address of the clump within the index. +ZZZ question: should this distinguish between an arena +filling up and real errors writing the clump? 
+ */ +u64int +writeiclump(Index *ix, Clump *c, u8int *clbuf, u64int *pa) +{ + u64int a; + int i; + + trace(TraceLump, "writeiclump enter"); + for(i = ix->mapalloc; i < ix->narenas; i++){ + a = writeaclump(ix->arenas[i], c, clbuf, ix->amap[i].start, pa); + if(a != TWID64){ + ix->mapalloc = i; /* assuming write is atomic, race is okay */ + trace(TraceLump, "writeiclump exit"); + return a; + } + } + + seterr(EAdmin, "no space left in arenas"); + trace(TraceLump, "writeiclump failed"); + return TWID64; +} + +/* + * convert an arena index to an relative arena address + */ +Arena* +amapitoa(Index *ix, u64int a, u64int *aa) +{ + int i, r, l, m; + + l = 1; + r = ix->narenas - 1; + while(l <= r){ + m = (r + l) / 2; + if(ix->amap[m].start <= a) + l = m + 1; + else + r = m - 1; + } + l--; + + if(a > ix->amap[l].stop){ +for(i=0; i<ix->narenas; i++) + print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop); +print("want arena %d for %llux\n", l, a); + seterr(ECrash, "unmapped address passed to amapitoa"); + return nil; + } + + if(ix->arenas[l] == nil){ + seterr(ECrash, "unmapped arena selected in amapitoa"); + return nil; + } + *aa = a - ix->amap[l].start; + return ix->arenas[l]; +} + +int +iaddrcmp(IAddr *ia1, IAddr *ia2) +{ + return ia1->type != ia2->type + || ia1->size != ia2->size + || ia1->blocks != ia2->blocks + || ia1->addr != ia2->addr; +} + +/* + * lookup the score in the partition + * + * nothing needs to be explicitly locked: + * only static parts of ix are used, and + * the bucket is locked by the DBlock lock. 
+ */ +int +loadientry(Index *ix, u8int *score, int type, IEntry *ie) +{ + ISect *is; + DBlock *b; + IBucket ib; + u32int buck; + int h, ok; + + ok = -1; + + trace(TraceLump, "loadientry enter"); + + /* + qlock(&stats.lock); + stats.indexreads++; + qunlock(&stats.lock); + */ + + if(!inbloomfilter(mainindex->bloom, score)){ + trace(TraceLump, "loadientry bloomhit"); + return -1; + } + + trace(TraceLump, "loadientry loadibucket"); + b = loadibucket(ix, score, &is, &buck, &ib); + trace(TraceLump, "loadientry loadedibucket"); + if(b == nil) + return -1; + + if(okibucket(&ib, is) < 0){ + trace(TraceLump, "loadientry badbucket"); + goto out; + } + + h = bucklook(score, type, ib.data, ib.n); + if(h & 1){ + h ^= 1; + trace(TraceLump, "loadientry found"); + unpackientry(ie, &ib.data[h]); + ok = 0; + goto out; + } + trace(TraceLump, "loadientry notfound"); + addstat(StatBloomFalseMiss, 1); +out: + putdblock(b); + trace(TraceLump, "loadientry exit"); + return ok; +} + +int +okibucket(IBucket *ib, ISect *is) +{ + if(ib->n <= is->buckmax) + return 0; + + seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, range=[%lud,%lud)", + ib->n, is->buckmax, is->start, is->stop); + return -1; +} + +/* + * look for score within data; + * return 1 | byte index of matching index, + * or 0 | index of least element > score + */ +int +bucklook(u8int *score, int otype, u8int *data, int n) +{ + int i, r, l, m, h, c, cc, type; + + type = vttodisktype(otype); + l = 0; + r = n - 1; + while(l <= r){ + m = (r + l) >> 1; + h = m * IEntrySize; + for(i = 0; i < VtScoreSize; i++){ + c = score[i]; + cc = data[h + i]; + if(c != cc){ + if(c > cc) + l = m + 1; + else + r = m - 1; + goto cont; + } + } + cc = data[h + IEntryTypeOff]; + if(type != cc){ + if(type > cc) + l = m + 1; + else + r = m - 1; + goto cont; + } + return h | 1; + cont:; + } + + return l * IEntrySize; +} + +/* + * compare two IEntries; consistent with bucklook + */ +int +ientrycmp(const void *vie1, const void *vie2) +{ + u8int *ie1, 
*ie2; + int i, v1, v2; + + ie1 = (u8int*)vie1; + ie2 = (u8int*)vie2; + for(i = 0; i < VtScoreSize; i++){ + v1 = ie1[i]; + v2 = ie2[i]; + if(v1 != v2){ + if(v1 < v2) + return -1; + return 1; + } + } + v1 = ie1[IEntryTypeOff]; + v2 = ie2[IEntryTypeOff]; + if(v1 != v2){ + if(v1 < v2) + return -1; + return 1; + } + return 0; +} + +/* + * find the number of the index section holding bucket #buck + */ +int +indexsect0(Index *ix, u32int buck) +{ + int r, l, m; + + l = 1; + r = ix->nsects - 1; + while(l <= r){ + m = (r + l) >> 1; + if(ix->sects[m]->start <= buck) + l = m + 1; + else + r = m - 1; + } + return l - 1; +} + +/* + * load the index block at bucket #buck + */ +static DBlock* +loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int mode) +{ + ISect *is; + DBlock *b; + + is = ix->sects[indexsect0(ix, buck)]; + if(buck < is->start || is->stop <= buck){ + seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck); + return nil; + } + + buck -= is->start; + if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), mode)) == nil) + return nil; + + if(pis) + *pis = is; + if(pbuck) + *pbuck = buck; + if(ib) + unpackibucket(ib, b->data, is->bucketmagic); + return b; +} + +/* + * find the number of the index section holding score + */ +static int +indexsect1(Index *ix, u8int *score) +{ + return indexsect0(ix, hashbits(score, 32) / ix->div); +} + +/* + * load the index block responsible for score. 
+ */ +static DBlock* +loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib) +{ + return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, OREAD); +} + +int +indexsect(Index *ix, u8int *score) +{ + return indexsect1(ix, score); +} + +DBlock* +loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib) +{ + return loadibucket1(ix, score, pis, pbuck, ib); +} + + diff --git a/src/cmd/venti/srv/lump.c b/src/cmd/venti/srv/lump.c new file mode 100644 index 00000000..d1e58a6d --- /dev/null +++ b/src/cmd/venti/srv/lump.c @@ -0,0 +1,249 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int queuewrites = 0; +int writestodevnull = 0; + +static Packet *readilump(Lump *u, IAddr *ia, u8int *score, int rac); + +Packet* +readlump(u8int *score, int type, u32int size, int *cached) +{ + Lump *u; + Packet *p; + IAddr ia; + u32int n; + int rac; + + trace(TraceLump, "readlump enter"); +/* + qlock(&stats.lock); + stats.lumpreads++; + qunlock(&stats.lock); +*/ + if(scorecmp(score, zeroscore) == 0) + return packetalloc(); + u = lookuplump(score, type); + if(u->data != nil){ + trace(TraceLump, "readlump lookuplump hit"); + if(cached) + *cached = 1; + n = packetsize(u->data); + if(n > size){ + seterr(EOk, "read too small: asked for %d need at least %d", size, n); + putlump(u); + + return nil; + } + p = packetdup(u->data, 0, n); + putlump(u); + return p; + } + + if(cached) + *cached = 0; + + if(lookupscore(score, type, &ia, &rac) < 0){ + //ZZZ place to check for someone trying to guess scores + seterr(EOk, "no block with score %V/%d exists", score, type); + + putlump(u); + return nil; + } + if(ia.size > size){ + seterr(EOk, "read too small 1: asked for %d need at least %d", size, ia.size); + + putlump(u); + return nil; + } + + trace(TraceLump, "readlump readilump"); + p = readilump(u, &ia, score, rac); + putlump(u); + + trace(TraceLump, "readlump exit"); + return p; +} + +/* + * save away a lump, and return it's score. 
+ * doesn't store duplicates, but checks that the data is really the same. + */ +int +writelump(Packet *p, u8int *score, int type, u32int creator, uint ms) +{ + Lump *u; + int ok; + +/* + qlock(&stats.lock); + stats.lumpwrites++; + qunlock(&stats.lock); +*/ + + packetsha1(p, score); + if(packetsize(p) == 0 || writestodevnull==1){ + packetfree(p); + return 0; + } + + u = lookuplump(score, type); + if(u->data != nil){ + ok = 0; + if(packetcmp(p, u->data) != 0){ + seterr(EStrange, "score collision"); + ok = -1; + } + packetfree(p); + putlump(u); + return ok; + } + + if(writestodevnull==2){ + packetfree(p); + return 0; + } + + if(queuewrites) + return queuewrite(u, p, creator, ms); + + ok = writeqlump(u, p, creator, ms); + + putlump(u); + return ok; +} + +int +writeqlump(Lump *u, Packet *p, int creator, uint ms) +{ + ZBlock *flat; + Packet *old; + IAddr ia; + int ok; + int rac; + + if(lookupscore(u->score, u->type, &ia, &rac) == 0){ + /* assume the data is here! XXX */ + packetfree(p); + ms = msec() - ms; + addstat2(StatRpcWriteOld, 1, StatRpcWriteOldTime, ms); + return 0; + + /* + * if the read fails, + * assume it was corrupted data and store the block again + */ + old = readilump(u, &ia, u->score, rac); + if(old != nil){ + ok = 0; + if(packetcmp(p, old) != 0){ + seterr(EStrange, "score collision"); + ok = -1; + } + packetfree(p); + packetfree(old); + + ms = msec() - ms; + addstat2(StatRpcWriteOld, 1, StatRpcWriteOldTime, ms); + return ok; + } + logerr(EAdmin, "writelump: read %V failed, rewriting: %r\n", u->score); + } + + flat = packet2zblock(p, packetsize(p)); + ok = storeclump(mainindex, flat, u->score, u->type, creator, &ia); + freezblock(flat); + if(ok == 0) + ok = insertscore(u->score, &ia, 1); + if(ok == 0) + insertlump(u, p); + else + packetfree(p); + + ms = msec() - ms; + addstat2(StatRpcWriteNew, 1, StatRpcWriteNewTime, ms); + return ok; +} + +static void +lreadahead(u64int a, Arena *arena, u64int aa, int n) +{ + u8int buf[ClumpSize]; + Clump cl; + IAddr 
ia; + + while(n > 0) { + if (aa >= arena->memstats.used) + break; + if(readarena(arena, aa, buf, ClumpSize) < ClumpSize) + break; + if(unpackclump(&cl, buf, arena->clumpmagic) < 0) + break; + ia.addr = a; + ia.type = cl.info.type; + ia.size = cl.info.uncsize; + ia.blocks = (cl.info.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; + insertscore(cl.info.score, &ia, 0); + a += ClumpSize + cl.info.size; + aa += ClumpSize + cl.info.size; + n--; + } +} + +static Packet* +readilump(Lump *u, IAddr *ia, u8int *score, int rac) +{ + Arena *arena; + ZBlock *zb; + Packet *p, *pp; + Clump cl; + u64int a, aa; + u8int sc[VtScoreSize]; + + trace(TraceLump, "readilump enter"); + arena = amapitoa(mainindex, ia->addr, &aa); + if(arena == nil){ + trace(TraceLump, "readilump amapitoa failed"); + return nil; + } + + trace(TraceLump, "readilump loadclump"); + zb = loadclump(arena, aa, ia->blocks, &cl, sc, paranoid); + if(zb == nil){ + trace(TraceLump, "readilump loadclump failed"); + return nil; + } + + if(ia->size != cl.info.uncsize){ + seterr(EInconsist, "index and clump size mismatch"); + freezblock(zb); + return nil; + } + if(ia->type != cl.info.type){ + seterr(EInconsist, "index and clump type mismatch"); + freezblock(zb); + return nil; + } + if(scorecmp(score, sc) != 0){ + seterr(ECrash, "score mismatch"); + freezblock(zb); + return nil; + } + + if(rac == 0) { + trace(TraceLump, "readilump readahead"); + a = ia->addr + ClumpSize + cl.info.size; + aa += ClumpSize + cl.info.size; + lreadahead(a, arena, aa, 20); + } + + trace(TraceLump, "readilump success"); + p = zblock2packet(zb, cl.info.uncsize); + freezblock(zb); + pp = packetdup(p, 0, packetsize(p)); + trace(TraceLump, "readilump insertlump"); + insertlump(u, pp); + trace(TraceLump, "readilump exit"); + return p; +} diff --git a/src/cmd/venti/srv/lumpcache.c b/src/cmd/venti/srv/lumpcache.c new file mode 100644 index 00000000..8a1e2d95 --- /dev/null +++ b/src/cmd/venti/srv/lumpcache.c @@ -0,0 +1,417 @@ +#include "stdinc.h" 
+#include "dat.h" +#include "fns.h" + +/* #define CHECK(x) x */ +#define CHECK(x) + +typedef struct LumpCache LumpCache; + +enum +{ + HashLog = 9, + HashSize = 1<<HashLog, + HashMask = HashSize - 1, +}; + +struct LumpCache +{ + QLock lock; + Rendez full; + Lump *free; /* list of available lumps */ + u32int allowed; /* total allowable space for packets */ + u32int avail; /* remaining space for packets */ + u32int now; /* ticks for usage timestamps */ + Lump **heads; /* hash table for finding address */ + int nheap; /* number of available victims */ + Lump **heap; /* heap for locating victims */ + int nblocks; /* number of blocks allocated */ + Lump *blocks; /* array of block descriptors */ +}; + +static LumpCache lumpcache; + +static void delheap(Lump *db); +static int downheap(int i, Lump *b); +static void fixheap(int i, Lump *b); +static int upheap(int i, Lump *b); +static Lump *bumplump(void); + +void +initlumpcache(u32int size, u32int nblocks) +{ + Lump *last, *b; + int i; + + lumpcache.full.l = &lumpcache.lock; + lumpcache.nblocks = nblocks; + lumpcache.allowed = size; + lumpcache.avail = size; + lumpcache.heads = MKNZ(Lump*, HashSize); + lumpcache.heap = MKNZ(Lump*, nblocks); + lumpcache.blocks = MKNZ(Lump, nblocks); + setstat(StatLcacheSize, lumpcache.nblocks); + + last = nil; + for(i = 0; i < nblocks; i++){ + b = &lumpcache.blocks[i]; + b->type = TWID8; + b->heap = TWID32; + b->next = last; + last = b; + } + lumpcache.free = last; + lumpcache.nheap = 0; +} + +Lump* +lookuplump(u8int *score, int type) +{ + uint ms; + Lump *b; + u32int h; + + ms = msec(); + trace(TraceLump, "lookuplump enter"); + + h = hashbits(score, HashLog); + + /* + * look for the block in the cache + */ + qlock(&lumpcache.lock); + CHECK(checklumpcache()); +again: + for(b = lumpcache.heads[h]; b != nil; b = b->next){ + if(scorecmp(score, b->score)==0 && type == b->type){ + addstat(StatLcacheHit, 1); + trace(TraceLump, "lookuplump hit"); + goto found; + } + } + + trace(TraceLump, 
"lookuplump miss"); + + /* + * missed: locate the block with the oldest second to last use. + * remove it from the heap, and fix up the heap. + */ + while(lumpcache.free == nil){ + trace(TraceLump, "lookuplump bump"); + CHECK(checklumpcache()); + if(bumplump() == nil){ + CHECK(checklumpcache()); + logerr(EAdmin, "all lump cache blocks in use"); + addstat(StatLcacheStall, 1); + CHECK(checklumpcache()); + rsleep(&lumpcache.full); + CHECK(checklumpcache()); + addstat(StatLcacheStall, -1); + goto again; + } + CHECK(checklumpcache()); + } + + addstat(StatLcacheMiss, 1); + b = lumpcache.free; + lumpcache.free = b->next; + + /* + * the new block has no last use, so assume it happens sometime in the middle +ZZZ this is not reasonable + */ + b->used = (b->used2 + lumpcache.now) / 2; + + /* + * rechain the block on the correct hash chain + */ + b->next = lumpcache.heads[h]; + lumpcache.heads[h] = b; + if(b->next != nil) + b->next->prev = b; + b->prev = nil; + + scorecp(b->score, score); + b->type = type; + b->size = 0; + b->data = nil; + +found: + b->ref++; + b->used2 = b->used; + b->used = lumpcache.now++; + if(b->heap != TWID32) + fixheap(b->heap, b); + CHECK(checklumpcache()); + qunlock(&lumpcache.lock); + + + addstat(StatLumpStall, 1); + qlock(&b->lock); + addstat(StatLumpStall, -1); + + trace(TraceLump, "lookuplump exit"); + addstat2(StatLcacheRead, 1, StatLcacheReadTime, msec()-ms); + return b; +} + +void +insertlump(Lump *b, Packet *p) +{ + u32int size; + + /* + * look for the block in the cache + */ + trace(TraceLump, "insertlump enter"); + qlock(&lumpcache.lock); + CHECK(checklumpcache()); +again: + + addstat(StatLcacheWrite, 1); + + /* + * missed: locate the block with the oldest second to last use. + * remove it from the heap, and fix up the heap. 
+ */ + size = packetasize(p); +//ZZZ + while(lumpcache.avail < size){ + trace(TraceLump, "insertlump bump"); + CHECK(checklumpcache()); + if(bumplump() == nil){ + logerr(EAdmin, "all lump cache blocks in use"); + addstat(StatLcacheStall, 1); + CHECK(checklumpcache()); + rsleep(&lumpcache.full); + CHECK(checklumpcache()); + addstat(StatLcacheStall, -1); + goto again; + } + CHECK(checklumpcache()); + } + b->data = p; + b->size = size; + lumpcache.avail -= size; + CHECK(checklumpcache()); + qunlock(&lumpcache.lock); + trace(TraceLump, "insertlump exit"); +} + +void +putlump(Lump *b) +{ + if(b == nil) + return; + + trace(TraceLump, "putlump"); + qunlock(&b->lock); + qlock(&lumpcache.lock); + CHECK(checklumpcache()); + if(--b->ref == 0){ + if(b->heap == TWID32) + upheap(lumpcache.nheap++, b); + trace(TraceLump, "putlump wakeup"); + rwakeupall(&lumpcache.full); + } + CHECK(checklumpcache()); + qunlock(&lumpcache.lock); +} + +/* + * remove some lump from use and update the free list and counters + */ +static Lump* +bumplump(void) +{ + Lump *b; + u32int h; + + /* + * remove blocks until we find one that is unused + * referenced blocks are left in the heap even though + * they can't be scavenged; this is simple a speed optimization + */ + CHECK(checklumpcache()); + for(;;){ + if(lumpcache.nheap == 0){ + trace(TraceLump, "bumplump emptyheap"); + return nil; + } + b = lumpcache.heap[0]; + delheap(b); + if(!b->ref){ + trace(TraceLump, "bumplump wakeup"); + rwakeupall(&lumpcache.full); + break; + } + } + + /* + * unchain the block + */ + trace(TraceLump, "bumplump unchain"); + if(b->prev == nil){ + h = hashbits(b->score, HashLog); + if(lumpcache.heads[h] != b) + sysfatal("bad hash chains in lump cache"); + lumpcache.heads[h] = b->next; + }else + b->prev->next = b->next; + if(b->next != nil) + b->next->prev = b->prev; + + if(b->data != nil){ + packetfree(b->data); + b->data = nil; + lumpcache.avail += b->size; + b->size = 0; + } + b->type = TWID8; + + b->next = lumpcache.free; + 
lumpcache.free = b; + + CHECK(checklumpcache()); + trace(TraceLump, "bumplump exit"); + return b; +} + +/* + * delete an arbitrary block from the heap + */ +static void +delheap(Lump *db) +{ + fixheap(db->heap, lumpcache.heap[--lumpcache.nheap]); + db->heap = TWID32; +} + +/* + * push an element up or down to it's correct new location + */ +static void +fixheap(int i, Lump *b) +{ + if(upheap(i, b) == i) + downheap(i, b); +} + +static int +upheap(int i, Lump *b) +{ + Lump *bb; + u32int now; + int p; + + now = lumpcache.now; + for(; i != 0; i = p){ + p = (i - 1) >> 1; + bb = lumpcache.heap[p]; + if(b->used2 - now >= bb->used2 - now) + break; + lumpcache.heap[i] = bb; + bb->heap = i; + } + + lumpcache.heap[i] = b; + b->heap = i; + return i; +} + +static int +downheap(int i, Lump *b) +{ + Lump *bb; + u32int now; + int k; + + now = lumpcache.now; + for(; ; i = k){ + k = (i << 1) + 1; + if(k >= lumpcache.nheap) + break; + if(k + 1 < lumpcache.nheap && lumpcache.heap[k]->used2 - now > lumpcache.heap[k + 1]->used2 - now) + k++; + bb = lumpcache.heap[k]; + if(b->used2 - now <= bb->used2 - now) + break; + lumpcache.heap[i] = bb; + bb->heap = i; + } + + lumpcache.heap[i] = b; + b->heap = i; + return i; +} + +static void +findblock(Lump *bb) +{ + Lump *b, *last; + int h; + + last = nil; + h = hashbits(bb->score, HashLog); + for(b = lumpcache.heads[h]; b != nil; b = b->next){ + if(last != b->prev) + sysfatal("bad prev link"); + if(b == bb) + return; + last = b; + } + sysfatal("block score=%V type=%#x missing from hash table", bb->score, bb->type); +} + +void +checklumpcache(void) +{ + Lump *b; + u32int size, now, nfree; + int i, k, refed; + + now = lumpcache.now; + for(i = 0; i < lumpcache.nheap; i++){ + if(lumpcache.heap[i]->heap != i) + sysfatal("lc: mis-heaped at %d: %d", i, lumpcache.heap[i]->heap); + if(i > 0 && lumpcache.heap[(i - 1) >> 1]->used2 - now > lumpcache.heap[i]->used2 - now) + sysfatal("lc: bad heap ordering"); + k = (i << 1) + 1; + if(k < lumpcache.nheap && 
lumpcache.heap[i]->used2 - now > lumpcache.heap[k]->used2 - now) + sysfatal("lc: bad heap ordering"); + k++; + if(k < lumpcache.nheap && lumpcache.heap[i]->used2 - now > lumpcache.heap[k]->used2 - now) + sysfatal("lc: bad heap ordering"); + } + + refed = 0; + size = 0; + for(i = 0; i < lumpcache.nblocks; i++){ + b = &lumpcache.blocks[i]; + if(b->data == nil && b->size != 0) + sysfatal("bad size: %d data=%p", b->size, b->data); + if(b->ref && b->heap == TWID32) + refed++; + if(b->type != TWID8){ + findblock(b); + size += b->size; + } + if(b->heap != TWID32 + && lumpcache.heap[b->heap] != b) + sysfatal("lc: spurious heap value"); + } + if(lumpcache.avail != lumpcache.allowed - size){ + fprint(2, "mismatched available=%d and allowed=%d - used=%d space", lumpcache.avail, lumpcache.allowed, size); + *(int*)0=0; + } + + nfree = 0; + for(b = lumpcache.free; b != nil; b = b->next){ + if(b->type != TWID8 || b->heap != TWID32) + sysfatal("lc: bad free list"); + nfree++; + } + + if(lumpcache.nheap + nfree + refed != lumpcache.nblocks) + sysfatal("lc: missing blocks: %d %d %d %d", lumpcache.nheap, refed, nfree, lumpcache.nblocks); +} diff --git a/src/cmd/venti/srv/lumpqueue.c b/src/cmd/venti/srv/lumpqueue.c new file mode 100644 index 00000000..1b03f41c --- /dev/null +++ b/src/cmd/venti/srv/lumpqueue.c @@ -0,0 +1,187 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +typedef struct LumpQueue LumpQueue; +typedef struct WLump WLump; + +enum +{ + MaxLumpQ = 1 << 3 /* max. 
lumps on a single write queue, must be pow 2 */ +}; + +struct WLump +{ + Lump *u; + Packet *p; + int creator; + int gen; + uint ms; +}; + +struct LumpQueue +{ + QLock lock; + Rendez flush; + Rendez full; + Rendez empty; + WLump q[MaxLumpQ]; + int w; + int r; +}; + +static LumpQueue *lumpqs; +static int nqs; + +static QLock glk; +static int gen; + +static void queueproc(void *vq); + +int +initlumpqueues(int nq) +{ + LumpQueue *q; + + int i; + nqs = nq; + + lumpqs = MKNZ(LumpQueue, nq); + + for(i = 0; i < nq; i++){ + q = &lumpqs[i]; + q->full.l = &q->lock; + q->empty.l = &q->lock; + q->flush.l = &q->lock; + + if(vtproc(queueproc, q) < 0){ + seterr(EOk, "can't start write queue slave: %r"); + return -1; + } + if(vtproc(queueproc, q) < 0){ + seterr(EOk, "can't start write queue slave: %r"); + return -1; + } + if(vtproc(queueproc, q) < 0){ + seterr(EOk, "can't start write queue slave: %r"); + return -1; + } + if(vtproc(queueproc, q) < 0){ + seterr(EOk, "can't start write queue slave: %r"); + return -1; + } + if(vtproc(queueproc, q) < 0){ + seterr(EOk, "can't start write queue slave: %r"); + return -1; + } + } + + return 0; +} + +/* + * queue a lump & it's packet data for writing + */ +int +queuewrite(Lump *u, Packet *p, int creator, uint ms) +{ + LumpQueue *q; + int i; + + trace(TraceProc, "queuewrite"); + i = indexsect(mainindex, u->score); + if(i < 0 || i >= nqs){ + seterr(EBug, "internal error: illegal index section in queuewrite"); + return -1; + } + + q = &lumpqs[i]; + + qlock(&q->lock); + while(q->r == ((q->w + 1) & (MaxLumpQ - 1))){ + trace(TraceProc, "queuewrite sleep"); + rsleep(&q->full); + } + + q->q[q->w].u = u; + q->q[q->w].p = p; + q->q[q->w].creator = creator; + q->q[q->w].ms = ms; + q->q[q->w].gen = gen; + q->w = (q->w + 1) & (MaxLumpQ - 1); + + trace(TraceProc, "queuewrite wakeup"); + rwakeup(&q->empty); + + qunlock(&q->lock); + + return 0; +} + +void +flushqueue(void) +{ + int i; + LumpQueue *q; + + if(!lumpqs) + return; + + trace(TraceProc, 
"flushqueue"); + + qlock(&glk); + gen++; + qunlock(&glk); + + for(i=0; i<mainindex->nsects; i++){ + q = &lumpqs[i]; + qlock(&q->lock); + while(q->w != q->r && gen - q->q[q->r].gen > 0){ + trace(TraceProc, "flushqueue sleep q%d", i); + rsleep(&q->flush); + } + qunlock(&q->lock); + } +} + +static void +queueproc(void *vq) +{ + LumpQueue *q; + Lump *u; + Packet *p; + int creator; + uint ms; + + threadsetname("queueproc"); + + q = vq; + for(;;){ + qlock(&q->lock); + while(q->w == q->r){ + trace(TraceProc, "queueproc sleep empty"); + rsleep(&q->empty); + } + + u = q->q[q->r].u; + p = q->q[q->r].p; + creator = q->q[q->r].creator; + ms = q->q[q->r].ms; + + q->r = (q->r + 1) & (MaxLumpQ - 1); + trace(TraceProc, "queueproc wakeup flush"); + rwakeupall(&q->flush); + + trace(TraceProc, "queueproc wakeup full"); + rwakeup(&q->full); + + qunlock(&q->lock); + + trace(TraceProc, "queueproc writelump %V", u->score); + if(writeqlump(u, p, creator, ms) < 0) + fprint(2, "failed to write lump for %V: %r", u->score); + trace(TraceProc, "queueproc wrotelump %V", u->score); + + putlump(u); + } +} diff --git a/src/cmd/venti/srv/mkfile b/src/cmd/venti/srv/mkfile new file mode 100644 index 00000000..2fd4508e --- /dev/null +++ b/src/cmd/venti/srv/mkfile @@ -0,0 +1,146 @@ +<$PLAN9/src/mkhdr +CC=9c + +AR=ar + +LIBOFILES=\ + arena.$O\ + arenas.$O\ + bloom.$O\ + buildbuck.$O\ + clump.$O\ + config.$O\ + conv.$O\ + dcache.$O\ + dump.$O\ + graph.$O\ + httpd.$O\ + icache.$O\ + icachewrite.$O\ + ifile.$O\ + index.$O\ + lump.$O\ + lumpcache.$O\ + lumpqueue.$O\ + part.$O\ + png.$O\ + round.$O\ + score.$O\ + sortientry.$O\ + stats.$O\ + syncarena.$O\ + syncindex0.$O\ + trace.$O\ + unwhack.$O\ + utils.$O\ + unittoull.$O\ + whack.$O\ + xml.$O\ + zblock.$O\ + zeropart.$O\ + +SLIB=libvs.a + +LIB=$SLIB + +HFILES= dat.h\ + fns.h\ + stdinc.h\ + +TARG=\ + venti\ + fmtarenas\ + fmtbloom\ + fmtisect\ + fmtindex\ + buildindex\ + checkarenas\ + checkindex\ + clumpstats\ + findscore\ + rdarena\ + wrarena\ + 
syncindex\ + printarena\ + verifyarena\ + +OFILES= + +BIN=$BIN/venti + +it:V: $O.venti + +$O.venti: # debugmalloc2.$O # debugmalloc.$O #_p9dir.$O debugmalloc.$O + +CLEANFILES=$CLEANFILES $SLIB + +<$PLAN9/src/mkmany + +$SLIB: $LIBOFILES + $AR rvc $SLIB $LIBOFILES + +# xml.c:D: mkxml dat.h +# ./mkxml dat.h > xml.c + +ainstall:V: ${TARG:%=%.ainstall} + +%.ainstall:V: $O.% + scp $prereq amsterdam:/usr/local/bin/venti/$stem + +test:VQ: ${TARG:%=o.%} + slay o.venti|rc + vtmp=/home/tmp + echo '**********' FMTARENAS + ./o.fmtarenas -a 40M -b 8k arenas $vtmp/arena + echo '**********' FMTBLOOM + ./o.fmtbloom -s 10M $vtmp/bloom + echo '**********' FMTISECT + ./o.fmtisect -b 8k isect $vtmp/isect + ( + echo index main + echo isect $vtmp/isect + echo arenas $vtmp/arena + echo bloom $vtmp/bloom + echo webroot $HOME/src/venti/www + echo mem 64M + echo icmem 64M + echo bcmem 64M + ) >vtmp.conf + echo '**********' FMTINDEX + ./o.fmtindex vtmp.conf + echo '**********' VENTI + # ./o.venti -c vtmp.conf -B 64M -I 64M -C 64M -a 'tcp!*!17034' -h 'tcp!*!8001' >a 2>&1 & + ./o.venti -c vtmp.conf -a 'tcp!*!17034' -h 'tcp!*!8001' >a 2>&1 & + sleep 5 + echo '**********' VAC + venti='tcp!127.0.0.1!17034' export venti + 9 time vac /usr/local/plan9 >a.vac + case ${websync:-no} in + yes) + echo '**********' SYNC VIA WEB + hget http://127.0.0.1:8001/flushdcache + hget http://127.0.0.1:8001/flushicache + hget http://127.0.0.1:8001/flushdcache + echo '**********' KILL VENTI + killall -9 o.venti + ;; + no) + echo '**********' KILL VENTI + killall -9 o.venti + echo '**********' SYNCINDEX + ./o.syncindex -B64M -I64M -f vtmp.conf + ;; + esac + echo '**********' CHECKINDEX + ./o.checkindex -B64M vtmp.conf /home/tmp/check >check.out + wc check.out + +luadisk.o: luadisk.c + gcc -c -ggdb -Wall -I/usr/include/lua50 luadisk.c + +libluadisk.so: luadisk.o + gcc -shared -o $target luadisk.o -llua50 -llualib50 + +$O.xwrarena: xwrarena.$O + $LD -o $target xwrarena.$O + diff --git a/src/cmd/venti/srv/part.c 
b/src/cmd/venti/srv/part.c new file mode 100644 index 00000000..9b80b6e6 --- /dev/null +++ b/src/cmd/venti/srv/part.c @@ -0,0 +1,383 @@ +#ifdef PLAN9PORT /* SORRY! */ +#include <u.h> +#include <sys/types.h> +#include <sys/vfs.h> +#endif +#include "stdinc.h" +#include <ctype.h> +#include "dat.h" +#include "fns.h" + +u32int maxblocksize; +int readonly; + +static int +strtoullsuf(char *p, char **pp, int rad, u64int *u) +{ + u64int v; + + if(!isdigit(*p)) + return -1; + v = strtoull(p, &p, rad); + switch(*p){ + case 'k': + case 'K': + v *= 1024; + p++; + break; + case 'm': + case 'M': + v *= 1024*1024; + p++; + break; + case 'g': + case 'G': + v *= 1024*1024*1024; + p++; + break; + case 't': + case 'T': + v *= 1024*1024; + v *= 1024*1024; + p++; + break; + } + *pp = p; + *u = v; + return 0; +} + +static int +parsepart(char *name, char **file, u64int *lo, u64int *hi) +{ + char *p; + + *file = estrdup(name); + if((p = strrchr(*file, ':')) == nil){ + *lo = 0; + *hi = 0; + return 0; + } + *p++ = 0; + if(*p == '-') + *lo = 0; + else{ + if(strtoullsuf(p, &p, 0, lo) < 0){ + free(*file); + return -1; + } + } + if(*p == '-') + p++; + if(*p == 0){ + *hi = 0; + return 0; + } + if(strtoullsuf(p, &p, 0, hi) < 0 || *p != 0){ + free(*file); + return -1; + } + return 0; +} + +Part* +initpart(char *name, int mode) +{ + Part *part; + Dir *dir; + char *file; + u64int lo, hi; + + if(parsepart(name, &file, &lo, &hi) < 0) + return nil; + trace(TraceDisk, "initpart %s file %s lo 0x%llx hi 0x%llx", name, file, lo, hi); + part = MKZ(Part); + part->name = estrdup(name); + part->filename = estrdup(file); + if(readonly){ + mode &= (OREAD|OWRITE|ORDWR); + mode |= OREAD; + } + part->fd = open(file, mode); + if(part->fd < 0){ + if((mode&(OREAD|OWRITE|ORDWR)) == ORDWR) + part->fd = open(file, (mode&~ORDWR)|OREAD); + if(part->fd < 0){ + freepart(part); + fprint(2, "can't open partition='%s': %r\n", file); + seterr(EOk, "can't open partition='%s': %r", file); + fprint(2, "%r\n"); + free(file); + return 
nil; + } + fprint(2, "warning: %s opened for reading only\n", name); + } + part->offset = lo; + dir = dirfstat(part->fd); + if(dir == nil){ + freepart(part); + seterr(EOk, "can't stat partition='%s': %r", file); + free(file); + return nil; + } + if(dir->length == 0){ + free(dir); + freepart(part); + seterr(EOk, "can't determine size of partition %s", file); + free(file); + return nil; + } + if(dir->length < hi || dir->length < lo){ + freepart(part); + seterr(EOk, "partition '%s': bounds out of range (max %lld)", name, dir->length); + free(dir); + free(file); + return nil; + } + if(hi == 0) + hi = dir->length; + part->size = hi - part->offset; +#ifdef _LIBC_H_ + { + struct statfs sfs; + if(fstatfs(part->fd, &sfs) >= 0) + part->fsblocksize = sfs.f_bsize; + } +#endif + free(dir); + return part; +} + +void +freepart(Part *part) +{ + if(part == nil) + return; + if(part->fd >= 0) + close(part->fd); + free(part->name); + free(part); +} + +void +partblocksize(Part *part, u32int blocksize) +{ + if(part->blocksize) + sysfatal("resetting partition=%s's block size", part->name); + part->blocksize = blocksize; + if(blocksize > maxblocksize) + maxblocksize = blocksize; +} + +/* + * Read/write some amount of data between a block device or file and a memory buffer. + * + * Most Unix systems require that when accessing a block device directly, + * the buffer, offset, and count are all multiples of the device block size, + * making this a lot more complicated than it otherwise would be. + * + * Most of our callers will make things easy on us, but for some callers it's best + * if we just do the work here, with only one place to get it right (hopefully). + * + * If everything is aligned properly, prwb will try to do big transfers in the main + * body of the loop: up to MaxIo bytes at a time. If everything isn't aligned properly, + * we work one block at a time. + */ +#undef min +#define min(a, b) ((a) < (b) ? 
(a) : (b)) +int +prwb(char *name, int fd, int isread, u64int offset, void *vbuf, u32int count, u32int blocksize) +{ + char *op; + u8int *buf, *tmp, *freetmp, *dst; + u32int c, delta, icount, opsize; + int r; + + buf = vbuf; + tmp = nil; + freetmp = nil; + icount = count; + opsize = blocksize; + + if(count == 0){ + logerr(EStrange, "pwrb %s called to %s 0 bytes", name, isread ? "read" : "write"); + return 0; + } + + assert(blocksize > 0); + + /* allocate blocksize-aligned temp buffer if needed */ + if((ulong)offset%blocksize || (ulong)buf%blocksize || count%blocksize){ + if((freetmp = malloc(blocksize*2)) == nil) + return -1; + tmp = freetmp; + tmp += blocksize - (ulong)tmp%blocksize; + } + + /* handle beginning fringe */ + if((delta = (ulong)offset%blocksize) != 0){ + assert(tmp != nil); + if((r=pread(fd, tmp, blocksize, offset-delta)) != blocksize){ + dst = tmp; + offset = offset-delta; + op = "read"; + goto Error; + } + c = min(count, blocksize-delta); + assert(c > 0 && c < blocksize); + if(isread) + memmove(buf, tmp+delta, c); + else{ + memmove(tmp+delta, buf, c); + if((r=pwrite(fd, tmp, blocksize, offset-delta)) != blocksize){ + dst = tmp; + offset = offset-delta; + op = "read"; + goto Error; + } + } + assert(c > 0); + offset += c; + buf += c; + count -= c; + } + + /* handle full blocks */ + while(count >= blocksize){ + assert((ulong)offset%blocksize == 0); + if((ulong)buf%blocksize){ + assert(tmp != nil); + dst = tmp; + opsize = blocksize; + }else{ + dst = buf; + opsize = count - count%blocksize; + if(opsize > MaxIo) + opsize = MaxIo; + } + if(isread){ + if((r=pread(fd, dst, opsize, offset))<=0 || r%blocksize){ + op = "read"; + goto Error; + } + if(dst == tmp){ + assert(r == blocksize); + memmove(buf, tmp, blocksize); + } + }else{ + if(dst == tmp){ + assert(opsize == blocksize); + memmove(dst, buf, blocksize); + } + if((r=pwrite(fd, dst, opsize, offset))<=0 || r%blocksize){ + op = "write"; + goto Error; + } + if(dst == tmp) + assert(r == blocksize); + } + 
assert(r > 0); + offset += r; + buf += r; + count -= r; + } + + /* handle ending fringe */ + if(count > 0){ + assert((ulong)offset%blocksize == 0); + assert(tmp != nil); + /* + * Complicated condition: if we're reading it's okay to get less than + * a block as long as it's enough to satisfy the read - maybe this is + * a normal file. (We never write to normal files, or else things would + * be even more complicated.) + */ + r = pread(fd, tmp, blocksize, offset); + if((isread && r < count) || (!isread && r != blocksize)){ +print("FAILED isread=%d r=%d count=%d blocksize=%d\n", isread, r, count, blocksize); + dst = tmp; + op = "read"; + goto Error; + } + if(isread) + memmove(buf, tmp, count); + else{ + memmove(tmp, buf, count); + if(pwrite(fd, tmp, opsize, offset) != blocksize){ + dst = tmp; + op = "write"; + goto Error; + } + } + } + if(freetmp) + free(freetmp); + return icount; + +Error: + seterr(EAdmin, "%s %s offset 0x%llux count %ud buf %p returned %d: %r", + op, name, offset, opsize, dst, r); + if(freetmp) + free(freetmp); + return -1; +} + +int +rwpart(Part *part, int isread, u64int offset, u8int *buf, u32int count) +{ + u32int blocksize; + + trace(TraceDisk, "%s %s %ud at 0x%llx", + isread ? "read" : "write", part->name, count, offset); + if(offset >= part->size || offset+count > part->size){ + seterr(EStrange, "out of bounds %s offset 0x%llux count %ud to partition %s size 0x%llux", + isread ? 
"read" : "write", offset, count, part->name, part->size); + return -1; + } + + blocksize = part->fsblocksize; + if(blocksize == 0) + blocksize = part->blocksize; + if(blocksize == 0) + blocksize = 4096; + + return prwb(part->filename, part->fd, isread, part->offset+offset, buf, count, blocksize); +} + +int +readpart(Part *part, u64int offset, u8int *buf, u32int count) +{ + return rwpart(part, 1, offset, buf, count); +} + +int +writepart(Part *part, u64int offset, u8int *buf, u32int count) +{ + return rwpart(part, 0, offset, buf, count); +} + +ZBlock* +readfile(char *name) +{ + Part *p; + ZBlock *b; + + p = initpart(name, OREAD); + if(p == nil) + return nil; + b = alloczblock(p->size, 0, p->blocksize); + if(b == nil){ + seterr(EOk, "can't alloc %s: %r", name); + freepart(p); + return nil; + } + if(readpart(p, 0, b->data, p->size) < 0){ + seterr(EOk, "can't read %s: %r", name); + freepart(p); + freezblock(b); + return nil; + } + freepart(p); + return b; +} + diff --git a/src/cmd/venti/srv/png.c b/src/cmd/venti/srv/png.c new file mode 100644 index 00000000..966b7e96 --- /dev/null +++ b/src/cmd/venti/srv/png.c @@ -0,0 +1,241 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +enum +{ + IDATSIZE = 20000, + FilterNone = 0 +}; + +typedef struct ZlibR ZlibR; +typedef struct ZlibW ZlibW; + +struct ZlibR +{ + uchar *data; + int width; + int dx; + int dy; + int x; + int y; + int pixwid; +}; + +struct ZlibW +{ + Hio *io; + uchar *buf; + uchar *b; + uchar *e; +}; + +static ulong *crctab; +static uchar PNGmagic[] = { 137, 'P', 'N', 'G', '\r', '\n', 26, '\n'}; + +static void +put4(uchar *a, ulong v) +{ + a[0] = v>>24; + a[1] = v>>16; + a[2] = v>>8; + a[3] = v; +} + +static void +chunk(Hio *io, char *type, uchar *d, int n) +{ + uchar buf[4]; + ulong crc = 0; + + if(strlen(type) != 4) + return; + put4(buf, n); + hwrite(io, buf, 4); + hwrite(io, type, 4); + hwrite(io, d, n); + crc = blockcrc(crctab, crc, type, 4); + crc = blockcrc(crctab, crc, d, n); + put4(buf, crc); + 
hwrite(io, buf, 4); +} + +static int +zread(void *va, void *buf, int n) +{ + int a, i, pixels, pixwid; + uchar *b, *e, *img; + ZlibR *z; + + z = va; + pixwid = z->pixwid; + b = buf; + e = b+n; + while(b+pixwid <= e){ + if(z->y >= z->dy) + break; + if(z->x == 0) + *b++ = FilterNone; + pixels = (e-b)/pixwid; + if(pixels > z->dx - z->x) + pixels = z->dx - z->x; + img = z->data + z->width*z->y + pixwid*z->x; + memmove(b, img, pixwid*pixels); + if(pixwid == 4){ + /* + * Convert to non-premultiplied alpha. + */ + for(i=0; i<pixels; i++, b+=4){ + a = b[3]; + if(a == 255 || a == 0) + ; + else{ + if(b[0] >= a) + b[0] = a; + b[0] = (b[0]*255)/a; + if(b[1] >= a) + b[1] = a; + b[1] = (b[1]*255)/a; + if(b[2] >= a) + b[2] = a; + b[2] = (b[2]*255)/a; + } + } + }else + b += pixwid*pixels; + + z->x += pixels; + if(z->x >= z->dx){ + z->x = 0; + z->y++; + } + } + return b - (uchar*)buf; +} + +static void +IDAT(ZlibW *z) +{ + chunk(z->io, "IDAT", z->buf, z->b - z->buf); + z->b = z->buf; +} + +static int +zwrite(void *va, void *buf, int n) +{ + int m; + uchar *b, *e; + ZlibW *z; + + z = va; + b = buf; + e = b+n; + + while(b < e){ + m = z->e - z->b; + if(m > e - b) + m = e - b; + memmove(z->b, b, m); + z->b += m; + b += m; + if(z->b >= z->e) + IDAT(z); + } + return n; +} + +static Memimage* +memRGBA(Memimage *i) +{ + Memimage *ni; + char buf[32]; + ulong dst; + + /* + * [A]BGR because we want R,G,B,[A] in big-endian order. Sigh. 
+ */ + chantostr(buf, i->chan); + if(strchr(buf, 'a')) + dst = ABGR32; + else + dst = BGR24; + + if(i->chan == dst) + return i; + + qlock(&memdrawlock); + ni = allocmemimage(i->r, dst); + if(ni) + memimagedraw(ni, ni->r, i, i->r.min, nil, i->r.min, S); + qunlock(&memdrawlock); + return ni; +} + +int +writepng(Hio *io, Memimage *m) +{ + static int first = 1; + static QLock lk; + uchar buf[200], *h; + Memimage *rgb; + ZlibR zr; + ZlibW zw; + + if(first){ + qlock(&lk); + if(first){ + deflateinit(); + crctab = mkcrctab(0xedb88320); + first = 0; + } + qunlock(&lk); + } + + rgb = memRGBA(m); + if(rgb == nil) + return -1; + + hwrite(io, PNGmagic, sizeof PNGmagic); + + /* IHDR chunk */ + h = buf; + put4(h, Dx(m->r)); h += 4; + put4(h, Dy(m->r)); h += 4; + *h++ = 8; /* 8 bits per channel */ + if(rgb->chan == BGR24) + *h++ = 2; /* RGB */ + else + *h++ = 6; /* RGBA */ + *h++ = 0; /* compression - deflate */ + *h++ = 0; /* filter - none */ + *h++ = 0; /* interlace - none */ + chunk(io, "IHDR", buf, h-buf); + + /* image data */ + zr.dx = Dx(m->r); + zr.dy = Dy(m->r); + zr.width = rgb->width * sizeof(ulong); + zr.data = rgb->data->bdata; + zr.x = 0; + zr.y = 0; + zr.pixwid = chantodepth(rgb->chan)/8; + zw.io = io; + zw.buf = vtmalloc(IDATSIZE); + zw.b = zw.buf; + zw.e = zw.b + IDATSIZE; + if(deflatezlib(&zw, zwrite, &zr, zread, 6, 0) < 0){ + free(zw.buf); + return -1; + } + if(zw.b > zw.buf) + IDAT(&zw); + free(zw.buf); + chunk(io, "IEND", nil, 0); + + if(m != rgb){ + qlock(&memdrawlock); + freememimage(rgb); + qunlock(&memdrawlock); + } + return 0; +} diff --git a/src/cmd/venti/srv/printarena.c b/src/cmd/venti/srv/printarena.c new file mode 100644 index 00000000..c305e818 --- /dev/null +++ b/src/cmd/venti/srv/printarena.c @@ -0,0 +1,130 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +usage(void) +{ + fprint(2, "usage: printarena arenafile [offset]\n"); + threadexitsall("usage"); +} + +static void +rdarena(Arena *arena, u64int offset) +{ + u64int a, aa, e; + 
u32int magic; + Clump cl; + uchar score[VtScoreSize]; + ZBlock *lump; + + printarena(2, arena); + + a = arena->base; + e = arena->base + arena->size; + if(offset != ~(u64int)0) { + if(offset >= e-a) + sysfatal("bad offset %llud >= %llud\n", + offset, e-a); + aa = offset; + } else + aa = 0; + + for(; aa < e; aa += ClumpSize+cl.info.size) { + magic = clumpmagic(arena, aa); + if(magic == ClumpFreeMagic) + break; + if(magic != arena->clumpmagic) { + fprint(2, "illegal clump magic number %#8.8ux offset %llud\n", + magic, aa); + break; + } + lump = loadclump(arena, aa, 0, &cl, score, 0); + if(lump == nil) { + fprint(2, "clump %llud failed to read: %r\n", aa); + break; + } + if(cl.info.type != VtCorruptType) { + scoremem(score, lump->data, cl.info.uncsize); + if(scorecmp(cl.info.score, score) != 0) { + fprint(2, "clump %llud has mismatched score\n", aa); + break; + } + if(vttypevalid(cl.info.type) < 0) { + fprint(2, "clump %llud has bad type %d\n", aa, cl.info.type); + break; + } + } + print("%22llud %V %3d %5d\n", aa, score, cl.info.type, cl.info.uncsize); + freezblock(lump); + } + print("end offset %llud\n", aa); +} + +void +threadmain(int argc, char *argv[]) +{ + char *file; + Arena *arena; + u64int offset, aoffset; + Part *part; + Dir *d; + uchar buf[8192]; + ArenaHead head; + + readonly = 1; /* for part.c */ + aoffset = 0; + ARGBEGIN{ + case 'o': + aoffset = strtoull(EARGF(usage()), 0, 0); + break; + default: + usage(); + break; + }ARGEND + + offset = ~(u64int)0; + switch(argc) { + default: + usage(); + case 2: + offset = strtoull(argv[1], 0, 0); + /* fall through */ + case 1: + file = argv[0]; + } + + + ventifmtinstall(); + statsinit(); + + if((d = dirstat(file)) == nil) + sysfatal("can't stat file %s: %r", file); + + part = initpart(file, OREAD|ODIRECT); + if(part == nil) + sysfatal("can't open file %s: %r", file); + if(readpart(part, aoffset, buf, sizeof buf) < 0) + sysfatal("can't read file %s: %r", file); + + if(unpackarenahead(&head, buf) < 0) + 
/*
 * Decide whether the object called name was selected for dumping.
 * argv[0..argc-1] is the list of names to dump; an empty list
 * (argc == 0) selects everything.  Returns 1 if name is selected,
 * 0 otherwise.
 */
int
shoulddump(char *name, int argc, char **argv)
{
	int i, found;

	/* an empty list means "dump them all" */
	found = (argc == 0);
	for(i = 0; !found && i < argc; i++)
		found = (strcmp(name, argv[i]) == 0);
	return found;
}
(ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; + scorecp(ie.score, ci->score); + pie(&ie); + } + } + free(cis); +} + +void +threadmain(int argc, char *argv[]) +{ + int i; + Index *ix; + u32int bcmem; + + bcmem = 0; + ARGBEGIN{ + case 'B': + bcmem = unittoull(ARGF()); + break; + default: + usage(); + break; + }ARGEND + + if(argc < 1) + usage(); + + ventifmtinstall(); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + Binit(&bout, 1, OWRITE); + ix = mainindex; + for(i=0; i<ix->narenas; i++) + if(shoulddump(ix->arenas[i]->name, argc-1, argv+1)) + dumparena(ix->arenas[i], ix->amap[i].start); + Bterm(&bout); + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/printindex.c b/src/cmd/venti/srv/printindex.c new file mode 100644 index 00000000..edbcf793 --- /dev/null +++ b/src/cmd/venti/srv/printindex.c @@ -0,0 +1,99 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include <bio.h> + +Biobuf bout; + +static void +pie(IEntry *ie) +{ + Bprint(&bout, "%22lld %V %3d %5d\n", + ie->ia.addr, ie->score, ie->ia.type, ie->ia.size); +} + +void +usage(void) +{ + fprint(2, "usage: printindex [-B blockcachesize] config [isectname...]\n"); + threadexitsall(0); +} + +Config conf; + +int +shoulddump(char *name, int argc, char **argv) +{ + int i; + + if(argc == 0) + return 1; + for(i=0; i<argc; i++) + if(strcmp(name, argv[i]) == 0) + return 1; + return 0; +} + +void +dumpisect(ISect *is) +{ + int j; + uchar *buf; + u32int i; + u64int off; + IBucket ib; + IEntry ie; + + buf = emalloc(is->blocksize); + for(i=0; i<is->blocks; i++){ + off = is->blockbase+(u64int)is->blocksize*i; + if(readpart(is->part, off, buf, is->blocksize) < 0) + fprint(2, "read %s at 0x%llux: %r\n", is->part->name, off); + 
else{ + unpackibucket(&ib, buf, is->bucketmagic); + for(j=0; j<ib.n; j++){ + unpackientry(&ie, &ib.data[j*IEntrySize]); + pie(&ie); + } + } + } +} + +void +threadmain(int argc, char *argv[]) +{ + int i; + Index *ix; + u32int bcmem; + + bcmem = 0; + ARGBEGIN{ + case 'B': + bcmem = unittoull(ARGF()); + break; + default: + usage(); + break; + }ARGEND + + if(argc < 1) + usage(); + + fmtinstall('H', encodefmt); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + ix = mainindex; + Binit(&bout, 1, OWRITE); + for(i=0; i<ix->nsects; i++) + if(shoulddump(ix->sects[i]->name, argc-1, argv+1)) + dumpisect(ix->sects[i]); + Bterm(&bout); + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/printmap.c b/src/cmd/venti/srv/printmap.c new file mode 100644 index 00000000..f3392ef8 --- /dev/null +++ b/src/cmd/venti/srv/printmap.c @@ -0,0 +1,42 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +usage(void) +{ + fprint(2, "usage: printmap [-B blockcachesize] config\n"); + threadexitsall("usage"); +} + +Config conf; + +void +threadmain(int argc, char *argv[]) +{ + u32int bcmem; + int fix; + + fix = 0; + bcmem = 0; + ARGBEGIN{ + case 'B': + bcmem = unittoull(ARGF()); + break; + default: + usage(); + break; + }ARGEND + + if(!fix) + readonly = 1; + + if(argc != 1) + usage(); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + + printindex(1, mainindex); + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/rdarena.c b/src/cmd/venti/srv/rdarena.c new file mode 100644 index 00000000..909cc206 --- /dev/null +++ b/src/cmd/venti/srv/rdarena.c @@ -0,0 +1,91 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int verbose; + +void +usage(void) +{ + fprint(2, "usage: 
rdarena [-v] arenapart arena\n"); + threadexitsall(0); +} + +static void +rdarena(Arena *arena) +{ + ZBlock *b; + u64int a, e; + u32int bs; + + fprint(2, "copying %s to standard output\n", arena->name); + printarena(2, arena); + + bs = MaxIoSize; + if(bs < arena->blocksize) + bs = arena->blocksize; + + b = alloczblock(bs, 0, arena->blocksize); + e = arena->base + arena->size + arena->blocksize; + for(a = arena->base - arena->blocksize; a + arena->blocksize <= e; a += bs){ + if(a + bs > e) + bs = arena->blocksize; + if(readpart(arena->part, a, b->data, bs) < 0) + fprint(2, "can't copy %s, read at %lld failed: %r\n", arena->name, a); + if(write(1, b->data, bs) != bs) + sysfatal("can't copy %s, write at %lld failed: %r", arena->name, a); + } + + freezblock(b); +} + +void +threadmain(int argc, char *argv[]) +{ + ArenaPart *ap; + Part *part; + char *file, *aname; + int i; + + ventifmtinstall(); + statsinit(); + + ARGBEGIN{ + case 'v': + verbose++; + break; + default: + usage(); + break; + }ARGEND + + readonly = 1; + + if(argc != 2) + usage(); + + file = argv[0]; + aname = argv[1]; + + part = initpart(file, OREAD|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + ap = initarenapart(part); + if(ap == nil) + sysfatal("can't initialize arena partition in %s: %r", file); + + if(verbose) + printarenapart(2, ap); + + initdcache(8 * MaxDiskBlock); + + for(i = 0; i < ap->narenas; i++){ + if(strcmp(ap->arenas[i]->name, aname) == 0){ + rdarena(ap->arenas[i]); + threadexitsall(0); + } + } + + sysfatal("couldn't find arena %s\n", aname); +} diff --git a/src/cmd/venti/srv/round.c b/src/cmd/venti/srv/round.c new file mode 100644 index 00000000..bbf4a478 --- /dev/null +++ b/src/cmd/venti/srv/round.c @@ -0,0 +1,102 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +waitforkick(Round *r) +{ + int n; + + qlock(&r->lock); + r->last = r->current; + assert(r->current+1 == r->next); + rwakeupall(&r->finish); + while(!r->doanother) + 
/*
 * Return the value (0-15) of the hexadecimal digit c, accepting both
 * upper and lower case letters, or -1 if c is not a hex digit.
 */
static int
hexv(int c)
{
	int v;

	v = -1;
	if(c >= 'a' && c <= 'f')
		v = 10 + (c - 'a');
	else if(c >= 'A' && c <= 'F')
		v = 10 + (c - 'A');
	else if(c >= '0' && c <= '9')
		v = c - '0';
	return v;
}
sorted file with all IEntries which should be in ix. + * assumes the arenas' directories are up to date. + * reads each, converts the entries to index entries, + * and sorts them. + */ +u64int +sortrawientries(Index *ix, Part *tmp, u64int *base, Bloom *bloom) +{ + IEBucks *ib; + u64int clumps, sorted; + u32int n; + int i, ok; + +//ZZZ should allow configuration of bits, bucket size + ib = initiebucks(tmp, 8, 64*1024); + if(ib == nil){ + seterr(EOk, "can't create sorting buckets: %r"); + return TWID64; + } + ok = 0; + clumps = 0; + fprint(2, "constructing entry list\n"); + for(i = 0; i < ix->narenas; i++){ + n = readarenainfo(ib, ix->arenas[i], ix->amap[i].start, bloom); + if(n == TWID32){ + ok = -1; + break; + } + clumps += n; + } + fprint(2, "sorting %lld entries\n", clumps); + if(ok == 0){ + sorted = sortiebucks(ib); + *base = (u64int)ib->chunks * ib->size; + if(sorted != clumps){ + fprint(2, "sorting messed up: clumps=%lld sorted=%lld\n", clumps, sorted); + ok = -1; + } + } + freeiebucks(ib); + if(ok < 0) + return TWID64; + return clumps; +} + +#define CHECK(cis) if(((ulong*)cis)[-4] != 0xA110C09) xabort(); + +void +xabort(void) +{ + int *x; + + x = 0; + *x = 0; +} + +/* + * read in all of the arena's clump directory, + * convert to IEntry format, and bucket sort based + * on the first few bits. + */ +static u32int +readarenainfo(IEBucks *ib, Arena *arena, u64int a, Bloom *b) +{ + IEntry ie; + ClumpInfo *ci, *cis; + u32int clump; + int i, n, ok, nskip; +// static Biobuf bout; + +//ZZZ remove fprint? 
+//fprint(2, "ra %s %d %d\n", arena->name, arena->memstats.clumps, arena->diskstats.clumps); + if(arena->memstats.clumps) + fprint(2, "\tarena %s: %d entries\n", arena->name, arena->memstats.clumps); + else + fprint(2, "[%s] ", arena->name); + + cis = MKN(ClumpInfo, ClumpChunks); + ok = 0; + nskip = 0; + memset(&ie, 0, sizeof(IEntry)); +// Binit(&bout, 1, OWRITE); + for(clump = 0; clump < arena->memstats.clumps; clump += n){ + n = ClumpChunks; + if(n > arena->memstats.clumps - clump) + n = arena->memstats.clumps - clump; + if(readclumpinfos(arena, clump, cis, n) != n){ + seterr(EOk, "arena directory read failed: %r"); + ok = -1; + break; + } + + for(i = 0; i < n; i++){ + ci = &cis[i]; + ie.ia.type = ci->type; + ie.ia.size = ci->uncsize; + ie.ia.addr = a; + a += ci->size + ClumpSize; + ie.ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; + scorecp(ie.score, ci->score); + // Bprint(&bout, "%22lld %V %3d %5d\n", + // ie.ia.addr, ie.score, ie.ia.type, ie.ia.size); + if(ci->type == VtCorruptType){ + // print("! 
%V %22lld %3d %5d %3d\n", + // ie.score, ie.ia.addr, ie.ia.type, ie.ia.size, ie.ia.blocks); + nskip++; + }else + sprayientry(ib, &ie); + markbloomfilter(b, ie.score); + } + } +// Bterm(&bout); + free(cis); + if(ok < 0) + return TWID32; + return clump - nskip; +} + +/* + * initialize the external bucket sorting data structures + */ +static IEBucks* +initiebucks(Part *part, int bits, u32int size) +{ + IEBucks *ib; + int i; + + ib = MKZ(IEBucks); + if(ib == nil){ + seterr(EOk, "out of memory"); + return nil; + } + ib->bits = bits; + ib->nbucks = 1 << bits; + ib->size = size; + ib->usable = (size - U32Size) / IEntrySize * IEntrySize; + ib->bucks = MKNZ(IEBuck, ib->nbucks); + if(ib->bucks == nil){ + seterr(EOk, "out of memory allocation sorting buckets"); + freeiebucks(ib); + return nil; + } + ib->xbuf = MKN(u8int, size * ((1 << bits)+1)); + ib->buf = (u8int*)(((ulong)ib->xbuf+size-1)&~(ulong)(size-1)); + if(ib->buf == nil){ + seterr(EOk, "out of memory allocating sorting buckets' buffers"); + freeiebucks(ib); + return nil; + } + for(i = 0; i < ib->nbucks; i++){ + ib->bucks[i].head = TWID32; + ib->bucks[i].buf = &ib->buf[i * size]; + } + ib->part = part; + return ib; +} + +static void +freeiebucks(IEBucks *ib) +{ + if(ib == nil) + return; + free(ib->bucks); + free(ib->buf); + free(ib); +} + +/* + * initial sort: put the entry into the correct bucket + */ +static int +sprayientry(IEBucks *ib, IEntry *ie) +{ + u32int n; + int b; + + b = hashbits(ie->score, ib->bits); + n = ib->bucks[b].used; + if(n + IEntrySize > ib->usable){ + /* should be flushed below, but if flush fails, this can happen */ + seterr(EOk, "out of space in bucket"); + return -1; + } + packientry(ie, &ib->bucks[b].buf[n]); + n += IEntrySize; + ib->bucks[b].used = n; + if(n + IEntrySize <= ib->usable) + return 0; + return flushiebuck(ib, b, 1); +} + +/* + * finish sorting: + * for each bucket, read it in and sort it + * write out the the final file + */ +static u64int +sortiebucks(IEBucks *ib) +{ + u64int 
tot; + u32int n; + int i; + + if(flushiebucks(ib) < 0) + return TWID64; + for(i = 0; i < ib->nbucks; i++) + ib->bucks[i].buf = nil; + ib->off = (u64int)ib->chunks * ib->size; + free(ib->xbuf); +if(0){ + fprint(2, "ib->max = %lld\n", ib->max); + fprint(2, "ib->chunks = %ud\n", ib->chunks); +} + ib->buf = MKN(u8int, ib->max + U32Size); + if(ib->buf == nil){ + seterr(EOk, "out of memory allocating final sorting buffer; try more buckets"); + return TWID64; + } + tot = 0; + for(i = 0; i < ib->nbucks; i++){ + n = sortiebuck(ib, i); + if(n == TWID32) + return TWID64; + if(n != ib->bucks[i].total/IEntrySize) + fprint(2, "bucket %d changed count %d => %d\n", + i, (int)(ib->bucks[i].total/IEntrySize), n); + tot += n; + } + return tot; + return 0; +} + +/* + * sort from bucket b of ib into the output file to + */ +static u32int +sortiebuck(IEBucks *ib, int b) +{ + u32int n; + + n = readiebuck(ib, b); + if(n == TWID32) + return TWID32; + qsort(ib->buf, n, IEntrySize, ientrycmp); + if(writepart(ib->part, ib->off, ib->buf, n * IEntrySize) < 0){ + seterr(EOk, "can't write sorted bucket: %r"); + return TWID32; + } + ib->off += n * IEntrySize; + return n; +} + +/* + * write out a single bucket + */ +static int +flushiebuck(IEBucks *ib, int b, int reset) +{ + u32int n; + + if(ib->bucks[b].used == 0) + return 0; + n = ib->bucks[b].used; + U32PUT(&ib->bucks[b].buf[n], ib->bucks[b].head); + n += U32Size; + USED(n); + if(writepart(ib->part, (u64int)ib->chunks * ib->size, ib->bucks[b].buf, ib->size) < 0){ + seterr(EOk, "can't write sorting bucket to file: %r"); +xabort(); + return -1; + } + ib->bucks[b].head = ib->chunks++; + ib->bucks[b].total += ib->bucks[b].used; + if(reset) + ib->bucks[b].used = 0; + return 0; +} + +/* + * write out all of the buckets, and compute + * the maximum size of any bucket + */ +static int +flushiebucks(IEBucks *ib) +{ + int i; + + for(i = 0; i < ib->nbucks; i++){ + if(flushiebuck(ib, i, 0) < 0) + return -1; + if(ib->bucks[i].total > ib->max) + ib->max = 
ib->bucks[i].total; + } + return 0; +} + +/* + * read in the chained buffers for bucket b, + * and return it's total number of IEntries + */ +static u32int +readiebuck(IEBucks *ib, int b) +{ + u32int head, m, n; + + head = ib->bucks[b].head; + n = 0; + m = ib->bucks[b].used; + if(m == 0) + m = ib->usable; +// if(ib->bucks[b].total) +// fprint(2, "\tbucket %d: %d entries\n", b, ib->bucks[b].total/IEntrySize); + while(head != TWID32){ + if(readpart(ib->part, (u64int)head * ib->size, &ib->buf[n], m + U32Size) < 0){ + seterr(EOk, "can't read index sort bucket: %r"); + return TWID32; + } + n += m; + head = U32GET(&ib->buf[n]); + m = ib->usable; + } + if(n != ib->bucks[b].total) + fprint(2, "\tbucket %d: expected %d entries, got %d\n", + b, (int)ib->bucks[b].total/IEntrySize, n/IEntrySize); + return n / IEntrySize; +} diff --git a/src/cmd/venti/srv/stats.c b/src/cmd/venti/srv/stats.c new file mode 100644 index 00000000..874f7d27 --- /dev/null +++ b/src/cmd/venti/srv/stats.c @@ -0,0 +1,212 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int collectstats = 1; + +/* keep in sync with dat.h:/NStat */ +Statdesc statdesc[NStat] = +{ + { "rpc total", }, + { "rpc reads", }, + { "rpc reads ok", }, + { "rpc reads failed", }, + { "rpc read bytes", }, + { "rpc read time", }, + { "rpc read cached", }, + { "rpc read cached time", }, + { "rpc read uncached", }, + { "rpc read uncached time "}, + + { "rpc writes", }, + { "rpc writes new", }, + { "rpc writes old", }, + { "rpc writes failed", }, + { "rpc write bytes", }, + { "rpc write time", }, + { "rpc write new time", }, + { "rpc write old time", }, + + { "lump cache hits", }, + { "lump cache misses", }, + { "lump cache reads", }, + { "lump cache writes", }, + { "lump cache size", }, + { "lump cache stall", }, + { "lump cache read time", }, + + { "disk cache hits", }, + { "disk cache misses", }, + { "disk cache lookups", }, + { "disk cache reads", }, + { "disk cache writes", }, + { "disk cache dirty", }, + { "disk cache 
size", }, + { "disk cache flushes", }, + { "disk cache stalls", }, + { "disk cache lookup time", }, + + { "disk block stalls", }, + { "lump stalls", }, + + { "index cache hits", }, + { "index cache misses", }, + { "index cache reads", }, + { "index cache writes", }, + { "index cache fills", }, + { "index cache prefetches", }, + { "index cache dirty", }, + { "index cache size", }, + { "index cache flushes", }, + { "index cache stalls", }, + { "index cache read time", }, + + { "bloom filter hits", }, + { "bloom filter misses", }, + { "bloom filter false misses", }, + { "bloom filter lookups", }, + { "bloom filter ones", }, + { "bloom filter bits", }, + { "bloom filter lookup time", }, + + { "arena block reads", }, + { "arena block read bytes", }, + { "arena block writes", }, + { "arena block write bytes", }, + + { "isect block reads", }, + { "isect block read bytes", }, + { "isect block writes", }, + { "isect block write bytes", }, + + { "sum reads", }, + { "sum read bytes", }, +}; + +QLock statslock; +Stats stats; +Stats *stathist; +int nstathist; +ulong statind; +ulong stattime; + +void +statsproc(void *v) +{ + USED(v); + + for(;;){ + stats.now = time(0); + stathist[stattime%nstathist] = stats; + stattime++; + sleep(1000); + } +} + +void +statsinit(void) +{ + nstathist = 90000; + stathist = MKNZ(Stats, nstathist); + vtproc(statsproc, nil); +} + +void +setstat(int index, long val) +{ + qlock(&statslock); + stats.n[index] = val; + qunlock(&statslock); +} + +void +addstat(int index, int inc) +{ + if(!collectstats) + return; + qlock(&statslock); + stats.n[index] += inc; + qunlock(&statslock); +} + +void +addstat2(int index, int inc, int index1, int inc1) +{ + if(!collectstats) + return; + qlock(&statslock); + stats.n[index] += inc; + stats.n[index1] += inc1; + qunlock(&statslock); +} + +void +printstats(void) +{ +} + +void +binstats(long (*fn)(Stats *s0, Stats *s1, void *arg), void *arg, + long t0, long t1, Statbin *bin, int nbin) +{ + long t, xt0, te, v; + int i, j, 
lo, hi, m, oj; + vlong tot; + Statbin *b; + + t = stats.now; + + /* negative times mean relative to now. */ + if(t0 <= 0) + t0 += t; + if(t1 <= 0) + t1 += t; + /* ten minute range if none given */ + if(t1 <= t0) + t0 = t1 - 60*10; + if(0) fprint(2, "stats %ld-%ld\n", t0, t1); + + /* binary search to find t0-1 or close */ + lo = stattime; + hi = stattime+nstathist; + while(lo+1 < hi){ + m = (lo+hi)/2; + if(stathist[m%nstathist].now >= t0) + hi = m; + else + lo = m; + } + xt0 = stathist[lo%nstathist].now; + if(0) fprint(2, "bsearch found %ld\n", xt0); + if(xt0 >= t1){ + /* no samples */ + memset(bin, 0, nbin*sizeof bin[0]); + return; + } + + hi = stattime+nstathist; + te = t0; + j = lo+1; + for(i=0; i<nbin; i++){ + t = te; + te = t0 + (t1-t0)*i/nbin; + b = &bin[i]; + memset(b, 0, sizeof *b); + tot = 0; + oj = j; + for(; j<hi && stathist[j%nstathist].now<te; j++){ + v = fn(&stathist[(j-1)%nstathist], &stathist[j%nstathist], arg); + if(b->nsamp==0 || v < b->min) + b->min = v; + if(b->nsamp==0 || v > b->max) + b->max = v; + tot += v; + b->nsamp++; + } + if(0) fprint(2, "bin%d: %ld to %ld; %d to %d - %d samples\n", i, t, te, oj, j, b->nsamp); + if(b->nsamp) + b->avg = tot / b->nsamp; + if(b->nsamp==0 && i>0) + *b = bin[i-1]; + } +} diff --git a/src/cmd/venti/srv/stdinc.h b/src/cmd/venti/srv/stdinc.h new file mode 100644 index 00000000..3fd06ccd --- /dev/null +++ b/src/cmd/venti/srv/stdinc.h @@ -0,0 +1,9 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <flate.h> +#include <libsec.h> +#include <thread.h> +#include <httpd.h> +#include <draw.h> +#include <memdraw.h> diff --git a/src/cmd/venti/srv/syncarena.c b/src/cmd/venti/srv/syncarena.c new file mode 100644 index 00000000..d11ca4f3 --- /dev/null +++ b/src/cmd/venti/srv/syncarena.c @@ -0,0 +1,174 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int writeclumphead(Arena *arena, u64int aa, Clump *cl); +static int writeclumpmagic(Arena *arena, u64int aa, u32int magic); + +int 
+clumpinfocmp(ClumpInfo *c, ClumpInfo *d) +{ + return c->type != d->type + || c->size != d->size + || c->uncsize != d->uncsize + || scorecmp(c->score, d->score)!=0; +} + +/* + * synchronize the clump info directory + * with the clumps actually stored in the arena. + * the directory should be at least as up to date + * as the arena's trailer. + * + * checks/updates at most n clumps. + * + * returns 0 if ok, flags if error occurred + */ +int +syncarena(Arena *arena, u64int start, u32int n, int zok, int fix) +{ + ZBlock *lump; + Clump cl; + ClumpInfo ci; + static ClumpInfo zci = { .type = -1 }; + u8int score[VtScoreSize]; + u64int uncsize, used, aa; + u32int clump, clumps, cclumps, magic; + int err, flush, broken; + AState as; + + used = arena->memstats.used; + clumps = arena->memstats.clumps; + cclumps = arena->memstats.cclumps; + uncsize = arena->memstats.uncsize; + trace(TraceProc, "syncarena start"); + flush = 0; + err = 0; + for(; n; n--){ + aa = arena->memstats.used; + clump = arena->memstats.clumps; + magic = clumpmagic(arena, aa); + if(magic == ClumpFreeMagic) + break; + if(magic != arena->clumpmagic){ + fprint(2, "%s: illegal clump magic number=%#8.8ux at clump=%d\n", arena->name, magic, clump); + /* err |= SyncDataErr; */ + if(fix && writeclumpmagic(arena, aa, ClumpFreeMagic) < 0){ + fprint(2, "can't write corrected clump free magic: %r"); + err |= SyncFixErr; + } + break; + } + + broken = 0; + lump = loadclump(arena, aa, 0, &cl, score, 0); + if(lump == nil){ + fprint(2, "%s: clump=%d failed to read correctly: %r\n", arena->name, clump); + break; + err |= SyncDataErr; + }else if(cl.info.type != VtCorruptType){ + scoremem(score, lump->data, cl.info.uncsize); + if(scorecmp(cl.info.score, score) != 0){ + /* ignore partially written block */ + if(cl.encoding == ClumpENone) + break; + fprint(2, "%s: clump=%d has mismatched score\n", arena->name, clump); + err |= SyncDataErr; + broken = 1; + }else if(!vttypevalid(cl.info.type)){ /* vttypevalid is a predicate (nonzero = valid); it never returns < 0 */ + fprint(2, "%s: clump=%d 
has invalid type %d", arena->name, clump, cl.info.type); + err |= SyncDataErr; + broken = 1; + } + if(broken && fix){ + cl.info.type = VtCorruptType; + if(writeclumphead(arena, aa, &cl) < 0){ + fprint(2, "%s: can't write corrected clump header: %r", arena->name); + err |= SyncFixErr; + } + } + } + freezblock(lump); + arena->memstats.used += ClumpSize + cl.info.size; + + arena->memstats.clumps++; + if(!broken && readclumpinfo(arena, clump, &ci)<0){ + fprint(2, "%s: arena directory read failed\n", arena->name); + broken = 1; + }else if(!broken && clumpinfocmp(&ci, &cl.info)!=0){ + if(clumpinfocmp(&ci, &zci) == 0){ + err |= SyncCIZero; + if(!zok) + fprint(2, "%s: unwritten clump info for clump=%d\n", arena->name, clump); + }else{ + err |= SyncCIErr; + fprint(2, "%s: bad clump info for clump=%d\n", arena->name, clump); + fprint(2, "\texpected score=%V type=%d size=%d uncsize=%d\n", + cl.info.score, cl.info.type, cl.info.size, cl.info.uncsize); + fprint(2, "\tfound score=%V type=%d size=%d uncsize=%d\n", + ci.score, ci.type, ci.size, ci.uncsize); + } + broken = 1; + } + if(broken && fix){ + flush = 1; + ci = cl.info; + if(writeclumpinfo(arena, clump, &ci) < 0){ + fprint(2, "%s: can't write correct clump directory: %r\n", arena->name); + err |= SyncFixErr; + } + } + trace(TraceProc, "syncarena unindexed clump %V %d", cl.info.score, arena->memstats.clumps); + + arena->memstats.uncsize += cl.info.uncsize; + if(cl.info.size < cl.info.uncsize) + arena->memstats.cclumps++; + } + + if(flush){ + trace(TraceProc, "syncarena flush"); + arena->wtime = now(); + if(arena->ctime == 0 && arena->memstats.clumps) + arena->ctime = arena->wtime; + flushdcache(); + } + + if(used != arena->memstats.used + || clumps != arena->memstats.clumps + || cclumps != arena->memstats.cclumps + || uncsize != arena->memstats.uncsize) + err |= SyncHeader; + if(start && (err&SyncHeader)){ + trace(TraceProc, "syncarena setdcachestate"); + as.arena = arena; + as.aa = start+arena->memstats.used; + as.stats = 
arena->memstats; + setdcachestate(&as); + } + + return err; +} + +static int +writeclumphead(Arena *arena, u64int aa, Clump *cl) +{ + ZBlock *zb; + int bad; + + zb = alloczblock(ClumpSize, 0, arena->blocksize); + if(zb == nil) + return -1; + bad = packclump(cl, zb->data, arena->clumpmagic)<0 + || writearena(arena, aa, zb->data, ClumpSize) != ClumpSize; + freezblock(zb); + return bad ? -1 : 0; +} + +static int +writeclumpmagic(Arena *arena, u64int aa, u32int magic) +{ + u8int buf[U32Size]; + + packmagic(magic, buf); + return writearena(arena, aa, buf, U32Size) == U32Size ? 0 : -1; /* 0 if all U32Size bytes were written, else -1; the caller tests < 0 */ +} diff --git a/src/cmd/venti/srv/syncindex.c b/src/cmd/venti/srv/syncindex.c new file mode 100644 index 00000000..b35ca2a4 --- /dev/null +++ b/src/cmd/venti/srv/syncindex.c @@ -0,0 +1,73 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int verbose; +void +usage(void) +{ + fprint(2, "usage: syncindex [-fv] [-B blockcachesize] config\n"); + threadexitsall("usage"); +} + +Config conf; + +void +threadmain(int argc, char *argv[]) +{ + u32int bcmem, icmem; + int fix; + + fix = 0; + bcmem = 0; + icmem = 0; + ARGBEGIN{ + case 'B': + bcmem = unittoull(EARGF(usage())); + break; + case 'I': + icmem = unittoull(EARGF(usage())); + break; + case 'f': + fix++; + break; + case 'v': + verbose++; + break; + default: + usage(); + break; + }ARGEND + + if(!fix) + readonly = 1; + + if(argc != 1) + usage(); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + initlumpcache(1*1024*1024, 1024/8); + icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth); + if(icmem < 4) + icmem = 4; + if(1) fprint(2, "initialize %d bytes of index cache for %d index entries\n", + (sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * 
ICacheDepth, + (1 << icmem) * ICacheDepth); + initicache(icmem, ICacheDepth); + initicachewrite(); + if(mainindex->bloom) + startbloomproc(mainindex->bloom); + + if(verbose) + printindex(2, mainindex); + if(syncindex(mainindex, fix, 1, 0) < 0) + sysfatal("failed to sync index=%s: %r\n", mainindex->name); + + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/syncindex0.c b/src/cmd/venti/srv/syncindex0.c new file mode 100644 index 00000000..12b69ed2 --- /dev/null +++ b/src/cmd/venti/srv/syncindex0.c @@ -0,0 +1,167 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +enum +{ + ClumpChunks = 32*1024 +}; + +static int missing, wrong; + +/* + * shell sort is plenty good enough + * because we're going to do a bunch of disk i/o's + */ +static void +sortclumpinfo(ClumpInfo *ci, int *s, int n) +{ + int i, j, m, t; + + for(m = (n + 3) / 5; m > 0; m = (m + 1) / 3){ + for(i = n - m; i-- > 0;){ + for(j = i + m; j < n; j += m){ + if(memcmp(ci[s[j - m]].score, ci[s[j]].score, VtScoreSize) <= 0) + break; + t = s[j]; + s[j] = s[j - m]; + s[j - m] = t; + } + } + } +} + +int +syncarenaindex(Index *ix, Arena *arena, u32int clump, u64int a, int fix, int *pflush, int check) +{ + Packet *pack; + IEntry ie; + IAddr ia; + ClumpInfo *ci, *cis; + u64int *addrs; + int i, n, ok, *s, flush; + + trace(TraceProc, "syncarenaindex enter"); + + flush = 0; + cis = MKN(ClumpInfo, ClumpChunks); + addrs = MKN(u64int, ClumpChunks); + s = MKN(int, ClumpChunks); + ok = 0; + for(; clump < arena->memstats.clumps; clump += n){ + n = ClumpChunks; + if(n > arena->memstats.clumps - clump) + n = arena->memstats.clumps - clump; + n = readclumpinfos(arena, clump, cis, n); + if(n <= 0){ + fprint(2, "arena directory read failed\n"); + ok = -1; + break; + } + + for(i = 0; i < n; i++){ + addrs[i] = a; + a += cis[i].size + ClumpSize; + s[i] = i; + } + + sortclumpinfo(cis, s, n); + + for(i = 0; i < n; i++){ + ci = &cis[s[i]]; + ia.type = ci->type; + ia.size = ci->uncsize; + ia.addr = addrs[s[i]]; + ia.blocks 
= (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; + + if(!check) + goto Add; + if(loadientry(ix, ci->score, ci->type, &ie) < 0){ + trace(TraceProc, "syncarenaindex missing block %V.%d", ci->score, ci->type); + missing++; + if(0) fprint(2, "missing block type=%d score=%V\n", ci->type, ci->score); + }else if(iaddrcmp(&ia, &ie.ia) != 0){ + trace(TraceProc, "syncarenaindex mismatched entry"); + fprint(2, "\nmismatched index entry and clump at %d\n", clump + i); + fprint(2, "\tclump: type=%d size=%d blocks=%d addr=%lld\n", ia.type, ia.size, ia.blocks, ia.addr); + fprint(2, "\tindex: type=%d size=%d block=%d addr=%lld\n", ie.ia.type, ie.ia.size, ie.ia.blocks, ie.ia.addr); + pack = readlump(ie.score, ie.ia.type, ie.ia.size, nil); + packetfree(pack); + if(pack != nil){ + fprint(2, "duplicated lump\n"); + continue; + } + wrong++; + }else + continue; + Add: + if(!fix){ + ok = -1; + continue; + } + flush = 1; + trace(TraceProc, "syncarenaindex insert %V", ci->score); + insertscore(ci->score, &ia, 1); + } + + if(0 && clump / 1000 != (clump + n) / 1000) + fprint(2, "."); + } + free(cis); + free(addrs); + free(s); + if(flush){ + flushdcache(); + *pflush = 1; + } + return ok; +} + +int +syncindex(Index *ix, int fix, int mustflush, int check) +{ + Arena *arena; + u64int a; + u32int clump; + int i, e, e1, ok, ok1, flush; + + ok = 0; + flush = 0; + for(i = 0; i < ix->narenas; i++){ + trace(TraceProc, "syncindex start %d", i); + arena = ix->arenas[i]; + clump = arena->memstats.clumps; + a = arena->memstats.used; + e = syncarena(arena, ix->amap[i].start, TWID32, fix, fix); + e1 = e; + if(fix) + e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr); + if(e1 == SyncHeader) + fprint(2, "arena %s: header is out-of-date\n", arena->name); + if(e1) + ok = -1; + else{ + ok1 = syncarenaindex(ix, arena, clump, a + ix->amap[i].start, fix, &flush, check); + if(ok1 < 0) + fprint(2, "syncarenaindex: %r\n"); + if(fix && ok1==0 && (e & SyncHeader) && wbarena(arena) < 0) + fprint(2, "arena=%s header 
write failed: %r\n", arena->name); + ok |= ok1; + } + } + if(missing || wrong) + fprint(2, "syncindex: %d missing entries, %d wrong entries (flush=%d)\n", missing, wrong, flush); + if(fix && wbindex(ix) < 0){ + fprint(2, "can't write back index header for %s: %r\n", ix->name); + return -1; + } + if(fix && flush){ + flushdcache(); + if(mustflush){ + flushicache(); + flushdcache(); + }else + kickicache(); + } + return ok; +} diff --git a/src/cmd/venti/srv/trace.c b/src/cmd/venti/srv/trace.c new file mode 100644 index 00000000..f8669b3d --- /dev/null +++ b/src/cmd/venti/srv/trace.c @@ -0,0 +1,38 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +char TraceDisk[] = "disk"; +char TraceLump[] = "lump"; +char TraceBlock[] = "block"; +char TraceProc[] = "proc"; +char TraceWork[] = "work"; +char TraceQuiet[] = "quiet"; +char TraceRpc[] = "rpc"; + +void +trace(char *level, char *fmt, ...) +{ + char buf[512]; + va_list arg; + + if(level == nil || !ventilogging) + return; + va_start(arg, fmt); + vsnprint(buf, sizeof buf, fmt, arg); + va_end(arg); + vtlog(level, "<font size=-1>%T %s:</font> %s<br>\n", + threadgetname(), buf); + vtlog("all", "<font size=-1>%T <font color=#777777>%s</font> %s:</font> %s<br>\n", + level, threadgetname(), buf); +} + +void +traceinit(void) +{ +} + +void +settrace(char *trace) +{ +} diff --git a/src/cmd/venti/srv/unittoull.c b/src/cmd/venti/srv/unittoull.c new file mode 100644 index 00000000..1f741170 --- /dev/null +++ b/src/cmd/venti/srv/unittoull.c @@ -0,0 +1,30 @@ +#include "stdinc.h" + +#define TWID64 ((u64int)~(u64int)0) + +u64int +unittoull(char *s) +{ + char *es; + u64int n; + + if(s == nil) + return TWID64; + n = strtoul(s, &es, 0); + if(*es == 'k' || *es == 'K'){ + n *= 1024; + es++; + }else if(*es == 'm' || *es == 'M'){ + n *= 1024*1024; + es++; + }else if(*es == 'g' || *es == 'G'){ + n *= 1024*1024*1024; + es++; + }else if(*es == 't' || *es == 'T'){ + n *= (u64int)1024*1024*1024*1024; /* 2^40, computed in 64 bits to avoid int overflow */ + es++; /* consume the suffix like the k/m/g cases, else the check below rejects every 't' value */ + } + if(*es != '\0') + return TWID64; + 
return n; +} diff --git a/src/cmd/venti/srv/unwhack.c b/src/cmd/venti/srv/unwhack.c new file mode 100644 index 00000000..5530bd07 --- /dev/null +++ b/src/cmd/venti/srv/unwhack.c @@ -0,0 +1,179 @@ +#include "stdinc.h" +#include "whack.h" + +enum +{ + DMaxFastLen = 7, + DBigLenCode = 0x3c, /* minimum code for large lenth encoding */ + DBigLenBits = 6, + DBigLenBase = 1 /* starting items to encode for big lens */ +}; + +static uchar lenval[1 << (DBigLenBits - 1)] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, + 5, + 6, + 255, + 255 +}; + +static uchar lenbits[] = +{ + 0, 0, 0, + 2, 3, 5, 5, +}; + +static uchar offbits[16] = +{ + 5, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 12, 13 +}; + +static ushort offbase[16] = +{ + 0, 0x20, + 0x40, 0x60, + 0x80, 0xc0, + 0x100, 0x180, + 0x200, 0x300, + 0x400, 0x600, + 0x800, 0xc00, + 0x1000, + 0x2000 +}; + +void +unwhackinit(Unwhack *uw) +{ + uw->err[0] = '\0'; +} + +int +unwhack(Unwhack *uw, uchar *dst, int ndst, uchar *src, int nsrc) +{ + uchar *s, *d, *dmax, *smax, lit; + ulong uwbits, lithist; + int i, off, len, bits, use, code, uwnbits, overbits; + + d = dst; + dmax = d + ndst; + + smax = src + nsrc; + uwnbits = 0; + uwbits = 0; + overbits = 0; + lithist = ~0; + while(src < smax || uwnbits - overbits >= MinDecode){ + while(uwnbits <= 24){ + uwbits <<= 8; + if(src < smax) + uwbits |= *src++; + else + overbits += 8; + uwnbits += 8; + } + + /* + * literal + */ + len = lenval[(uwbits >> (uwnbits - 5)) & 0x1f]; + if(len == 0){ + if(lithist & 0xf){ + uwnbits -= 9; + lit = (uwbits >> uwnbits) & 0xff; + lit &= 255; + }else{ + uwnbits -= 8; + lit = (uwbits >> uwnbits) & 0x7f; + if(lit < 32){ + if(lit < 24){ + uwnbits -= 2; + lit = (lit << 2) | ((uwbits >> uwnbits) & 3); + }else{ + uwnbits -= 3; + lit = (lit << 3) | ((uwbits >> uwnbits) & 7); + } + lit = (lit - 64) & 0xff; + } + } + if(d >= dmax){ + snprint(uw->err, WhackErrLen, "too much output"); + return -1; + } + *d++ = lit; + lithist = 
(lithist << 1) | (lit < 32) | (lit > 127); + continue; + } + + /* + * length + */ + if(len < 255) + uwnbits -= lenbits[len]; + else{ + uwnbits -= DBigLenBits; + code = ((uwbits >> uwnbits) & ((1 << DBigLenBits) - 1)) - DBigLenCode; + len = DMaxFastLen; + use = DBigLenBase; + bits = (DBigLenBits & 1) ^ 1; + while(code >= use){ + len += use; + code -= use; + code <<= 1; + uwnbits--; + if(uwnbits < 0){ + snprint(uw->err, WhackErrLen, "len out of range"); + return -1; + } + code |= (uwbits >> uwnbits) & 1; + use <<= bits; + bits ^= 1; + } + len += code; + + while(uwnbits <= 24){ + uwbits <<= 8; + if(src < smax) + uwbits |= *src++; + else + overbits += 8; + uwnbits += 8; + } + } + + /* + * offset + */ + uwnbits -= 4; + bits = (uwbits >> uwnbits) & 0xf; + off = offbase[bits]; + bits = offbits[bits]; + + uwnbits -= bits; + off |= (uwbits >> uwnbits) & ((1 << bits) - 1); + off++; + + if(off > d - dst){ + snprint(uw->err, WhackErrLen, "offset out of range: off=%d d=%ld len=%d nbits=%d", off, d - dst, len, uwnbits); + return -1; + } + if(d + len > dmax){ + snprint(uw->err, WhackErrLen, "len out of range"); + return -1; + } + s = d - off; + for(i = 0; i < len; i++) + d[i] = s[i]; + d += len; + } + if(uwnbits < overbits){ + snprint(uw->err, WhackErrLen, "compressed data overrun"); + return -1; + } + + len = d - dst; + + return len; +} diff --git a/src/cmd/venti/srv/utils.c b/src/cmd/venti/srv/utils.c new file mode 100644 index 00000000..03fd9065 --- /dev/null +++ b/src/cmd/venti/srv/utils.c @@ -0,0 +1,252 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int +namecmp(char *s, char *t) +{ + return strncmp(s, t, ANameSize); +} + +void +namecp(char *dst, char *src) +{ + strncpy(dst, src, ANameSize - 1); + dst[ANameSize - 1] = '\0'; +} + +int +nameok(char *name) +{ + char *t; + int c; + + if(name == nil) + return -1; + for(t = name; c = *t; t++) + if(t - name >= ANameSize + || c < ' ' || c >= 0x7f) + return -1; + return 0; +} + +int +stru32int(char *s, u32int *r) +{ + 
char *t; + u32int n, nn, m; + int c; + + m = TWID32 / 10; + n = 0; + for(t = s; ; t++){ + c = *t; + if(c < '0' || c > '9') + break; + if(n > m) + return -1; + nn = n * 10 + c - '0'; + if(nn < n) + return -1; + n = nn; + } + *r = n; + return s != t && *t == '\0'; +} + +int +stru64int(char *s, u64int *r) +{ + char *t; + u64int n, nn, m; + int c; + + m = TWID64 / 10; + n = 0; + for(t = s; ; t++){ + c = *t; + if(c < '0' || c > '9') + break; + if(n > m) + return -1; + nn = n * 10 + c - '0'; + if(nn < n) + return -1; + n = nn; + } + *r = n; + return s != t && *t == '\0'; +} + +int +vttypevalid(int type) +{ + return type < VtMaxType; +} + +static char* +logit(int severity, char *fmt, va_list args) +{ + char *s; + + s = vsmprint(fmt, args); + if(s == nil) + return nil; + if(argv0 != nil) /* prefix with the program name only when it is known */ + fprint(2, "%s: err %d: %s\n", argv0, severity, s); + else + fprint(2, "err %d: %s\n", severity, s); + return s; +} + +void +seterr(int severity, char *fmt, ...) +{ + char *s; + va_list args; + + va_start(args, fmt); + s = logit(severity, fmt, args); + va_end(args); + if(s == nil) + werrstr("error setting error"); + else{ + werrstr("%s", s); + free(s); + } +} + +void +logerr(int severity, char *fmt, ...) 
+{ + char *s; + va_list args; + + va_start(args, fmt); + s = logit(severity, fmt, args); + va_end(args); + free(s); +} + +u32int +now(void) +{ + return time(nil); +} + +int abortonmem = 1; + +void * +emalloc(ulong n) +{ + void *p; + + p = malloc(n); + if(p == nil){ + if(abortonmem) + abort(); + sysfatal("out of memory allocating %lud", n); + } + memset(p, 0xa5, n); +if(0)print("emalloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&n)); + return p; +} + +void * +ezmalloc(ulong n) +{ + void *p; + + p = malloc(n); + if(p == nil){ + if(abortonmem) + abort(); + sysfatal("out of memory allocating %lud", n); + } + memset(p, 0, n); +if(0)print("ezmalloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&n)); + return p; +} + +void * +erealloc(void *p, ulong n) +{ + p = realloc(p, n); + if(p == nil){ + if(abortonmem) + abort(); + sysfatal("out of memory allocating %lud", n); + } +if(0)print("erealloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&p)); + return p; +} + +char * +estrdup(char *s) +{ + char *t; + int n; + + n = strlen(s) + 1; + t = emalloc(n); + memmove(t, s, n); +if(0)print("estrdup %p-%p by %lux\n", t, (char*)t+n, getcallerpc(&s)); + return t; +} + +/* + * return floor(log2(v)) + */ +int +u64log2(u64int v) +{ + int i; + + for(i = 0; i < 64; i++) + if((v >> i) <= 1) + break; + return i; +} + +int +vtproc(void (*fn)(void*), void *arg) +{ + proccreate(fn, arg, 256*1024); + return 0; +} + +int +ientryfmt(Fmt *fmt) +{ + IEntry *ie; + + ie = va_arg(fmt->args, IEntry*); + return fmtprint(fmt, "%V %22lld %3d %5d %3d", + ie->score, ie->ia.addr, ie->ia.type, ie->ia.size, ie->ia.blocks); +} + +void +ventifmtinstall(void) +{ + fmtinstall('F', vtfcallfmt); + fmtinstall('H', encodefmt); + fmtinstall('I', ientryfmt); + fmtinstall('V', vtscorefmt); +} + +uint +msec(void) +{ + return nsec()/1000000; +} + +uint +countbits(uint n) +{ + n = (n&0x55555555)+((n>>1)&0x55555555); + n = (n&0x33333333)+((n>>2)&0x33333333); + n = (n&0x0F0F0F0F)+((n>>4)&0x0F0F0F0F); + n = 
(n&0x00FF00FF)+((n>>8)&0x00FF00FF); + n = (n&0x0000FFFF)+((n>>16)&0x0000FFFF); + return n; +} diff --git a/src/cmd/venti/srv/venti.c b/src/cmd/venti/srv/venti.c new file mode 100644 index 00000000..91ded95c --- /dev/null +++ b/src/cmd/venti/srv/venti.c @@ -0,0 +1,266 @@ +#ifdef PLAN9PORT +#include <u.h> +#include <signal.h> +#endif +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +#include "whack.h" + +int debug; +int nofork; +int mainstacksize = 256*1024; +VtSrv *ventisrv; + +static void ventiserver(void*); + +void +usage(void) +{ + fprint(2, "usage: venti [-dw] [-a ventiaddress] [-h httpaddress] [-c config] [-C cachesize] [-I icachesize] [-B blockcachesize]\n"); + threadexitsall("usage"); +} +void +threadmain(int argc, char *argv[]) +{ + char *configfile, *haddr, *vaddr, *webroot; + u32int mem, icmem, bcmem, minbcmem; + Config config; + + traceinit(); + threadsetname("main"); + vaddr = nil; + haddr = nil; + configfile = nil; + webroot = nil; + mem = 0; + icmem = 0; + bcmem = 0; + ARGBEGIN{ + case 'a': + vaddr = EARGF(usage()); + break; + case 'B': + bcmem = unittoull(EARGF(usage())); + break; + case 'c': + configfile = EARGF(usage()); + break; + case 'C': + mem = unittoull(EARGF(usage())); + break; + case 'D': + settrace(EARGF(usage())); + break; + case 'd': + debug = 1; + nofork = 1; + break; + case 'h': + haddr = EARGF(usage()); + break; + case 'I': + icmem = unittoull(EARGF(usage())); + break; + case 'L': + ventilogging = 1; + break; + case 's': + nofork = 1; + break; + case 'W': + webroot = EARGF(usage()); + break; + default: + usage(); + }ARGEND + + if(argc) + usage(); + + if(!nofork) + rfork(RFNOTEG); + +#ifdef PLAN9PORT + { + /* sigh - needed to avoid signals when writing to hungup networks */ + struct sigaction sa; + memset(&sa, 0, sizeof sa); + sa.sa_handler = SIG_IGN; + sigaction(SIGPIPE, &sa, nil); + } +#endif + + trace(TraceQuiet, "venti started"); + fprint(2, "venti: "); + + ventifmtinstall(); + if(configfile == nil) + configfile = 
"venti.conf"; + + if(initarenasum() < 0) + fprint(2, "warning: can't initialize arena summing process: %r"); + + fprint(2, "conf..."); + if(initventi(configfile, &config) < 0) + sysfatal("can't init server: %r"); + + if(mem == 0) + mem = config.mem; + if(bcmem == 0) + bcmem = config.bcmem; + if(icmem == 0) + icmem = config.icmem; + if(haddr == nil) + haddr = config.haddr; + if(vaddr == nil) + vaddr = config.vaddr; + if(vaddr == nil) + vaddr = "tcp!*!venti"; + if(webroot == nil) + webroot = config.webroot; + if(queuewrites == 0) + queuewrites = config.queuewrites; + + if(haddr){ + fprint(2, "httpd %s...", haddr); + if(httpdinit(haddr, webroot) < 0) + fprint(2, "warning: can't start http server: %r"); + } + + fprint(2, "init..."); + + if(mem == 0xffffffffUL) + mem = 1 * 1024 * 1024; + if(0) fprint(2, "initialize %d bytes of lump cache for %d lumps\n", + mem, mem / (8 * 1024)); + initlumpcache(mem, mem / (8 * 1024)); + + icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth); + if(icmem < 4) + icmem = 4; + if(0) fprint(2, "initialize %d bytes of index cache for %d index entries\n", + (sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth, + (1 << icmem) * ICacheDepth); + initicache(icmem, ICacheDepth); + initicachewrite(); + + /* + * need a block for every arena and every process + */ + minbcmem = maxblocksize * + (mainindex->narenas + mainindex->nsects*4 + 16); + if(bcmem < minbcmem) + bcmem = minbcmem; + + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + if(mainindex->bloom) + startbloomproc(mainindex->bloom); + + fprint(2, "sync..."); + if(syncindex(mainindex, 1, 0, 0) < 0) + sysfatal("can't sync server: %r"); + + if(queuewrites){ + fprint(2, "queue..."); + if(initlumpqueues(mainindex->nsects) < 0){ + fprint(2, "can't initialize lump queues," + " disabling write queueing: %r"); + queuewrites = 0; + } + } + + fprint(2, "announce %s...", vaddr); + ventisrv = vtlisten(vaddr); + if(ventisrv == nil) + 
sysfatal("can't announce %s: %r", vaddr); + + fprint(2, "serving.\n"); + if(nofork) + ventiserver(nil); + else + vtproc(ventiserver, nil); +} + +static void +vtrerror(VtReq *r, char *error) +{ + r->rx.msgtype = VtRerror; + r->rx.error = estrdup(error); +} + +static void +ventiserver(void *v) +{ + Packet *p; + VtReq *r; + char err[ERRMAX]; + uint ms; + int cached, ok; + + USED(v); + threadsetname("ventiserver"); + trace(TraceWork, "start"); + while((r = vtgetreq(ventisrv)) != nil){ + trace(TraceWork, "finish"); + trace(TraceWork, "start request %F", &r->tx); + trace(TraceRpc, "<- %F", &r->tx); + r->rx.msgtype = r->tx.msgtype+1; + addstat(StatRpcTotal, 1); + // print("req (arenas[0]=%p sects[0]=%p) %F\n", + // mainindex->arenas[0], mainindex->sects[0], &r->tx); + switch(r->tx.msgtype){ + default: + vtrerror(r, "unknown request"); + break; + case VtTread: + ms = msec(); + r->rx.data = readlump(r->tx.score, r->tx.blocktype, r->tx.count, &cached); + ms = msec() - ms; + addstat2(StatRpcRead, 1, StatRpcReadTime, ms); + if(r->rx.data == nil){ + addstat(StatRpcReadFail, 1); + rerrstr(err, sizeof err); + vtrerror(r, err); + }else{ + addstat(StatRpcReadBytes, packetsize(r->rx.data)); + addstat(StatRpcReadOk, 1); + if(cached) + addstat2(StatRpcReadCached, 1, StatRpcReadCachedTime, ms); + else + addstat2(StatRpcReadUncached, 1, StatRpcReadUncachedTime, ms); + } + break; + case VtTwrite: + p = r->tx.data; + r->tx.data = nil; + addstat(StatRpcWriteBytes, packetsize(p)); + ms = msec(); + ok = writelump(p, r->rx.score, r->tx.blocktype, 0, ms); + ms = msec() - ms; + addstat2(StatRpcWrite, 1, StatRpcWriteTime, ms); + + if(ok < 0){ + addstat(StatRpcWriteFail, 1); + rerrstr(err, sizeof err); + vtrerror(r, err); + } + break; + case VtTsync: + flushqueue(); + flushdcache(); + break; + } + trace(TraceRpc, "-> %F", &r->rx); + vtrespond(r); + trace(TraceWork, "start"); + } + flushdcache(); + flushicache(); + threadexitsall(0); +} + + diff --git a/src/cmd/venti/srv/verifyarena.c 
b/src/cmd/venti/srv/verifyarena.c new file mode 100644 index 00000000..5236c093 --- /dev/null +++ b/src/cmd/venti/srv/verifyarena.c @@ -0,0 +1,127 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int verbose; + +void +usage(void) +{ + fprint(2, "usage: verifyarena [-v]\n"); + threadexitsall(0); +} + +static void +readblock(uchar *buf, int n) +{ + int nr, m; + + for(nr = 0; nr < n; nr += m){ + m = n - nr; + m = read(0, &buf[nr], m); + if(m <= 0) + sysfatal("can't read arena from standard input: %r"); + } +} + +static void +verifyarena(void) +{ + Arena arena; + ArenaHead head; + ZBlock *b; + DigestState s; + u64int n, e; + u32int bs; + u8int score[VtScoreSize]; + + fprint(2, "verify arena from standard input\n"); + + memset(&arena, 0, sizeof arena); + memset(&s, 0, sizeof s); + + /* + * read the little bit, which will included the header + */ + bs = MaxIoSize; + b = alloczblock(bs, 0, 0); + readblock(b->data, HeadSize); + sha1(b->data, HeadSize, nil, &s); + if(unpackarenahead(&head, b->data) < 0) + sysfatal("corrupted arena header: %r"); + if(head.version != ArenaVersion4 && head.version != ArenaVersion5) + fprint(2, "warning: unknown arena version %d\n", head.version); + + /* + * now we know how much to read + * read everything but the last block, which is special + */ + e = head.size - head.blocksize; + for(n = HeadSize; n < e; n += bs){ + if(n + bs > e) + bs = e - n; + readblock(b->data, bs); + sha1(b->data, bs, nil, &s); + } + + /* + * read the last block update the sum. + * the sum is calculated assuming the slot for the sum is zero. 
+ */ + bs = head.blocksize; + readblock(b->data, bs); + sha1(b->data, bs-VtScoreSize, nil, &s); + sha1(zeroscore, VtScoreSize, nil, &s); + sha1(nil, 0, score, &s); + + /* + * validity check on the trailer + */ + arena.blocksize = head.blocksize; + if(unpackarena(&arena, b->data) < 0) + sysfatal("corrupted arena trailer: %r"); + scorecp(arena.score, &b->data[arena.blocksize - VtScoreSize]); + + if(namecmp(arena.name, head.name) != 0) + sysfatal("arena header and trailer names clash: %s vs. %s\n", head.name, arena.name); + if(arena.version != head.version) + sysfatal("arena header and trailer versions clash: %d vs. %d\n", head.version, arena.version); + arena.size = head.size - 2 * head.blocksize; + + /* + * check for no checksum or the same + */ + if(scorecmp(score, arena.score) != 0){ + if(scorecmp(zeroscore, arena.score) != 0) + fprint(2, "warning: mismatched checksums for arena=%s, found=%V calculated=%V", + arena.name, arena.score, score); + scorecp(arena.score, score); + }else + fprint(2, "matched score\n"); + + printarena(2, &arena); +} + +void +threadmain(int argc, char *argv[]) +{ + ventifmtinstall(); + statsinit(); + + ARGBEGIN{ + case 'v': + verbose++; + break; + default: + usage(); + break; + }ARGEND + + readonly = 1; + + if(argc != 0) + usage(); + + verifyarena(); + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/whack.c b/src/cmd/venti/srv/whack.c new file mode 100644 index 00000000..ecd29033 --- /dev/null +++ b/src/cmd/venti/srv/whack.c @@ -0,0 +1,331 @@ +#include "stdinc.h" +#include "whack.h" + +typedef struct Huff Huff; +int compressblocks = 1; + +enum +{ + MaxFastLen = 9, + BigLenCode = 0x1f4, /* minimum code for large lenth encoding */ + BigLenBits = 9, + BigLenBase = 4, /* starting items to encode for big lens */ + + MinOffBits = 6, + MaxOffBits = MinOffBits + 8, + + MaxLen = 2051 /* max. 
/*
 * find a string in the dictionary
 *
 * Walks the chain of earlier positions recorded for hash bucket h,
 * looking for the longest run of bytes that matches the data at *ss.
 *
 *	b	compressor state holding the hash heads and chain links
 *	ss	in: start of the data to match; out: advanced by the
 *		length of the best match found (unchanged if none)
 *	esrc	end of the source data
 *	h	hash bucket for the bytes at *ss
 *	now	position ("time") of *ss, in the units stored in b->hash
 *
 * Returns -1 if fewer than MinMatch bytes remain, otherwise the
 * distance back to the best match (0 when nothing matched).
 */
static int
whackmatch(Whack *b, uchar **ss, uchar *esrc, ulong h, ulong now)
{
	ushort then, off, last;
	int bestoff, bestlen, check;
	uchar *s, *t;

	s = *ss;
	if(esrc < s + MinMatch)
		return -1;
	/* never look for a match longer than MaxLen */
	if(s + MaxLen < esrc)
		esrc = s + MaxLen;

	bestoff = 0;
	bestlen = 0;
	check = thwmaxcheck;	/* upper bound on chain entries examined */
	last = 0;
	for(then = b->hash[h]; check-- > 0; then = b->next[then & (WhackMaxOff - 1)]){
		/* distance back to the candidate; the result is truncated to a ushort */
		off = now - then;
		/*
		 * stop when the distance does not grow from one entry
		 * to the next or exceeds the maximum allowed offset
		 */
		if(off <= last || off > WhackMaxOff)
			break;

		/*
		 * don't need to check for the end because
		 * 1) s too close check above
		 */
		t = s - off;
		if(s[0] == t[0] && s[1] == t[1] && s[2] == t[2]){
			/*
			 * only extend the comparison if this candidate could
			 * beat the current best: it must also agree at
			 * index bestlen
			 */
			if(!bestlen || esrc - s > bestlen && s[bestlen] == t[bestlen]){
				t += 3;
				for(s += 3; s < esrc; s++){
					if(*s != *t)
						break;
					t++;
				}
				if(s - *ss > bestlen){
					bestlen = s - *ss;
					bestoff = off;
					/* long enough; give up searching for better */
					if(bestlen > thwmaxcheck)
						break;
				}
			}
		}
		/* rewind to the start of the data for the next candidate */
		s = *ss;
		last = off;
	}
	*ss += bestlen;
	return bestoff;
}
3 multiplicative hashing + * each byte x chosen according to rules + * 1/4 < x < 3/10, 1/3 x < < 3/7, 4/7 < x < 2/3, 7/10 < x < 3/4 + * with reasonable spread between the bytes & their complements + * + * the 3 byte value appears to be as almost good as the 4 byte value, + * and might be faster on some machines + */ +/* +#define hashit(c) ((((ulong)(c) * 0x6b43a9) >> (24 - HashLog)) & HashMask) +*/ +#define hashit(c) (((((ulong)(c) & 0xffffff) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask) + +/* + * lz77 compression with single lookup in a hash table for each block + */ +int +whack(Whack *w, uchar *dst, uchar *src, int n, ulong stats[WhackStats]) +{ + uchar *s, *ss, *sss, *esrc, *half, *wdst, *wdmax; + ulong cont, code, wbits; + ushort now; + int toff, lithist, h, len, bits, use, wnbits, lits, matches, offbits, lenbits; + + if(!compressblocks || n < MinMatch) + return -1; + + wdst = dst; + wdmax = dst + n; + + now = w->begin; + s = src; + w->data = s; + + cont = (s[0] << 16) | (s[1] << 8) | s[2]; + + esrc = s + n; + half = s + (n >> 1); + wnbits = 0; + wbits = 0; + lits = 0; + matches = 0; + offbits = 0; + lenbits = 0; + lithist = ~0; + while(s < esrc){ + h = hashit(cont); + + sss = s; + toff = whackmatch(w, &sss, esrc, h, now); + ss = sss; + + len = ss - s; + for(; wnbits >= 8; wnbits -= 8){ + if(wdst >= wdmax){ + w->begin = now; + return -1; + } + *wdst++ = wbits >> (wnbits - 8); + } + if(len < MinMatch){ + toff = *s; + lithist = (lithist << 1) | toff < 32 | toff > 127; + if(lithist & 0x1e){ + wbits = (wbits << 9) | toff; + wnbits += 9; + }else if(lithist & 1){ + toff = (toff + 64) & 0xff; + if(toff < 96){ + wbits = (wbits << 10) | toff; + wnbits += 10; + }else{ + wbits = (wbits << 11) | toff; + wnbits += 11; + } + }else{ + wbits = (wbits << 8) | toff; + wnbits += 8; + } + lits++; + + /* + * speed hack + * check for compression progress, bail if none achieved + */ + if(s > half){ + if(4 * (s - src) < 5 * lits){ + w->begin = now; + return -1; + } + half = esrc; + } 
+ + if(s + MinMatch <= esrc){ + w->next[now & (WhackMaxOff - 1)] = w->hash[h]; + w->hash[h] = now; + if(s + MinMatch < esrc) + cont = (cont << 8) | s[MinMatch]; + } + now++; + s++; + continue; + } + + matches++; + + /* + * length of match + */ + if(len > MaxLen){ + len = MaxLen; + ss = s + len; + } + len -= MinMatch; + if(len < MaxFastLen){ + bits = lentab[len].bits; + wbits = (wbits << bits) | lentab[len].encode; + wnbits += bits; + lenbits += bits; + }else{ + code = BigLenCode; + bits = BigLenBits; + use = BigLenBase; + len -= MaxFastLen; + while(len >= use){ + len -= use; + code = (code + use) << 1; + use <<= (bits & 1) ^ 1; + bits++; + } + + wbits = (wbits << bits) | (code + len); + wnbits += bits; + lenbits += bits; + + for(; wnbits >= 8; wnbits -= 8){ + if(wdst >= wdmax){ + w->begin = now; + return -1; + } + *wdst++ = wbits >> (wnbits - 8); + } + } + + /* + * offset in history + */ + toff--; + for(bits = MinOffBits; toff >= (1 << bits); bits++) + ; + if(bits < MaxOffBits-1){ + wbits = (wbits << 3) | (bits - MinOffBits); + if(bits != MinOffBits) + bits--; + wnbits += bits + 3; + offbits += bits + 3; + }else{ + wbits = (wbits << 4) | 0xe | (bits - (MaxOffBits-1)); + bits--; + wnbits += bits + 4; + offbits += bits + 4; + } + wbits = (wbits << bits) | toff & ((1 << bits) - 1); + + for(; s != ss; s++){ + if(s + MinMatch <= esrc){ + h = hashit(cont); + w->next[now & (WhackMaxOff - 1)] = w->hash[h]; + w->hash[h] = now; + if(s + MinMatch < esrc) + cont = (cont << 8) | s[MinMatch]; + } + now++; + } + } + + w->begin = now; + + stats[StatBytes] += esrc - src; + stats[StatLits] += lits; + stats[StatMatches] += matches; + stats[StatLitBits] += (wdst - (dst + 2)) * 8 + wnbits - offbits - lenbits; + stats[StatOffBits] += offbits; + stats[StatLenBits] += lenbits; + + if(wnbits & 7){ + wbits <<= 8 - (wnbits & 7); + wnbits += 8 - (wnbits & 7); + } + for(; wnbits >= 8; wnbits -= 8){ + if(wdst >= wdmax) + return -1; + *wdst++ = wbits >> (wnbits - 8); + } + + stats[StatOutBytes] 
+= wdst - dst; + + return wdst - dst; +} + +int +whackblock(uchar *dst, uchar *src, int ssize) +{ + Whack w; + ulong stats[MaxStat]; + int r; + + whackinit(&w, 6); + r = whack(&w, dst, src, ssize, stats); + return r; +} diff --git a/src/cmd/venti/srv/whack.h b/src/cmd/venti/srv/whack.h new file mode 100644 index 00000000..fb966169 --- /dev/null +++ b/src/cmd/venti/srv/whack.h @@ -0,0 +1,40 @@ +typedef struct Whack Whack; +typedef struct Unwhack Unwhack; + +enum +{ + WhackStats = 8, + WhackErrLen = 64, /* max length of error message from thwack or unthwack */ + WhackMaxOff = 16*1024, /* max allowed offset */ + + HashLog = 14, + HashSize = 1<<HashLog, + HashMask = HashSize - 1, + + MinMatch = 3, /* shortest match possible */ + + MinDecode = 8, /* minimum bits to decode a match or lit; >= 8 */ + + MaxSeqMask = 8, /* number of bits in coding block mask */ + MaxSeqStart = 256 /* max offset of initial coding block */ +}; + +struct Whack +{ + ushort begin; /* time of first byte in hash */ + ushort hash[HashSize]; + ushort next[WhackMaxOff]; + uchar *data; +}; + +struct Unwhack +{ + char err[WhackErrLen]; +}; + +void whackinit(Whack*, int level); +void unwhackinit(Unwhack*); +int whack(Whack*, uchar *dst, uchar *src, int nsrc, ulong stats[WhackStats]); +int unwhack(Unwhack*, uchar *dst, int ndst, uchar *src, int nsrc); + +int whackblock(uchar *dst, uchar *src, int ssize); diff --git a/src/cmd/venti/srv/wrarena.c b/src/cmd/venti/srv/wrarena.c new file mode 100644 index 00000000..4b8358ca --- /dev/null +++ b/src/cmd/venti/srv/wrarena.c @@ -0,0 +1,217 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +QLock godot; +char *host; +int readonly = 1; /* for part.c */ +int mainstacksize = 256*1024; +Channel *c; +VtConn *z; +int fast; /* and a bit unsafe; only for benchmarking */ +int haveaoffset; +int maxwrites = -1; + +typedef struct ZClump ZClump; +struct ZClump +{ + ZBlock *lump; + Clump cl; + u64int aa; +}; + +void +usage(void) +{ + fprint(2, "usage: wrarena [-h 
host] arenafile [offset]\n"); + threadexitsall("usage"); +} + +void +vtsendthread(void *v) +{ + ZClump zcl; + + USED(v); + while(recv(c, &zcl) == 1){ + if(zcl.lump == nil) + break; + if(vtwrite(z, zcl.cl.info.score, zcl.cl.info.type, zcl.lump->data, zcl.cl.info.uncsize) < 0) + sysfatal("failed writing clump %llud: %r", zcl.aa); + freezblock(zcl.lump); + } + /* + * All the send threads try to exit right when + * threadmain is calling threadexitsall. + * Either libthread or the Linux NPTL pthreads library + * can't handle this condition (I suspect NPTL but have + * not confirmed this) and we get a seg fault in exit. + * I spent a day tracking this down with no success, + * so we're going to work around it instead by just + * sitting here and waiting for the threadexitsall to + * take effect. + */ + qlock(&godot); +} + +static void +rdarena(Arena *arena, u64int offset) +{ + u64int a, aa, e; + u32int magic; + Clump cl; + uchar score[VtScoreSize]; + ZBlock *lump; + ZClump zcl; + + fprint(2, "wrarena: copying %s to venti\n", arena->name); + printarena(2, arena); + + a = arena->base; + e = arena->base + arena->size; + if(offset != ~(u64int)0) { + if(offset >= e-a) + sysfatal("bad offset %llud >= %llud\n", + offset, e-a); + aa = offset; + } else + aa = 0; + + if(maxwrites != 0) + for(; aa < e; aa += ClumpSize+cl.info.size) { + magic = clumpmagic(arena, aa); + if(magic == ClumpFreeMagic) + break; + if(magic != arena->clumpmagic) { + // fprint(2, "illegal clump magic number %#8.8ux offset %llud\n", + // magic, aa); + break; + } + lump = loadclump(arena, aa, 0, &cl, score, 0); + if(lump == nil) { + fprint(2, "clump %llud failed to read: %r\n", aa); + break; + } + if(!fast && cl.info.type != VtCorruptType) { + scoremem(score, lump->data, cl.info.uncsize); + if(scorecmp(cl.info.score, score) != 0) { + fprint(2, "clump %llud has mismatched score\n", aa); + break; + } + if(vttypevalid(cl.info.type) < 0) { + fprint(2, "clump %llud has bad type %d\n", aa, cl.info.type); + break; + 
} + } + if(z && cl.info.type != VtCorruptType){ + zcl.cl = cl; + zcl.lump = lump; + zcl.aa = aa; + send(c, &zcl); + }else + freezblock(lump); + if(maxwrites>0 && --maxwrites == 0) + break; + } + if(haveaoffset) + print("end offset %llud\n", aa); +} + +void +threadmain(int argc, char *argv[]) +{ + int i; + char *file; + Arena *arena; + u64int offset, aoffset; + Part *part; + Dir *d; + uchar buf[8192]; + ArenaHead head; + ZClump zerocl; + + qlock(&godot); + aoffset = 0; + ARGBEGIN{ + case 'f': + fast = 1; + ventidoublechecksha1 = 0; + break; + case 'h': + host = EARGF(usage()); + break; + case 'o': + haveaoffset = 1; + aoffset = strtoull(EARGF(usage()), 0, 0); + break; + case 'M': + maxwrites = atoi(EARGF(usage())); + break; + default: + usage(); + break; + }ARGEND + + offset = ~(u64int)0; + switch(argc) { + default: + usage(); + case 2: + offset = strtoull(argv[1], 0, 0); + /* fall through */ + case 1: + file = argv[0]; + } + + fmtinstall('V', vtscorefmt); + + statsinit(); + + if((d = dirstat(file)) == nil) + sysfatal("can't stat file %s: %r", file); + + part = initpart(file, OREAD); + if(part == nil) + sysfatal("can't open file %s: %r", file); + if(readpart(part, aoffset, buf, sizeof buf) < 0) + sysfatal("can't read file %s: %r", file); + + if(unpackarenahead(&head, buf) < 0) + sysfatal("corrupted arena header: %r"); + + if(aoffset+head.size > d->length) + sysfatal("arena is truncated: want %llud bytes have %llud\n", + head.size, d->length); + + partblocksize(part, head.blocksize); + initdcache(8 * MaxDiskBlock); + + arena = initarena(part, aoffset, head.size, head.blocksize); + if(arena == nil) + sysfatal("initarena: %r"); + + if(host && strcmp(host, "/dev/null") != 0){ + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + }else + z = nil; + + c = chancreate(sizeof(ZClump), 0); + for(i=0; i<12; i++) + vtproc(vtsendthread, nil); + + rdarena(arena, offset); + if(vtsync(z) < 0) + 
sysfatal("executing sync: %r"); + + memset(&zerocl, 0, sizeof zerocl); + for(i=0; i<12; i++) + send(c, &zerocl); + if(z){ + vthangup(z); + } + threadexitsall(0); +} diff --git a/src/cmd/venti/srv/xml.c b/src/cmd/venti/srv/xml.c new file mode 100644 index 00000000..e91afa05 --- /dev/null +++ b/src/cmd/venti/srv/xml.c @@ -0,0 +1,68 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include "xml.h" + +void xmlarena(Hio *hout, Arena *s, char *tag, int indent){ + xmlindent(hout, indent); + hprint(hout, "<%s", tag); + xmlaname(hout, s->name, "name"); + xmlu32int(hout, s->version, "version"); + xmlaname(hout, s->part->name, "partition"); + xmlu32int(hout, s->blocksize, "blocksize"); + xmlu64int(hout, s->base, "start"); + xmlu64int(hout, s->base+2*s->blocksize, "stop"); + xmlu32int(hout, s->ctime, "created"); + xmlu32int(hout, s->wtime, "modified"); + xmlsealed(hout, s->memstats.sealed, "sealed"); + xmlscore(hout, s->score, "score"); + xmlu32int(hout, s->memstats.clumps, "clumps"); + xmlu32int(hout, s->memstats.cclumps, "compressedclumps"); + xmlu64int(hout, s->memstats.uncsize, "data"); + xmlu64int(hout, s->memstats.used - s->memstats.clumps * ClumpSize, "compresseddata"); + xmlu64int(hout, s->memstats.used + s->memstats.clumps * ClumpInfoSize, "storage"); + hprint(hout, "/>\n"); +} + +void xmlindex(Hio *hout, Index *s, char *tag, int indent){ + int i; + xmlindent(hout, indent); + hprint(hout, "<%s", tag); + xmlaname(hout, s->name, "name"); + xmlu32int(hout, s->version, "version"); + xmlu32int(hout, s->blocksize, "blocksize"); + xmlu32int(hout, s->tabsize, "tabsize"); + xmlu32int(hout, s->buckets, "buckets"); + xmlu32int(hout, s->div, "buckdiv"); + hprint(hout, ">\n"); + xmlindent(hout, indent + 1); + hprint(hout, "<sects>\n"); + for(i = 0; i < s->nsects; i++) + xmlamap(hout, &s->smap[i], "sect", indent + 2); + xmlindent(hout, indent + 1); + hprint(hout, "</sects>\n"); + xmlindent(hout, indent + 1); + hprint(hout, "<amaps>\n"); + for(i = 0; i < s->narenas; i++) 
+ xmlamap(hout, &s->amap[i], "amap", indent + 2); + xmlindent(hout, indent + 1); + hprint(hout, "</amaps>\n"); + xmlindent(hout, indent + 1); + hprint(hout, "<arenas>\n"); + for(i = 0; i < s->narenas; i++) + xmlarena(hout, s->arenas[i], "arena", indent + 2); + xmlindent(hout, indent + 1); + hprint(hout, "</arenas>\n"); + xmlindent(hout, indent); + hprint(hout, "</%s>\n", tag); +} + +void xmlamap(Hio *hout, AMap *s, char *tag, int indent){ + xmlindent(hout, indent); + hprint(hout, "<%s", tag); + xmlaname(hout, s->name, "name"); + xmlu64int(hout, s->start, "start"); + xmlu64int(hout, s->stop, "stop"); + hprint(hout, "/>\n"); +} + diff --git a/src/cmd/venti/srv/xml.h b/src/cmd/venti/srv/xml.h new file mode 100644 index 00000000..c9e52b0b --- /dev/null +++ b/src/cmd/venti/srv/xml.h @@ -0,0 +1,11 @@ +void xmlamap(Hio *hout, AMap *v, char *tag, int indent); +void xmlarena(Hio *hout, Arena *v, char *tag, int indent); +void xmlindex(Hio *hout, Index *v, char *tag, int indent); + +void xmlaname(Hio *hout, char *v, char *tag); +void xmlscore(Hio *hout, u8int *v, char *tag); +void xmlsealed(Hio *hout, int v, char *tag); +void xmlu32int(Hio *hout, u32int v, char *tag); +void xmlu64int(Hio *hout, u64int v, char *tag); + +void xmlindent(Hio *hout, int indent); diff --git a/src/cmd/venti/srv/zblock.c b/src/cmd/venti/srv/zblock.c new file mode 100644 index 00000000..b33cdd25 --- /dev/null +++ b/src/cmd/venti/srv/zblock.c @@ -0,0 +1,93 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +fmtzbinit(Fmt *f, ZBlock *b) +{ + f->runes = 0; + f->start = b->data; + f->to = f->start; + f->stop = (char*)f->start + b->len; + f->flush = nil; + f->farg = nil; + f->nfmt = 0; +} + +#define ROUNDUP(p, n) ((void*)(((ulong)(p)+(n)-1)&~(ulong)((n)-1))) + +static char zmagic[] = "1234567890abcdefghijkl"; + +ZBlock * +alloczblock(u32int size, int zeroed, uint blocksize) +{ + uchar *p, *data; + ZBlock *b; + static ZBlock z; + int n; + + if(blocksize == 0) + blocksize = 32; /* try for 
cache line alignment */ + + n = size+32/*XXX*/+sizeof(ZBlock)+blocksize+8; + p = malloc(n); + if(p == nil){ + seterr(EOk, "out of memory"); + return nil; + } + + data = ROUNDUP(p, blocksize); + b = ROUNDUP(data+size+32/*XXX*/, 8); + if(0) fprint(2, "alloc %p-%p data %p-%p b %p-%p\n", + p, p+n, data, data+size, b, b+1); + *b = z; + b->data = data; + b->free = p; + b->len = size; + b->_size = size; + if(zeroed) + memset(b->data, 0, size); + memmove(b->data+size, zmagic, 32/*XXX*/); + return b; +} + +void +freezblock(ZBlock *b) +{ + if(b){ + if(memcmp(b->data+b->_size, zmagic, 32) != 0) + abort(); + memset(b->data+b->_size, 0, 32); + free(b->free); + } +} + +ZBlock* +packet2zblock(Packet *p, u32int size) +{ + ZBlock *b; + + if(p == nil) + return nil; + b = alloczblock(size, 0, 0); + if(b == nil) + return nil; + if(packetcopy(p, b->data, 0, size) < 0){ + freezblock(b); + return nil; + } + return b; +} + +Packet* +zblock2packet(ZBlock *zb, u32int size) +{ + Packet *p; + + if(zb == nil) + return nil; + p = packetalloc(); + packetappend(p, zb->data, size); + return p; +} + diff --git a/src/cmd/venti/srv/zeropart.c b/src/cmd/venti/srv/zeropart.c new file mode 100644 index 00000000..fe75c81d --- /dev/null +++ b/src/cmd/venti/srv/zeropart.c @@ -0,0 +1,31 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +zeropart(Part *part, int blocksize) +{ + ZBlock *b; + u64int addr; + int w; + + fprint(2, "clearing the partition\n"); +//fprint(2, "NOT!\n"); +//return; +//b=alloczblock(MaxIoSize, 1, blocksize); +//freezblock(b); + b = alloczblock(MaxIoSize, 1, blocksize); + + w = 0; + for(addr = PartBlank; addr + MaxIoSize <= part->size; addr += MaxIoSize){ + if(writepart(part, addr, b->data, MaxIoSize) < 0) + sysfatal("can't initialize %s, writing block %d failed: %r", part->name, w); + w++; + } + + for(; addr + blocksize <= part->size; addr += blocksize) + if(writepart(part, addr, b->data, blocksize) < 0) + sysfatal("can't initialize %s: %r", part->name); + + 
freezblock(b); +} diff --git a/src/cmd/venti/sync.c b/src/cmd/venti/sync.c new file mode 100644 index 00000000..9d817a72 --- /dev/null +++ b/src/cmd/venti/sync.c @@ -0,0 +1,54 @@ +#include <u.h> +#include <libc.h> +#include <thread.h> +#include <venti.h> + +char *host; +int donothing; + +void +usage(void) +{ + fprint(2, "usage: sync [-h host]\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + VtConn *z; + + fmtinstall('V', vtscorefmt); + fmtinstall('F', vtfcallfmt); + + ARGBEGIN{ + case 'h': + host = EARGF(usage()); + if(host == nil) + usage(); + break; + case 'x': + donothing = 1; + break; + default: + usage(); + break; + }ARGEND + + if(argc != 0) + usage(); + + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + if(!donothing) + if(vtsync(z) < 0) + sysfatal("vtsync: %r"); + + vthangup(z); + threadexitsall(0); +} diff --git a/src/cmd/venti/write.c b/src/cmd/venti/write.c new file mode 100644 index 00000000..c11a5a31 --- /dev/null +++ b/src/cmd/venti/write.c @@ -0,0 +1,62 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <libsec.h> +#include <thread.h> + +void +usage(void) +{ + fprint(2, "usage: write [-z] [-h host] [-t type] <datablock\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + char *host; + int dotrunc, n, type; + uchar *p, score[VtScoreSize]; + VtConn *z; + + fmtinstall('F', vtfcallfmt); + fmtinstall('V', vtscorefmt); + + host = nil; + dotrunc = 0; + type = VtDataType; + ARGBEGIN{ + case 'z': + dotrunc = 1; + break; + case 'h': + host = EARGF(usage()); + break; + case 't': + type = atoi(EARGF(usage())); + break; + default: + usage(); + break; + }ARGEND + + if(argc != 0) + usage(); + + p = vtmallocz(VtMaxLumpSize+1); + n = readn(0, p, VtMaxLumpSize+1); + if(n > VtMaxLumpSize) + sysfatal("input too big: max block size is %d", VtMaxLumpSize); + z = vtdial(host); + if(z == nil) + 
sysfatal("could not connect to server: %r"); + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + if(dotrunc) + n = vtzerotruncate(type, p, n); + if(vtwrite(z, score, type, p, n) < 0) + sysfatal("vtwrite: %r"); + vthangup(z); + print("%V\n", score); + threadexitsall(0); +} |