aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/venti
diff options
context:
space:
mode:
authorrsc <devnull@localhost>2006-07-18 15:26:33 +0000
committerrsc <devnull@localhost>2006-07-18 15:26:33 +0000
commit28b49df3542a635cca788f3de213385f3fcb6334 (patch)
treea3a30774249929e66988bf77e76df9459acb50bc /src/cmd/venti
parent686bd37d9d8db5e3b969a3aa2d5b455e0976b262 (diff)
downloadplan9port-28b49df3542a635cca788f3de213385f3fcb6334.tar.gz
plan9port-28b49df3542a635cca788f3de213385f3fcb6334.tar.bz2
plan9port-28b49df3542a635cca788f3de213385f3fcb6334.zip
assorted changes from Plan 9
Diffstat (limited to 'src/cmd/venti')
-rw-r--r--src/cmd/venti/srv/arena.c40
-rw-r--r--src/cmd/venti/srv/arenas.c6
-rw-r--r--src/cmd/venti/srv/bloom.c62
-rw-r--r--src/cmd/venti/srv/buildbuck.c6
-rw-r--r--src/cmd/venti/srv/buildindex.c1018
-rw-r--r--src/cmd/venti/srv/checkindex.c4
-rw-r--r--src/cmd/venti/srv/clump.c7
-rw-r--r--src/cmd/venti/srv/conv.c66
-rw-r--r--src/cmd/venti/srv/dat.h42
-rw-r--r--src/cmd/venti/srv/dcache.c64
-rw-r--r--src/cmd/venti/srv/disksched.c88
-rw-r--r--src/cmd/venti/srv/findscore.c2
-rw-r--r--src/cmd/venti/srv/fixarenas.c1894
-rw-r--r--src/cmd/venti/srv/fns.h9
-rw-r--r--src/cmd/venti/srv/graph.c16
-rw-r--r--src/cmd/venti/srv/httpd.c219
-rw-r--r--src/cmd/venti/srv/icache.c50
-rw-r--r--src/cmd/venti/srv/icachewrite.c36
-rw-r--r--src/cmd/venti/srv/index.c22
-rw-r--r--src/cmd/venti/srv/lump.c27
-rw-r--r--src/cmd/venti/srv/lumpcache.c13
-rw-r--r--src/cmd/venti/srv/lumpqueue.c16
-rw-r--r--src/cmd/venti/srv/mirrorarenas.c464
-rw-r--r--src/cmd/venti/srv/mkfile3
-rw-r--r--src/cmd/venti/srv/part.c260
-rw-r--r--src/cmd/venti/srv/printarenapart.c160
-rw-r--r--src/cmd/venti/srv/printarenas.c2
-rw-r--r--src/cmd/venti/srv/sortientry.c17
-rw-r--r--src/cmd/venti/srv/stats.c2
-rw-r--r--src/cmd/venti/srv/syncarena.c21
-rw-r--r--src/cmd/venti/srv/syncindex.c2
-rw-r--r--src/cmd/venti/srv/syncindex0.c16
-rw-r--r--src/cmd/venti/srv/unwhack.c2
-rw-r--r--src/cmd/venti/srv/utils.c5
-rw-r--r--src/cmd/venti/srv/venti.c6
-rw-r--r--src/cmd/venti/srv/verifyarena.c220
-rw-r--r--src/cmd/venti/srv/wrarena.c4
-rw-r--r--src/cmd/venti/srv/zblock.c6
-rw-r--r--src/cmd/venti/srv/zeropart.c4
39 files changed, 4540 insertions, 361 deletions
diff --git a/src/cmd/venti/srv/arena.c b/src/cmd/venti/srv/arena.c
index 8cfe3e5d..0121d451 100644
--- a/src/cmd/venti/srv/arena.c
+++ b/src/cmd/venti/srv/arena.c
@@ -20,6 +20,7 @@ static void sumproc(void *);
static QLock sumlock;
static Rendez sumwait;
static ASum *sumq;
+static ASum *sumqtail;
static uchar zero[8192];
int arenasumsleeptime;
@@ -257,7 +258,6 @@ writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n)
if(m > n - nn)
m = n - nn;
memmove(&b->data[off], &clbuf[nn], m);
- /* ok = writepart(arena->part, a, b->data, blocksize); */
ok = 0;
putdblock(b);
if(ok < 0){
@@ -329,7 +329,6 @@ writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64int start, u64int *pa)
if(m > n - nn)
m = n - nn;
memmove(&b->data[off], &clbuf[nn], m);
- /* ok = writepart(arena->part, a, b->data, blocksize); */
ok = 0;
putdblock(b);
if(ok < 0){
@@ -356,6 +355,7 @@ writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64int start, u64int *pa)
arena->ctime = arena->wtime;
writeclumpinfo(arena, clump, &c->info);
+ wbarena(arena);
/* set up for call to setdcachestate */
as.arena = arena;
@@ -410,6 +410,9 @@ setatailstate(AState *as)
trace(0, "setatailstate %s 0x%llux clumps %d", as->arena->name, as->aa, as->stats.clumps);
+ /*
+ * Look up as->arena to find index.
+ */
ix = mainindex;
for(i=0; i<ix->narenas; i++)
if(ix->arenas[i] == as->arena)
@@ -419,6 +422,9 @@ setatailstate(AState *as)
return;
}
+ /*
+ * Walk backward until we find the last time these were in sync.
+ */
for(j=i; --j>=0; ){
a = ix->arenas[j];
if(atailcmp(&a->diskstats, &a->memstats) == 0)
@@ -464,8 +470,12 @@ backsumarena(Arena *arena)
return;
qlock(&sumlock);
as->arena = arena;
- as->next = sumq;
- sumq = as;
+ as->next = nil;
+ if(sumq)
+ sumqtail->next = as;
+ else
+ sumq = as;
+ sumqtail = as;
rwakeup(&sumwait);
qunlock(&sumlock);
}
@@ -499,6 +509,7 @@ sumarena(Arena *arena)
DigestState s;
u64int a, e;
u32int bs;
+ int t;
u8int score[VtScoreSize];
bs = MaxIoSize;
@@ -512,7 +523,12 @@ sumarena(Arena *arena)
b = alloczblock(bs, 0, arena->part->blocksize);
e = arena->base + arena->size;
for(a = arena->base - arena->blocksize; a + arena->blocksize <= e; a += bs){
- sleep(arenasumsleeptime);
+ disksched();
+ while((t=arenasumsleeptime) == SleepForever){
+ sleep(1000);
+ disksched();
+ }
+ sleep(t);
if(a + bs > e)
bs = arena->blocksize;
if(readpart(arena->part, a, b->data, bs) < 0)
@@ -595,7 +611,7 @@ wbarenahead(Arena *arena)
b = alloczblock(arena->blocksize, 1, arena->part->blocksize);
if(b == nil){
logerr(EAdmin, "can't write arena header: %r");
-/*/ZZZ add error message? */
+/* ZZZ add error message? */
return -1;
}
/*
@@ -681,18 +697,22 @@ okarena(Arena *arena)
ok = 0;
dsize = arenadirsize(arena, arena->diskstats.clumps);
if(arena->diskstats.used + dsize > arena->size){
- seterr(ECorrupt, "arena used > size");
+ seterr(ECorrupt, "arena %s used > size", arena->name);
ok = -1;
}
if(arena->diskstats.cclumps > arena->diskstats.clumps)
- logerr(ECorrupt, "arena has more compressed clumps than total clumps");
+ logerr(ECorrupt, "arena %s has more compressed clumps than total clumps", arena->name);
+ /*
+ * This need not be true if some of the disk is corrupted.
+ *
if(arena->diskstats.uncsize + arena->diskstats.clumps * ClumpSize + arena->blocksize < arena->diskstats.used)
- logerr(ECorrupt, "arena uncompressed size inconsistent with used space %lld %d %lld", arena->diskstats.uncsize, arena->diskstats.clumps, arena->diskstats.used);
+ logerr(ECorrupt, "arena %s uncompressed size inconsistent with used space %lld %d %lld", arena->name, arena->diskstats.uncsize, arena->diskstats.clumps, arena->diskstats.used);
+ */
if(arena->ctime > arena->wtime)
- logerr(ECorrupt, "arena creation time after last write time");
+ logerr(ECorrupt, "arena %s creation time after last write time", arena->name);
return ok;
}
diff --git a/src/cmd/venti/srv/arenas.c b/src/cmd/venti/srv/arenas.c
index 0bffd3cf..05cb396b 100644
--- a/src/cmd/venti/srv/arenas.c
+++ b/src/cmd/venti/srv/arenas.c
@@ -214,7 +214,7 @@ wbarenapart(ArenaPart *ap)
return -1;
b = alloczblock(HeadSize, 1, 0);
if(b == nil)
-/*ZZZ set error message? */
+/* ZZZ set error message? */
return -1;
if(packarenapart(ap, b->data) < 0){
@@ -337,8 +337,8 @@ wbarenamap(AMap *am, int n, Part *part, u64int base, u64int size)
/*
* amap: n '\n' amapelem * n
* n: u32int
- * amapelem: name '\t' astart '\t' asize '\n'
- * astart, asize: u64int
+ * amapelem: name '\t' astart '\t' astop '\n'
+ * astart, astop: u64int
*/
int
parseamap(IFile *f, AMapN *amn)
diff --git a/src/cmd/venti/srv/bloom.c b/src/cmd/venti/srv/bloom.c
index e54e3885..7ea5f640 100644
--- a/src/cmd/venti/srv/bloom.c
+++ b/src/cmd/venti/srv/bloom.c
@@ -7,6 +7,8 @@
#include "dat.h"
#include "fns.h"
+int ignorebloom;
+
int
bloominit(Bloom *b, vlong vsize, u8int *data)
{
@@ -24,6 +26,7 @@ bloominit(Bloom *b, vlong vsize, u8int *data)
if(unpackbloomhead(b, data) < 0)
return -1;
+fprint(2, "bloom size %lud nhash %d\n", b->size, b->nhash);
b->mask = b->size-1;
b->data = data;
return 0;
@@ -38,11 +41,7 @@ wbbloomhead(Bloom *b)
Bloom*
readbloom(Part *p)
{
- int i, n;
- uint ones;
uchar buf[512];
- uchar *data;
- u32int *a;
Bloom *b;
b = vtmallocz(sizeof *b);
@@ -52,14 +51,40 @@ readbloom(Part *p)
vtfree(b);
return nil;
}
+ b->part = p;
+ return b;
+}
+
+int
+resetbloom(Bloom *b)
+{
+ uchar *data;
+
data = vtmallocz(b->size);
- if(readpart(p, 0, data, b->size) < 0){
+fprint(2, "bloom data %lud\n", b->size);
+ b->data = data;
+ if(b->size == MaxBloomSize) /* 2^32 overflows ulong */
+ addstat(StatBloomBits, b->size*8-1);
+ else
+ addstat(StatBloomBits, b->size*8);
+ return 0;
+}
+
+int
+loadbloom(Bloom *b)
+{
+ int i, n;
+ uint ones;
+ uchar *data;
+ u32int *a;
+
+ data = vtmallocz(b->size);
+ if(readpart(b->part, 0, data, b->size) < 0){
vtfree(b);
vtfree(data);
- return nil;
+ return -1;
}
b->data = data;
- b->part = p;
a = (u32int*)b->data;
n = b->size/4;
@@ -73,7 +98,7 @@ readbloom(Part *p)
else
addstat(StatBloomBits, b->size*8);
- return b;
+ return 0;
}
int
@@ -101,6 +126,8 @@ gethashes(u8int *score, ulong *h)
a ^= *(u32int*)(score+i);
b ^= *(u32int*)(score+i+4);
}
+ if(i+4 <= VtScoreSize) /* 20 is not 4-aligned */
+ a ^= *(u32int*)(score+i);
for(i=0; i<BloomMaxHash; i++, a+=b)
h[i] = a < BloomHeadSize*8 ? BloomHeadSize*8 : a;
}
@@ -154,14 +181,17 @@ inbloomfilter(Bloom *b, u8int *score)
int r;
uint ms;
- if(b == nil)
+ if(b == nil || b->data == nil)
return 1;
+ if(ignorebloom)
+ return 1;
+
ms = msec();
rlock(&b->lk);
r = _inbloomfilter(b, score);
runlock(&b->lk);
- ms = msec() - ms;
+ ms = ms - msec();
addstat2(StatBloomLookup, 1, StatBloomLookupTime, ms);
if(r)
addstat(StatBloomMiss, 1);
@@ -173,7 +203,7 @@ inbloomfilter(Bloom *b, u8int *score)
void
markbloomfilter(Bloom *b, u8int *score)
{
- if(b == nil)
+ if(b == nil || b->data == nil)
return;
rlock(&b->lk);
@@ -186,14 +216,18 @@ markbloomfilter(Bloom *b, u8int *score)
static void
bloomwriteproc(void *v)
{
+ int ret;
Bloom *b;
-
+
+ threadsetname("bloomwriteproc");
b = v;
for(;;){
recv(b->writechan, 0);
- if(writebloom(b) < 0)
+ if((ret=writebloom(b)) < 0)
fprint(2, "oops! writing bloom: %r\n");
- send(b->writedonechan, 0);
+ else
+ ret = 0;
+ sendul(b->writedonechan, ret);
}
}
diff --git a/src/cmd/venti/srv/buildbuck.c b/src/cmd/venti/srv/buildbuck.c
index 225bdc43..73f8056b 100644
--- a/src/cmd/venti/srv/buildbuck.c
+++ b/src/cmd/venti/srv/buildbuck.c
@@ -21,7 +21,7 @@ initiestream(Part *part, u64int off, u64int clumps, u32int size)
{
IEStream *ies;
-/*ZZZ out of memory? */
+/* out of memory? */
ies = MKZ(IEStream);
ies->buf = MKN(u8int, size);
ies->epos = ies->buf;
@@ -61,7 +61,7 @@ peekientry(IEStream *ies)
nn -= n;
if(nn == 0)
return nil;
-/*fprint(2, "peek %d from %llud into %p\n", nn, ies->off, ies->epos); */
+//fprint(2, "peek %d from %llud into %p\n", nn, ies->off, ies->epos);
if(readpart(ies->part, ies->off, ies->epos, nn) < 0){
seterr(EOk, "can't read sorted index entries: %r");
return nil;
@@ -101,7 +101,7 @@ buildbucket(Index *ix, IEStream *ies, IBucket *ib, uint maxdata)
b = peekientry(ies);
if(b == nil)
return TWID32;
-/*fprint(2, "b=%p ies->n=%lld ib.n=%d buck=%d score=%V\n", b, ies->n, ib->n, iebuck(ix, b, ib, ies), b); */
+/* fprint(2, "b=%p ies->n=%lld ib.n=%d buck=%d score=%V\n", b, ies->n, ib->n, iebuck(ix, b, ib, ies), b); */
if(ib->n == 0)
buck = iebuck(ix, b, ib, ies);
else{
diff --git a/src/cmd/venti/srv/buildindex.c b/src/cmd/venti/srv/buildindex.c
index e70a830d..b6866daf 100644
--- a/src/cmd/venti/srv/buildindex.c
+++ b/src/cmd/venti/srv/buildindex.c
@@ -1,164 +1,936 @@
/*
- * Rebuild the Venti index from scratch.
+ * Rebuild the index from scratch, in place.
*/
-
#include "stdinc.h"
#include "dat.h"
#include "fns.h"
-/*
- * Write a single bucket. Could profit from a big buffer here
- * so that we can absorb sporadic runs of blocks into one write,
- * avoiding disk seeks.
- */
-static int
-writebucket(Index *ix, u32int buck, IBucket *ib, ZBlock *b)
+enum
{
- ISect *is;
+ MinBufSize = 64*1024,
+ MaxBufSize = 4*1024*1024,
+};
- is = ix->sects[indexsect0(ix, buck)];
- if(buck < is->start || buck >= is->stop){
- seterr(EAdmin, "cannot find index section for bucket %lud\n", (ulong)buck);
- return -1;
- }
- buck -= is->start;
+int dumb;
+int errors;
+char **isect;
+int nisect;
+int bloom;
+int zero;
-/*
- qlock(&stats.lock);
- stats.indexwrites++;
- qunlock(&stats.lock);
-*/
- packibucket(ib, b->data, is->bucketmagic);
- return writepart(is->part, is->blockbase + ((u64int)buck << is->blocklog), b->data, is->blocksize);
-}
+u32int isectmem;
+u64int totalbuckets;
+u64int totalclumps;
+Channel *arenadonechan;
+Channel *isectdonechan;
+Index *ix;
-static int
-buildindex(Index *ix, Part *part, u64int off, u64int clumps, int zero)
-{
- IEStream *ies;
- IBucket ib, zib;
- ZBlock *z, *b;
- u32int next, buck;
- int ok;
- uint nbuck;
- u64int found = 0;
-
-/*ZZZ make buffer size configurable */
- b = alloczblock(ix->blocksize, 0, ix->blocksize);
- z = alloczblock(ix->blocksize, 1, ix->blocksize);
- ies = initiestream(part, off, clumps, 64*1024);
- if(b == nil || z == nil || ies == nil){
- ok = 0;
- goto breakout;
- return -1;
- }
- ok = 0;
- next = 0;
- memset(&ib, 0, sizeof ib);
- ib.data = b->data + IBucketSize;
- zib.data = z->data + IBucketSize;
- zib.n = 0;
- nbuck = 0;
- for(;;){
- buck = buildbucket(ix, ies, &ib, ix->blocksize-IBucketSize);
- found += ib.n;
- if(zero){
- for(; next != buck; next++){
- if(next == ix->buckets){
- if(buck != TWID32){
- fprint(2, "bucket out of range\n");
- ok = -1;
- }
- goto breakout;
- }
- if(writebucket(ix, next, &zib, z) < 0){
- fprint(2, "can't write zero bucket to buck=%d: %r", next);
- ok = -1;
- }
- }
- }
- if(buck >= ix->buckets){
- if(buck == TWID32)
- break;
- fprint(2, "bucket out of range\n");
- ok = -1;
- goto breakout;
- }
- if(writebucket(ix, buck, &ib, b) < 0){
- fprint(2, "bad bucket found=%lld: %r\n", found);
- ok = -1;
- }
- next = buck + 1;
- if(++nbuck%10000 == 0)
- fprint(2, "\t%,d buckets written...\n", nbuck);
- }
-breakout:;
- fprint(2, "wrote index with %lld entries\n", found);
- freeiestream(ies);
- freezblock(z);
- freezblock(b);
- return ok;
-}
+u64int arenaentries;
+u64int skipentries;
+u64int indexentries;
+
+static int shouldprocess(ISect*);
+static void isectproc(void*);
+static void arenapartproc(void*);
void
usage(void)
{
- fprint(2, "usage: buildindex [-Z] [-B blockcachesize] config tmppart\n");
- threadexitsall(0);
+ fprint(2, "usage: buildindex [-b] [-i isect]... [-M imem] venti.conf\n");
+ threadexitsall("usage");
}
-Config conf;
-
void
threadmain(int argc, char *argv[])
{
- Part *part;
- u64int clumps, base;
- u32int bcmem;
- int zero;
-
- zero = 1;
- bcmem = 0;
+ int fd, i, napart;
+ u32int bcmem, imem;
+ Config conf;
+ Part *p;
+
ventifmtinstall();
+ imem = 256*1024*1024;
ARGBEGIN{
- case 'B':
- bcmem = unittoull(ARGF());
+ case 'b':
+ bloom = 1;
+ break;
+ case 'i':
+ isect = vtrealloc(isect, (nisect+1)*sizeof(isect[0]));
+ isect[nisect++] = EARGF(usage());
break;
- case 'Z':
- zero = 0;
+ case 'd': /* debugging - make sure to run all 3 passes */
+ dumb = 1;
+ break;
+ case 'M':
+ imem = unittoull(EARGF(usage()));
break;
default:
usage();
break;
}ARGEND
-
- if(argc != 2)
+
+ if(argc != 1)
usage();
if(initventi(argv[0], &conf) < 0)
sysfatal("can't init venti: %r");
+ ix = mainindex;
+ if(nisect == 0 && ix->bloom)
+ bloom = 1;
+ if(bloom && ix->bloom && resetbloom(ix->bloom) < 0)
+ sysfatal("loadbloom: %r");
+ if(bloom && !ix->bloom)
+ sysfatal("-b specified but no bloom filter");
+ if(!bloom)
+ ix->bloom = nil;
+ isectmem = imem/ix->nsects;
- if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16))
- bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16);
+ /*
+ * safety first - only need read access to arenas
+ */
+ p = nil;
+ for(i=0; i<ix->narenas; i++){
+ if(ix->arenas[i]->part != p){
+ p = ix->arenas[i]->part;
+ if((fd = open(p->filename, OREAD)) < 0)
+ sysfatal("cannot reopen %s: %r", p->filename);
+ dup(fd, p->fd);
+ close(fd);
+ }
+ }
+
+ /*
+ * need a block for every arena
+ */
+ bcmem = maxblocksize * (mainindex->narenas + 16);
if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
initdcache(bcmem);
+
+ totalclumps = 0;
+ for(i=0; i<ix->narenas; i++)
+ totalclumps += ix->arenas[i]->diskstats.clumps;
+
+ totalbuckets = 0;
+ for(i=0; i<ix->nsects; i++)
+ totalbuckets += ix->sects[i]->blocks;
+ fprint(2, "%,lld clumps, %,lld buckets\n", totalclumps, totalbuckets);
+
+ /* start index procs */
+ fprint(2, "%T read index\n");
+ isectdonechan = chancreate(sizeof(void*), 0);
+ for(i=0; i<ix->nsects; i++){
+ if(shouldprocess(ix->sects[i]))
+ ix->sects[i]->writechan = chancreate(sizeof(IEntry), 0);
+ vtproc(isectproc, ix->sects[i]);
+ }
+
+ for(i=0; i<nisect; i++)
+ if(isect[i])
+ fprint(2, "warning: did not find index section %s\n", isect[i]);
+
+ /* start arena procs */
+ p = nil;
+ napart = 0;
+ arenadonechan = chancreate(sizeof(void*), 0);
+ for(i=0; i<ix->narenas; i++){
+ if(ix->arenas[i]->part != p){
+ p = ix->arenas[i]->part;
+ vtproc(arenapartproc, p);
+ napart++;
+ }
+ }
+
+ /* wait for arena procs to finish */
+ for(i=0; i<napart; i++)
+ recvp(arenadonechan);
+
+ /* tell index procs to finish */
+ for(i=0; i<ix->nsects; i++)
+ if(ix->sects[i]->writechan)
+ send(ix->sects[i]->writechan, nil);
+
+ /* wait for index procs to finish */
+ for(i=0; i<ix->nsects; i++)
+ if(ix->sects[i]->writechan)
+ recvp(isectdonechan);
+
+ if(ix->bloom && writebloom(ix->bloom) < 0)
+ fprint(2, "writing bloom filter: %r\n");
+
+ fprint(2, "%T done arenaentries=%,lld indexed=%,lld (nskip=%,lld)\n",
+ arenaentries, indexentries, skipentries);
+ threadexitsall(nil);
+}
+
+static int
+shouldprocess(ISect *is)
+{
+ int i;
+
+ if(nisect == 0)
+ return 1;
+
+ for(i=0; i<nisect; i++)
+ if(isect[i] && strcmp(isect[i], is->name) == 0){
+ isect[i] = nil;
+ return 1;
+ }
+ return 0;
+}
+
+static void
+add(u64int *a, u64int n)
+{
+ static Lock l;
+
+ lock(&l);
+ *a += n;
+ unlock(&l);
+}
+
+/*
+ * Read through an arena partition and send each of its IEntries
+ * to the appropriate index section. When finished, send on
+ * arenadonechan.
+ */
+enum
+{
+ ClumpChunks = 32*1024,
+};
+static void
+arenapartproc(void *v)
+{
+ int i, j, n, nskip, x;
+ u32int clump;
+ u64int addr, tot;
+ Arena *a;
+ ClumpInfo *ci, *cis;
+ IEntry ie;
+ Part *p;
+
+ p = v;
+ threadsetname("arenaproc %s", p->name);
+
+ nskip = 0;
+ tot = 0;
+ cis = MKN(ClumpInfo, ClumpChunks);
+ for(i=0; i<ix->narenas; i++){
+ a = ix->arenas[i];
+ if(a->part != p)
+ continue;
+ if(a->memstats.clumps)
+ fprint(2, "%T arena %s: %d entries\n",
+ a->name, a->memstats.clumps);
+ addr = ix->amap[i].start;
+ for(clump=0; clump<a->memstats.clumps; clump+=n){
+ n = ClumpChunks;
+ if(n > a->memstats.clumps - clump)
+ n = a->memstats.clumps - clump;
+ if(readclumpinfos(a, clump, cis, n) != n){
+ fprint(2, "%T arena %s: directory read: %r\n", a->name);
+ errors = 1;
+ break;
+ }
+ for(j=0; j<n; j++){
+ ci = &cis[j];
+ ie.ia.type = ci->type;
+ ie.ia.size = ci->uncsize;
+ ie.ia.addr = addr;
+ addr += ci->size + ClumpSize;
+ ie.ia.blocks = (ci->size + ClumpSize + (1<<ABlockLog)-1) >> ABlockLog;
+ scorecp(ie.score, ci->score);
+ if(ci->type == VtCorruptType)
+ nskip++;
+ else{
+ tot++;
+ x = indexsect(ix, ie.score);
+ assert(0 <= x && x < ix->nsects);
+ if(ix->sects[x]->writechan)
+ send(ix->sects[x]->writechan, &ie);
+ if(ix->bloom)
+ markbloomfilter(ix->bloom, ie.score);
+ }
+ }
+ }
+ }
+ add(&arenaentries, tot);
+ add(&skipentries, nskip);
+ sendp(arenadonechan, p);
+}
+
+/*
+ * Convert score into relative bucket number in isect.
+ * Can pass a packed ientry instead of score - score is first.
+ */
+static u32int
+score2bucket(ISect *is, uchar *score)
+{
+ u32int b;
+
+ b = hashbits(score, 32)/ix->div;
+ assert(is->start <= b && b < is->stop);
+ return b - is->start;
+}
+
+/*
+ * Convert offset in index section to bucket number.
+ */
+static u32int
+offset2bucket(ISect *is, u64int offset)
+{
+ u32int b;
+
+ assert(is->blockbase <= offset);
+ offset -= is->blockbase;
+ b = offset/is->blocksize;
+ assert(b < is->stop-is->start);
+ return b;
+}
+
+/*
+ * Convert bucket number to offset.
+ */
+static u64int
+bucket2offset(ISect *is, u32int b)
+{
+ assert(b <= is->stop-is->start);
+ return is->blockbase + (u64int)b*is->blocksize;
+}
+
+/*
+ * IEntry buffers to hold initial round of spraying.
+ */
+typedef struct Buf Buf;
+struct Buf
+{
+ Part *part; /* partition being written */
+ uchar *bp; /* current block */
+ uchar *ep; /* end of block */
+ uchar *wp; /* write position in block */
+ u64int boffset; /* start offset */
+ u64int woffset; /* next write offset */
+ u64int eoffset; /* end offset */
+ u32int nentry; /* number of entries written */
+};
+
+static void
+bflush(Buf *buf)
+{
+ u32int bufsize;
+
+ if(buf->woffset >= buf->eoffset)
+ sysfatal("buf index chunk overflow - need bufger index");
+ bufsize = buf->ep - buf->bp;
+ if(writepart(buf->part, buf->woffset, buf->bp, bufsize) < 0){
+ fprint(2, "write %s: %r\n", buf->part->name);
+ errors = 1;
+ }
+ buf->woffset += bufsize;
+ memset(buf->bp, 0, bufsize);
+ buf->wp = buf->bp;
+}
+
+static void
+bwrite(Buf *buf, IEntry *ie)
+{
+ if(buf->wp+IEntrySize > buf->ep)
+ bflush(buf);
+ assert(buf->bp <= buf->wp && buf->wp < buf->ep);
+ packientry(ie, buf->wp);
+ buf->wp += IEntrySize;
+ assert(buf->bp <= buf->wp && buf->wp <= buf->ep);
+ buf->nentry++;
+}
+
+/*
+ * Minibuffer. In-memory data structure holds our place
+ * in the buffer but has no block data. We are writing and
+ * reading the minibuffers at the same time. (Careful!)
+ */
+typedef struct Minibuf Minibuf;
+struct Minibuf
+{
+ u64int boffset; /* start offset */
+ u64int roffset; /* read offset */
+ u64int woffset; /* write offset */
+ u64int eoffset; /* end offset */
+ u32int nentry; /* # entries left to read */
+ u32int nwentry; /* # entries written */
+};
+
+/*
+ * Index entry pool. Used when trying to shuffle around
+ * the entries in a big buffer into the corresponding M minibuffers.
+ * Sized to hold M*EntriesPerBlock entries, so that there will always
+ * either be room in the pool for another block worth of entries
+ * or there will be an entire block worth of sorted entries to
+ * write out.
+ */
+typedef struct IEntryLink IEntryLink;
+typedef struct IPool IPool;
+
+struct IEntryLink
+{
+ uchar ie[IEntrySize]; /* raw IEntry */
+ IEntryLink *next; /* next in chain */
+};
+
+struct IPool
+{
+ ISect *isect;
+ u32int buck0; /* first bucket in pool */
+ u32int mbufbuckets; /* buckets per minibuf */
+ IEntryLink *entry; /* all IEntryLinks */
+ u32int nentry; /* # of IEntryLinks */
+ IEntryLink *free; /* free list */
+ u32int nfree; /* # on free list */
+ Minibuf *mbuf; /* all minibufs */
+ u32int nmbuf; /* # of minibufs */
+ IEntryLink **mlist; /* lists for each minibuf */
+ u32int *mcount; /* # on each mlist[i] */
+ u32int bufsize; /* block buffer size */
+ uchar *rbuf; /* read buffer */
+ uchar *wbuf; /* write buffer */
+ u32int epbuf; /* entries per block buffer */
+};
+
+/*
+static int
+countsokay(IPool *p)
+{
+ int i;
+ u64int n;
+
+ n = 0;
+ for(i=0; i<p->nmbuf; i++)
+ n += p->mcount[i];
+ n += p->nfree;
+ if(n != p->nentry){
+ print("free %ud:", p->nfree);
+ for(i=0; i<p->nmbuf; i++)
+ print(" %ud", p->mcount[i]);
+ print(" = %lld nentry: %ud\n", n, p->nentry);
+ }
+ return n == p->nentry;
+}
+*/
- fprint(2, "building a new index %s using %s for temporary storage\n", mainindex->name, argv[1]);
+static IPool*
+mkipool(ISect *isect, Minibuf *mbuf, u32int nmbuf,
+ u32int mbufbuckets, u32int bufsize)
+{
+ u32int i, nentry;
+ uchar *data;
+ IPool *p;
+ IEntryLink *l;
+
+ nentry = (nmbuf+1)*bufsize / IEntrySize;
+ p = ezmalloc(sizeof(IPool)
+ +nentry*sizeof(IEntry)
+ +nmbuf*sizeof(IEntryLink*)
+ +nmbuf*sizeof(u32int)
+ +3*bufsize);
+
+ p->isect = isect;
+ p->mbufbuckets = mbufbuckets;
+ p->bufsize = bufsize;
+ p->entry = (IEntryLink*)(p+1);
+ p->nentry = nentry;
+ p->mlist = (IEntryLink**)(p->entry+nentry);
+ p->mcount = (u32int*)(p->mlist+nmbuf);
+ p->nmbuf = nmbuf;
+ p->mbuf = mbuf;
+ data = (uchar*)(p->mcount+nmbuf);
+ data += bufsize - (u32int)data%bufsize;
+ p->rbuf = data;
+ p->wbuf = data+bufsize;
+ p->epbuf = bufsize/IEntrySize;
- part = initpart(argv[1], ORDWR|ODIRECT);
- if(part == nil)
- sysfatal("can't initialize temporary partition: %r");
+ for(i=0; i<p->nentry; i++){
+ l = &p->entry[i];
+ l->next = p->free;
+ p->free = l;
+ p->nfree++;
+ }
+ return p;
+}
- clumps = sortrawientries(mainindex, part, &base, mainindex->bloom);
- if(clumps == TWID64)
- sysfatal("can't build sorted index: %r");
- fprint(2, "found and sorted index entries for clumps=%lld at %lld\n", clumps, base);
+/*
+ * Add the index entry ie to the pool p.
+ * Caller must know there is room.
+ */
+static void
+ipoolinsert(IPool *p, uchar *ie)
+{
+ u32int buck, x;
+ IEntryLink *l;
+
+ assert(p->free != nil);
+
+ buck = score2bucket(p->isect, ie);
+ x = (buck-p->buck0) / p->mbufbuckets;
+ if(x >= p->nmbuf){
+ fprint(2, "buck=%ud mbufbucket=%ud x=%ud\n",
+ buck, p->mbufbuckets, x);
+ }
+ assert(x < p->nmbuf);
- if(buildindex(mainindex, part, base, clumps, zero) < 0)
- sysfatal("can't build new index: %r");
+ l = p->free;
+ p->free = l->next;
+ p->nfree--;
+ memmove(l->ie, ie, IEntrySize);
+ l->next = p->mlist[x];
+ p->mlist[x] = l;
+ p->mcount[x]++;
+}
+
+/*
+ * Pull out a block containing as many
+ * entries as possible for minibuffer x.
+ */
+static u32int
+ipoolgetbuf(IPool *p, u32int x)
+{
+ uchar *bp, *ep, *wp;
+ IEntryLink *l;
+ u32int n;
+
+ bp = p->wbuf;
+ ep = p->wbuf + p->bufsize;
+ n = 0;
+ assert(x < p->nmbuf);
+ for(wp=bp; wp+IEntrySize<=ep && p->mlist[x]; wp+=IEntrySize){
+ l = p->mlist[x];
+ p->mlist[x] = l->next;
+ p->mcount[x]--;
+ memmove(wp, l->ie, IEntrySize);
+ l->next = p->free;
+ p->free = l;
+ p->nfree++;
+ n++;
+ }
+ memset(wp, 0, ep-wp);
+ return n;
+}
+
+/*
+ * Read a block worth of entries from the minibuf
+ * into the pool. Caller must know there is room.
+ */
+static void
+ipoolloadblock(IPool *p, Minibuf *mb)
+{
+ u32int i, n;
- if(mainindex->bloom)
- writebloom(mainindex->bloom);
+ assert(mb->nentry > 0);
+ assert(mb->roffset >= mb->woffset);
+ assert(mb->roffset < mb->eoffset);
- threadexitsall(0);
+ n = p->bufsize/IEntrySize;
+ if(n > mb->nentry)
+ n = mb->nentry;
+ if(readpart(p->isect->part, mb->roffset, p->rbuf, p->bufsize) < 0)
+ fprint(2, "readpart %s: %r\n", p->isect->part->name);
+ else{
+ for(i=0; i<n; i++)
+ ipoolinsert(p, p->rbuf+i*IEntrySize);
+ }
+ mb->nentry -= n;
+ mb->roffset += p->bufsize;
}
+
+/*
+ * Write out a block worth of entries to minibuffer x.
+ * If necessary, pick up the data there before overwriting it.
+ */
+static void
+ipoolflush0(IPool *pool, u32int x)
+{
+ u32int bufsize;
+ Minibuf *mb;
+
+ mb = pool->mbuf+x;
+ bufsize = pool->bufsize;
+ mb->nwentry += ipoolgetbuf(pool, x);
+ if(mb->nentry > 0 && mb->roffset == mb->woffset){
+ assert(pool->nfree >= pool->bufsize/IEntrySize);
+ /*
+ * There will be room in the pool -- we just
+ * removed a block worth.
+ */
+ ipoolloadblock(pool, mb);
+ }
+ if(writepart(pool->isect->part, mb->woffset, pool->wbuf, bufsize) < 0)
+ fprint(2, "writepart %s: %r\n", pool->isect->part->name);
+ mb->woffset += bufsize;
+}
+
+/*
+ * Write out some full block of entries.
+ * (There must be one -- the pool is almost full!)
+ */
+static void
+ipoolflush1(IPool *pool)
+{
+ u32int i;
+
+ assert(pool->nfree <= pool->epbuf);
+
+ for(i=0; i<pool->nmbuf; i++){
+ if(pool->mcount[i] >= pool->epbuf){
+ ipoolflush0(pool, i);
+ return;
+ }
+ }
+ /* can't be reached - someone must be full */
+ sysfatal("ipoolflush1");
+}
+
+/*
+ * Flush all the entries in the pool out to disk.
+ * Nothing more to read from disk.
+ */
+static void
+ipoolflush(IPool *pool)
+{
+ u32int i;
+
+ for(i=0; i<pool->nmbuf; i++)
+ while(pool->mlist[i])
+ ipoolflush0(pool, i);
+ assert(pool->nfree == pool->nentry);
+}
+
+/*
+ * Third pass. Pick up each minibuffer from disk into
+ * memory and then write out the buckets.
+ */
+
+/*
+ * Compare two packed index entries.
+ * Usual ordering except break ties by putting higher
+ * index addresses first (assumes have duplicates
+ * due to corruption in the lower addresses).
+ */
+static int
+ientrycmpaddr(const void *va, const void *vb)
+{
+ int i;
+ uchar *a, *b;
+
+ a = (uchar*)va;
+ b = (uchar*)vb;
+ i = ientrycmp(a, b);
+ if(i)
+ return i;
+ return -memcmp(a+IEntryAddrOff, b+IEntryAddrOff, 8);
+}
+
+static void
+zerorange(Part *p, u64int o, u64int e)
+{
+ static uchar zero[MaxIoSize];
+ u32int n;
+
+ for(; o<e; o+=n){
+ n = sizeof zero;
+ if(o+n > e)
+ n = e-o;
+ if(writepart(p, o, zero, n) < 0)
+ fprint(2, "writepart %s: %r\n", p->name);
+ }
+}
+
+/*
+ * Load a minibuffer into memory and write out the
+ * corresponding buckets.
+ */
+static void
+sortminibuffer(ISect *is, Minibuf *mb, uchar *buf, u32int nbuf, u32int bufsize)
+{
+ uchar *buckdata, *p, *q, *ep;
+ u32int b, lastb, memsize, n;
+ u64int o;
+ IBucket ib;
+ Part *part;
+
+ part = is->part;
+ buckdata = emalloc(is->blocksize);
+
+ if(mb->nwentry == 0)
+ return;
+
+ /*
+ * read entire buffer.
+ */
+ assert(mb->nwentry*IEntrySize <= mb->woffset-mb->boffset);
+ assert(mb->woffset-mb->boffset <= nbuf);
+ if(readpart(part, mb->boffset, buf, mb->woffset-mb->boffset) < 0){
+ fprint(2, "readpart %s: %r\n", part->name);
+ errors = 1;
+ return;
+ }
+ assert(*(uint*)buf != 0xa5a5a5a5);
+
+ /*
+ * remove fragmentation due to IEntrySize
+ * not evenly dividing Bufsize
+ */
+ memsize = (bufsize/IEntrySize)*IEntrySize;
+ for(o=mb->boffset, p=q=buf; o<mb->woffset; o+=bufsize){
+ memmove(p, q, memsize);
+ p += memsize;
+ q += bufsize;
+ }
+ ep = buf + mb->nwentry*IEntrySize;
+ assert(ep <= buf+nbuf);
+
+ /*
+ * sort entries
+ */
+ qsort(buf, mb->nwentry, IEntrySize, ientrycmpaddr);
+
+ /*
+ * write buckets out
+ */
+ n = 0;
+ lastb = offset2bucket(is, mb->boffset);
+ for(p=buf; p<ep; p=q){
+ b = score2bucket(is, p);
+ for(q=p; q<ep && score2bucket(is, q)==b; q+=IEntrySize)
+ ;
+ if(lastb+1 < b && zero)
+ zerorange(part, bucket2offset(is, lastb+1), bucket2offset(is, b));
+ if(IBucketSize+(q-p) > is->blocksize)
+ sysfatal("bucket overflow - make index bigger");
+ memmove(buckdata+IBucketSize, p, q-p);
+ ib.n = (q-p)/IEntrySize;
+ n += ib.n;
+ packibucket(&ib, buckdata, is->bucketmagic);
+ if(writepart(part, bucket2offset(is, b), buckdata, is->blocksize) < 0)
+ fprint(2, "write %s: %r\n", part->name);
+ lastb = b;
+ }
+ if(lastb+1 < is->stop-is->start && zero)
+ zerorange(part, bucket2offset(is, lastb+1), bucket2offset(is, is->stop - is->start));
+
+ if(n != mb->nwentry)
+ fprint(2, "sortminibuffer bug: n=%ud nwentry=%ud have=%ld\n", n, mb->nwentry, (ep-buf)/IEntrySize);
+
+ free(buckdata);
+}
+
+static void
+isectproc(void *v)
+{
+ u32int buck, bufbuckets, bufsize, epbuf, i, j;
+ u32int mbufbuckets, n, nbucket, nn, space;
+ u32int nbuf, nminibuf, xminiclump, prod;
+ u64int blocksize, offset, xclump;
+ uchar *data, *p;
+ Buf *buf;
+ IEntry ie;
+ IPool *ipool;
+ ISect *is;
+ Minibuf *mbuf, *mb;
+
+ is = v;
+ blocksize = is->blocksize;
+ nbucket = is->stop - is->start;
+
+ /*
+ * Three passes:
+ * pass 1 - write index entries from arenas into
+ * large sequential sections on index disk.
+ * requires nbuf * bufsize memory.
+ *
+ * pass 2 - split each section into minibufs.
+ * requires nminibuf * bufsize memory.
+ *
+ * pass 3 - read each minibuf into memory and
+ * write buckets out.
+ * requires entries/minibuf * IEntrySize memory.
+ *
+ * The larger we set bufsize the less seeking hurts us.
+ *
+ * The fewer sections and minibufs we have, the less
+ * seeking hurts us.
+ *
+ * The fewer sections and minibufs we have, the
+ * more entries we end up with in each minibuf
+ * at the end.
+ *
+ * Shoot for using half our memory to hold each
+ * minibuf. The chance of a random distribution
+ * getting off by 2x is quite low.
+ *
+ * Once that is decided, figure out the smallest
+ * nminibuf and nsection/biggest bufsize we can use
+ * and still fit in the memory constraints.
+ */
+
+ /* expected number of clump index entries we'll see */
+ xclump = nbucket * (double)totalclumps/totalbuckets;
+
+ /* number of clumps we want to see in a minibuf */
+ xminiclump = isectmem/2/IEntrySize;
+
+ /* total number of minibufs we need */
+ prod = xclump / xminiclump;
+
+ /* if possible, skip second pass */
+ if(!dumb && prod*MinBufSize < isectmem){
+ nbuf = prod;
+ nminibuf = 1;
+ }else{
+ /* otherwise use nsection = sqrt(nmini) */
+ for(nbuf=1; nbuf*nbuf<prod; nbuf++)
+ ;
+ if(nbuf*MinBufSize > isectmem)
+ sysfatal("not enough memory");
+ nminibuf = nbuf;
+ }
+ /* size buffer to use extra memory */
+ bufsize = MinBufSize;
+ while(bufsize*2*nbuf <= isectmem && bufsize < MaxBufSize)
+ bufsize *= 2;
+ data = emalloc(nbuf*bufsize);
+ epbuf = bufsize/IEntrySize;
+
+ fprint(2, "%T %s: %,ud buckets, %,ud groups, %,ud minigroups, %,ud buffer\n",
+ is->part->name, nbucket, nbuf, nminibuf, bufsize);
+ /*
+ * Accept index entries from arena procs.
+ */
+ buf = MKNZ(Buf, nbuf);
+ p = data;
+ offset = is->blockbase;
+ bufbuckets = (nbucket+nbuf-1)/nbuf;
+ for(i=0; i<nbuf; i++){
+ buf[i].part = is->part;
+ buf[i].bp = p;
+ buf[i].wp = p;
+ p += bufsize;
+ buf[i].ep = p;
+ buf[i].boffset = offset;
+ buf[i].woffset = offset;
+ if(i < nbuf-1){
+ offset += bufbuckets*blocksize;
+ buf[i].eoffset = offset;
+ }else{
+ offset = is->blockbase + nbucket*blocksize;
+ buf[i].eoffset = offset;
+ }
+ }
+ assert(p == data+nbuf*bufsize);
+
+ n = 0;
+ while(recv(is->writechan, &ie) == 1){
+ if(ie.ia.addr == 0)
+ break;
+ buck = score2bucket(is, ie.score);
+ i = buck/bufbuckets;
+ assert(i < nbuf);
+ bwrite(&buf[i], &ie);
+ n++;
+ }
+ add(&indexentries, n);
+
+ nn = 0;
+ for(i=0; i<nbuf; i++){
+ bflush(&buf[i]);
+ buf[i].bp = nil;
+ buf[i].ep = nil;
+ buf[i].wp = nil;
+ nn += buf[i].nentry;
+ }
+ if(n != nn)
+ fprint(2, "isectproc bug: n=%ud nn=%ud\n", n, nn);
+
+ free(data);
+
+ fprint(2, "%T %s: reordering\n", is->part->name);
+
+ /*
+ * Rearrange entries into minibuffers and then
+ * split each minibuffer into buckets.
+ */
+ mbuf = MKN(Minibuf, nminibuf);
+ mbufbuckets = (bufbuckets+nminibuf-1)/nminibuf;
+ for(i=0; i<nbuf; i++){
+ /*
+ * Set up descriptors.
+ */
+ n = buf[i].nentry;
+ nn = 0;
+ offset = buf[i].boffset;
+ memset(mbuf, 0, nminibuf*sizeof(mbuf[0]));
+ for(j=0; j<nminibuf; j++){
+ mb = &mbuf[j];
+ mb->boffset = offset;
+ if(j < nminibuf-1){
+ offset += mbufbuckets*blocksize;
+ mb->eoffset = offset;
+ }else
+ mb->eoffset = buf[i].eoffset;
+ mb->roffset = mb->boffset;
+ mb->woffset = mb->boffset;
+ mb->nentry = epbuf * (mb->eoffset - mb->boffset)/bufsize;
+ if(mb->nentry > buf[i].nentry)
+ mb->nentry = buf[i].nentry;
+ buf[i].nentry -= mb->nentry;
+ nn += mb->nentry;
+ }
+ if(n != nn)
+ fprint(2, "isectproc bug2: n=%ud nn=%ud (i=%d)\n", n, nn, i);;
+ /*
+ * Rearrange.
+ */
+ if(!dumb && nminibuf == 1){
+ mbuf[0].nwentry = mbuf[0].nentry;
+ mbuf[0].woffset = buf[i].woffset;
+ }else{
+ ipool = mkipool(is, mbuf, nminibuf, mbufbuckets, bufsize);
+ ipool->buck0 = bufbuckets*i;
+ for(j=0; j<nminibuf; j++){
+ mb = &mbuf[j];
+ while(mb->nentry > 0){
+ if(ipool->nfree < epbuf){
+ ipoolflush1(ipool);
+ /* ipoolflush1 might change mb->nentry */
+ continue;
+ }
+ assert(ipool->nfree >= epbuf);
+ ipoolloadblock(ipool, mb);
+ }
+ }
+ ipoolflush(ipool);
+ nn = 0;
+ for(j=0; j<nminibuf; j++)
+ nn += mbuf[j].nwentry;
+ if(n != nn)
+ fprint(2, "isectproc bug3: n=%ud nn=%ud (i=%d)\n", n, nn, i);
+ free(ipool);
+ }
+
+ /*
+ * Make buckets.
+ */
+ space = 0;
+ for(j=0; j<nminibuf; j++)
+ if(space < mbuf[j].woffset - mbuf[j].boffset)
+ space = mbuf[j].woffset - mbuf[j].boffset;
+
+ data = emalloc(space);
+ for(j=0; j<nminibuf; j++){
+ mb = &mbuf[j];
+ sortminibuffer(is, mb, data, space, bufsize);
+ }
+ free(data);
+ }
+
+ sendp(isectdonechan, is);
+}
+
+
+
diff --git a/src/cmd/venti/srv/checkindex.c b/src/cmd/venti/srv/checkindex.c
index 9397d789..7639b2ca 100644
--- a/src/cmd/venti/srv/checkindex.c
+++ b/src/cmd/venti/srv/checkindex.c
@@ -109,7 +109,7 @@ checkindex(Index *ix, Part *part, u64int off, u64int clumps, int zero)
int ok, bok;
u64int found = 0;
-/*ZZZ make buffer size configurable */
+/* ZZZ make buffer size configurable */
b = alloczblock(ix->blocksize, 0, ix->blocksize);
z = alloczblock(ix->blocksize, 1, ix->blocksize);
ies = initiestream(part, off, clumps, 64*1024);
@@ -260,6 +260,8 @@ threadmain(int argc, char *argv[])
if(initventi(argv[0], &conf) < 0)
sysfatal("can't init venti: %r");
+ if(mainindex->bloom && loadbloom(mainindex->bloom) < 0)
+ sysfatal("can't load bloom filter: %r");
oldbloom = mainindex->bloom;
newbloom = nil;
if(oldbloom){
diff --git a/src/cmd/venti/srv/clump.c b/src/cmd/venti/srv/clump.c
index 88ebdb50..ec277864 100644
--- a/src/cmd/venti/srv/clump.c
+++ b/src/cmd/venti/srv/clump.c
@@ -91,7 +91,7 @@ clumpmagic(Arena *arena, u64int aa)
{
u8int buf[U32Size];
- if(readarena(arena, aa, buf, U32Size) < 0)
+ if(readarena(arena, aa, buf, U32Size) == TWID32)
return TWID32;
return unpackmagic(buf);
}
@@ -138,6 +138,11 @@ loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int veri
freezblock(cb);
return nil;
}
+ if(cl->info.type == VtCorruptType){
+ seterr(EOk, "clump is marked corrupt");
+ freezblock(cb);
+ return nil;
+ }
n -= ClumpSize;
if(n < cl->info.size){
freezblock(cb);
diff --git a/src/cmd/venti/srv/conv.c b/src/cmd/venti/srv/conv.c
index 83f51df0..58b3d25c 100644
--- a/src/cmd/venti/srv/conv.c
+++ b/src/cmd/venti/srv/conv.c
@@ -23,7 +23,7 @@ static struct {
ArenaHeadMagic, "ArenaHeadMagic",
ArenaMagic, "ArenaMagic",
ISectMagic, "ISectMagic",
- BloomMagic, "BloomMagic"
+ BloomMagic, "BloomMagic",
};
static char*
@@ -138,9 +138,6 @@ unpackarena(Arena *arena, u8int *buf)
p += U64Size;
arena->diskstats.sealed = U8GET(p);
p += U8Size;
-
- arena->memstats = arena->diskstats;
-
switch(arena->version){
case ArenaVersion4:
sz = ArenaSize4;
@@ -153,6 +150,35 @@ unpackarena(Arena *arena, u8int *buf)
seterr(ECorrupt, "arena has bad version number %d", arena->version);
return -1;
}
+ /*
+ * Additional fields for the memstats version of the stats.
+ * Diskstats reflects what is committed to the index.
+ * Memstats reflects what is in the arena. Originally intended
+ * this to be a version 5 extension, but might as well use for
+ * all the existing version 4 arenas too.
+ *
+ * To maintain backwards compatibility with existing venti
+ * installations using the older format, we define that if
+ * memstats == diskstats, then the extension fields are not
+ * included (see packarena below). That is, only partially
+ * indexed arenas have these fields. Fully indexed arenas
+ * (in particular, sealed arenas) do not.
+ */
+ if(U8GET(p) == 1){
+ sz += ArenaSize5a-ArenaSize5;
+ p += U8Size;
+ arena->memstats.clumps = U32GET(p);
+ p += U32Size;
+ arena->memstats.cclumps = U32GET(p);
+ p += U32Size;
+ arena->memstats.used = U64GET(p);
+ p += U64Size;
+ arena->memstats.uncsize = U64GET(p);
+ p += U64Size;
+ arena->memstats.sealed = U8GET(p);
+ p += U8Size;
+ }else
+ arena->memstats = arena->diskstats;
if(buf + sz != p)
sysfatal("unpackarena unpacked wrong amount");
@@ -162,6 +188,12 @@ unpackarena(Arena *arena, u8int *buf)
int
packarena(Arena *arena, u8int *buf)
{
+ return _packarena(arena, buf, 0);
+}
+
+int
+_packarena(Arena *arena, u8int *buf, int forceext)
+{
int sz;
u8int *p;
u32int t32;
@@ -207,6 +239,30 @@ packarena(Arena *arena, u8int *buf)
p += U64Size;
U8PUT(p, arena->diskstats.sealed);
p += U8Size;
+
+ /*
+ * Extension fields; see above.
+ */
+ if(forceext
+ || arena->memstats.clumps != arena->diskstats.clumps
+ || arena->memstats.cclumps != arena->diskstats.cclumps
+ || arena->memstats.used != arena->diskstats.used
+ || arena->memstats.uncsize != arena->diskstats.uncsize
+ || arena->memstats.sealed != arena->diskstats.sealed){
+ sz += ArenaSize5a - ArenaSize5;
+ U8PUT(p, 1);
+ p += U8Size;
+ U32PUT(p, arena->memstats.clumps);
+ p += U32Size;
+ U32PUT(p, arena->memstats.cclumps);
+ p += U32Size;
+ U64PUT(p, arena->memstats.used, t32);
+ p += U64Size;
+ U64PUT(p, arena->memstats.uncsize, t32);
+ p += U64Size;
+ U8PUT(p, arena->memstats.sealed);
+ p += U8Size;
+ }
if(buf + sz != p)
sysfatal("packarena packed wrong amount");
@@ -525,6 +581,8 @@ unpackientry(IEntry *ie, u8int *buf)
p += U32Size;
ie->train = U16GET(p);
p += U16Size;
+ if(p - buf != IEntryAddrOff)
+ sysfatal("unpackentry bad IEntryAddrOff amount");
ie->ia.addr = U64GET(p);
if(ie->ia.addr>>56) print("%.8H => %llux\n", p, ie->ia.addr);
p += U64Size;
diff --git a/src/cmd/venti/srv/dat.h b/src/cmd/venti/srv/dat.h
index 5101ff88..4801204f 100644
--- a/src/cmd/venti/srv/dat.h
+++ b/src/cmd/venti/srv/dat.h
@@ -75,23 +75,17 @@ enum
/*
* magic numbers on disk
*/
-/* _ClumpMagic = 0xd15cb10cU, / * clump header, deprecated */
-#define _ClumpMagic 0xd15cb10cU
+ _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */
ClumpFreeMagic = 0, /* free clump; terminates active clump log */
-/* ArenaPartMagic = 0xa9e4a5e7U, / * arena partition header */
-/* ArenaMagic = 0xf2a14eadU, / * arena trailer */
-/* ArenaHeadMagic = 0xd15c4eadU, / * arena header */
-#define ArenaPartMagic 0xa9e4a5e7U
-#define ArenaMagic 0xf2a14eadU
-#define ArenaHeadMagic 0xd15c4eadU
-
-/* BloomMagic = 0xb1004eadU, / * bloom filter header */
-#define BloomMagic 0xb1004eadU
+ ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */
+ ArenaMagic = 0xf2a14eadU, /* arena trailer */
+ ArenaHeadMagic = 0xd15c4eadU, /* arena header */
+
+ BloomMagic = 0xb1004eadU, /* bloom filter header */
BloomMaxHash = 32,
-/* ISectMagic = 0xd15c5ec7U, / * index header */
-#define ISectMagic 0xd15c5ec7U
+ ISectMagic = 0xd15c5ec7U, /* index header */
ArenaPartVersion = 3,
ArenaVersion4 = 4,
@@ -120,6 +114,7 @@ enum
ArenaPartSize = 4 * U32Size,
ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
ArenaSize5 = ArenaSize4 + U32Size,
+ ArenaSize5a = ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size,
ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize,
ArenaHeadSize5 = ArenaHeadSize4 + U32Size,
BloomHeadSize = 4 * U32Size,
@@ -137,10 +132,14 @@ enum
*/
IBucketSize = U32Size + U16Size,
IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
- IEntryTypeOff = VtScoreSize + U64Size + U32Size + 2 * U16Size,
+ IEntryTypeOff = VtScoreSize + U32Size + U16Size + U64Size + U16Size,
+ IEntryAddrOff = VtScoreSize + U32Size + U16Size,
MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
+
+ IcacheFrac = 1000000, /* denominator */
+ SleepForever = 1000000000, /* magic value for sleep time */
/*
* dirty flags - order controls disk write order
*/
@@ -356,13 +355,11 @@ struct Arena
int blocksize; /* size of block to read or write */
u64int base; /* base address on disk */
u64int size; /* total space in the arena */
- u64int limit; /* storage limit for clumps */
u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */
int clumpmax; /* ClumpInfos per block */
AState mem;
int inqueue;
- DigestState sha1;
/*
* fields stored on disk
@@ -477,6 +474,8 @@ struct ISect
u32int tabsize; /* max. bytes in index config */
Channel *writechan;
Channel *writedonechan;
+ void *ig; /* used by buildindex only */
+ int ng;
/*
* fields stored on disk
@@ -716,7 +715,18 @@ extern int writestodevnull; /* dangerous - for performance debugging */
extern int collectstats;
extern QLock memdrawlock;
extern int icachesleeptime;
+extern int minicachesleeptime;
extern int arenasumsleeptime;
+extern int manualscheduling;
+extern int l0quantum;
+extern int l1quantum;
+extern int ignorebloom;
+extern int icacheprefetch;
+extern int syncwrites;
+
+extern Stats *stathist;
+extern int nstathist;
+extern ulong stattime;
#ifndef PLAN9PORT
#pragma varargck type "V" uchar*
diff --git a/src/cmd/venti/srv/dcache.c b/src/cmd/venti/srv/dcache.c
index f5cc8e64..4d6d0865 100644
--- a/src/cmd/venti/srv/dcache.c
+++ b/src/cmd/venti/srv/dcache.c
@@ -34,7 +34,7 @@ enum
{
HashLog = 9,
HashSize = 1<<HashLog,
- HashMask = HashSize - 1
+ HashMask = HashSize - 1,
};
struct DCache
@@ -212,8 +212,6 @@ return;
lastmiss.part = part;
lastmiss.addr = addr;
}
-
-/* fprint(2, "%s %llx %s\n", part->name, addr, miss ? "miss" : "hit"); */
}
int
@@ -230,6 +228,7 @@ rareadpart(Part *part, u64int addr, u8int *buf, uint n, int load)
}
if(load != 2 || addr >= part->size){ /* addr >= part->size: let readpart do the error */
runlock(&ralock);
+ diskaccess(0);
return readpart(part, addr, buf, n);
}
@@ -239,6 +238,7 @@ fprint(2, "raread %s %llx\n", part->name, addr);
nn = dcache.ramax;
if(addr+nn > part->size)
nn = part->size - addr;
+ diskaccess(0);
if(readpart(part, addr, dcache.rabuf, nn) < 0){
wunlock(&ralock);
return -1;
@@ -297,7 +297,6 @@ _getdblock(Part *part, u64int addr, int mode, int load)
/*
* look for the block in the cache
*/
-/*checkdcache(); */
qlock(&dcache.lock);
again:
for(b = dcache.heads[h]; b != nil; b = b->next){
@@ -367,7 +366,6 @@ found:
fixheap(b->heap, b);
qunlock(&dcache.lock);
-/*checkdcache(); */
trace(TraceBlock, "getdblock lock");
addstat(StatDblockStall, 1);
@@ -427,7 +425,6 @@ putdblock(DBlock *b)
else
wunlock(&b->lock);
-/*checkdcache(); */
qlock(&dcache.lock);
if(--b->ref == 0 && !b->dirty){
if(b->heap == TWID32)
@@ -435,7 +432,6 @@ putdblock(DBlock *b)
rwakeupall(&dcache.full);
}
qunlock(&dcache.lock);
-/*checkdcache(); */
}
void
@@ -474,6 +470,25 @@ dirtydblock(DBlock *b, int dirty)
qunlock(&dcache.lock);
}
+static void
+unchain(DBlock *b)
+{
+ ulong h;
+
+ /*
+ * unchain the block
+ */
+ if(b->prev == nil){
+ h = pbhash(b->addr);
+ if(dcache.heads[h] != b)
+ sysfatal("bad hash chains in disk cache");
+ dcache.heads[h] = b->next;
+ }else
+ b->prev->next = b->next;
+ if(b->next != nil)
+ b->next->prev = b->prev;
+}
+
/*
* remove some block from use and update the free list and counters
*/
@@ -481,7 +496,6 @@ static DBlock*
bumpdblock(void)
{
DBlock *b;
- ulong h;
trace(TraceBlock, "bumpdblock enter");
b = dcache.free;
@@ -512,22 +526,28 @@ bumpdblock(void)
trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
- /*
- * unchain the block
- */
- if(b->prev == nil){
- h = pbhash(b->addr);
- if(dcache.heads[h] != b)
- sysfatal("bad hash chains in disk cache");
- dcache.heads[h] = b->next;
- }else
- b->prev->next = b->next;
- if(b->next != nil)
- b->next->prev = b->prev;
-
+ unchain(b);
return b;
}
+void
+emptydcache(void)
+{
+ DBlock *b;
+
+ qlock(&dcache.lock);
+ while(dcache.nheap > 0){
+ b = dcache.heap[0];
+ delheap(b);
+ if(!b->ref && !b->dirty){
+ unchain(b);
+ b->next = dcache.free;
+ dcache.free = b;
+ }
+ }
+ qunlock(&dcache.lock);
+}
+
/*
* delete an arbitrary block from the heap
*/
@@ -683,6 +703,7 @@ static int
parallelwrites(DBlock **b, DBlock **eb, int dirty)
{
DBlock **p, **q;
+
for(p=b; p<eb && (*p)->dirty == dirty; p++){
assert(b<=p && p<eb);
sendp((*p)->part->writechan, *p);
@@ -803,6 +824,7 @@ writeproc(void *v)
trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
wlock(&b->lock);
trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
+ diskaccess(0);
if(writepart(p, b->addr, b->data, b->size) < 0)
fprint(2, "write error: %r\n"); /* XXX details! */
addstat(StatApartWrite, 1);
diff --git a/src/cmd/venti/srv/disksched.c b/src/cmd/venti/srv/disksched.c
new file mode 100644
index 00000000..687616e1
--- /dev/null
+++ b/src/cmd/venti/srv/disksched.c
@@ -0,0 +1,88 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+ulong lasttime[2];
+int manualscheduling;
+int l0quantum = 120;
+int l1quantum = 120;
+ulong lasticachechange;
+
+void
+disksched(void)
+{
+ int p, nwrite, nflush, ndirty, tdirty, toflush;
+ ulong t;
+ vlong cflush;
+ Stats *prev;
+
+ /*
+ * no locks because all the data accesses are atomic.
+ */
+ t = time(0);
+ if(manualscheduling){
+ lasticachechange = t;
+ return;
+ }
+
+ if(t-lasttime[0] < l0quantum){
+ /* level-0 disk access going on */
+ p = icachedirtyfrac();
+ if(p < IcacheFrac*5/10){ /* can wait */
+ icachesleeptime = SleepForever;
+ lasticachechange = t;
+ }else if(p > IcacheFrac*9/10){ /* can't wait */
+ icachesleeptime = 0;
+ lasticachechange = t;
+ }else if(t-lasticachechange > 60){
+ /* have minute worth of data for current rate */
+ prev = &stathist[(stattime-60+nstathist)%nstathist];
+
+ /* # entries written to index cache */
+ nwrite = stats.n[StatIcacheWrite] - prev->n[StatIcacheWrite];
+
+ /* # dirty entries in index cache */
+ ndirty = stats.n[StatIcacheDirty] - prev->n[StatIcacheDirty];
+
+ /* # entries flushed to disk */
+ nflush = nwrite - ndirty;
+
+ /* want to stay around 70% dirty */
+ tdirty = (vlong)stats.n[StatIcacheSize]*700/1000;
+
+ /* assume nflush*icachesleeptime is a constant */
+ cflush = (vlong)nflush*(icachesleeptime+1);
+
+ /* computer number entries to write in next minute */
+ toflush = nwrite + (stats.n[StatIcacheDirty] - tdirty);
+
+ /* schedule for that many */
+ if(toflush <= 0 || cflush/toflush > 100000)
+ icachesleeptime = SleepForever;
+ else
+ icachesleeptime = cflush/toflush;
+ }
+ arenasumsleeptime = SleepForever;
+ return;
+ }
+ if(t-lasttime[1] < l1quantum){
+ /* level-1 disk access (icache flush) going on */
+ icachesleeptime = 0;
+ arenasumsleeptime = SleepForever;
+ return;
+ }
+ /* no disk access going on - no holds barred*/
+ icachesleeptime = 0;
+ arenasumsleeptime = 0;
+}
+
+void
+diskaccess(int level)
+{
+ if(level < 0 || level >= nelem(lasttime)){
+ fprint(2, "bad level in diskaccess; caller=%lux\n", getcallerpc(&level));
+ return;
+ }
+ lasttime[level] = time(0);
+}
+
diff --git a/src/cmd/venti/srv/findscore.c b/src/cmd/venti/srv/findscore.c
index 6681503d..226d97ae 100644
--- a/src/cmd/venti/srv/findscore.c
+++ b/src/cmd/venti/srv/findscore.c
@@ -27,7 +27,7 @@ findscore(Arena *arena, uchar *score)
u32int clump;
int i, n, found;
-/*ZZZ remove fprint? */
+//ZZZ remove fprint?
if(arena->memstats.clumps)
fprint(2, "reading directory for arena=%s with %d entries\n", arena->name, arena->memstats.clumps);
diff --git a/src/cmd/venti/srv/fixarenas.c b/src/cmd/venti/srv/fixarenas.c
new file mode 100644
index 00000000..95515942
--- /dev/null
+++ b/src/cmd/venti/srv/fixarenas.c
@@ -0,0 +1,1894 @@
+/*
+ * Check and fix an arena partition.
+ *
+ * This is a lot grittier than the rest of Venti because
+ * it can't just give up if a byte here or there is wrong.
+ *
+ * The rule here (hopefully followed!) is that block corruption
+ * only ever has a local effect -- there are no blocks that you
+ * can wipe out that will cause large portions of
+ * uncorrupted data blocks to be useless.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "whack.h"
+
+#pragma varargck type "z" uvlong
+#pragma varargck type "z" vlong
+#pragma varargck type "t" uint
+
+enum
+{
+ K = 1024,
+ M = 1024*1024,
+ G = 1024*1024*1024,
+
+ Block = 4096,
+};
+
+int debugsha1;
+
+int verbose;
+Part *part;
+char *file;
+char *basename;
+char *dumpbase;
+int fix;
+int badreads;
+int unseal;
+uchar zero[MaxDiskBlock];
+
+Arena lastarena;
+ArenaPart ap;
+uvlong arenasize;
+int nbadread;
+int nbad;
+uvlong partend;
+void checkarena(vlong, int);
+
+void
+usage(void)
+{
+ fprint(2, "usage: fixarenas [-fv] [-a arenasize] [-b blocksize] file [ranges]\n");
+ threadexitsall(0);
+}
+
+/*
+ * Format number in simplest way that is okay with unittoull.
+ */
+static int
+zfmt(Fmt *fmt)
+{
+ vlong x;
+
+ x = va_arg(fmt->args, vlong);
+ if(x == 0)
+ return fmtstrcpy(fmt, "0");
+ if(x%G == 0)
+ return fmtprint(fmt, "%lldG", x/G);
+ if(x%M == 0)
+ return fmtprint(fmt, "%lldM", x/M);
+ if(x%K == 0)
+ return fmtprint(fmt, "%lldK", x/K);
+ return fmtprint(fmt, "%lld", x);
+}
+
+/*
+ * Format time like ctime without newline.
+ */
+static int
+tfmt(Fmt *fmt)
+{
+ uint t;
+ char buf[30];
+
+ t = va_arg(fmt->args, uint);
+ strcpy(buf, ctime(t));
+ buf[28] = 0;
+ return fmtstrcpy(fmt, buf);
+}
+
+/*
+ * Coalesce messages about unreadable sectors into larger ranges.
+ * bad(0, 0) flushes the buffer.
+ */
+static void
+bad(char *msg, vlong o, int len)
+{
+ static vlong lb0, lb1;
+ static char *lmsg;
+
+ if(msg == nil)
+ msg = lmsg;
+ if(o == -1){
+ lmsg = nil;
+ lb0 = 0;
+ lb1 = 0;
+ return;
+ }
+ if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){
+ if(lb0 != lb1)
+ print("%s %#llux+%#llux (%,lld+%,lld)\n",
+ lmsg, lb0, lb1-lb0, lb0, lb1-lb0);
+ lb0 = o;
+ }
+ lmsg = msg;
+ lb1 = o+len;
+}
+
+/*
+ * Read in the len bytes of data at the offset. If can't for whatever reason,
+ * fill it with garbage but print an error.
+ */
+static uchar*
+readdisk(uchar *buf, vlong offset, int len)
+{
+ int i, j, k, n;
+
+ if(offset >= partend){
+ memset(buf, 0xFB, sizeof buf);
+ return buf;
+ }
+
+ if(offset+len > partend){
+ memset(buf, 0xFB, sizeof buf);
+ len = partend - offset;
+ }
+
+ if(readpart(part, offset, buf, len) >= 0)
+ return buf;
+
+ /*
+ * The read failed. Clear the buffer to nonsense, and
+ * then try reading in smaller pieces. If that fails,
+ * read in even smaller pieces. And so on down to sectors.
+ */
+ memset(buf, 0xFD, len);
+ for(i=0; i<len; i+=64*K){
+ n = 64*K;
+ if(i+n > len)
+ n = len-i;
+ if(readpart(part, offset+i, buf+i, n) >= 0)
+ continue;
+ for(j=i; j<len && j<i+64*K; j+=4*K){
+ n = 4*K;
+ if(j+n > len)
+ n = len-j;
+ if(readpart(part, offset+j, buf+j, n) >= 0)
+ continue;
+ for(k=j; k<len && k<j+4*K; k+=512){
+ if(readpart(part, offset+k, buf+k, 512) >= 0)
+ continue;
+ bad("disk read failed at", k, 512);
+ badreads++;
+ }
+ }
+ }
+ bad(nil, 0, 0);
+ return buf;
+}
+
+/*
+ * Buffer to support running SHA1 hash of the disk.
+ */
+typedef struct Shabuf Shabuf;
+struct Shabuf
+{
+ int fd;
+ vlong offset;
+ DigestState state;
+ int rollback;
+ vlong r0;
+ DigestState *hist;
+ int nhist;
+};
+
+void
+sbdebug(Shabuf *sb, char *file)
+{
+ int fd;
+
+ if(sb->fd > 0){
+ close(sb->fd);
+ sb->fd = 0;
+ }
+ if((fd = create(file, OWRITE, 0666)) < 0)
+ return;
+ if(fd == 0){
+ fd = dup(fd, -1);
+ close(0);
+ }
+ sb->fd = fd;
+}
+
+void
+sbupdate(Shabuf *sb, uchar *p, vlong offset, int len)
+{
+ int n, x;
+ vlong o;
+
+ if(sb->rollback && !sb->hist){
+ sb->r0 = offset;
+ sb->nhist = 1;
+ sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist);
+ memset(sb->hist, 0, sizeof sb->hist[0]);
+ }
+ if(sb->r0 == 0)
+ sb->r0 = offset;
+
+ if(sb->offset < offset || sb->offset >= offset+len){
+ if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n",
+ p, offset, len, sb->offset);
+ return;
+ }
+ x = sb->offset - offset;
+ if(0) print("sbupdate %p %#llux+%d skip %d\n",
+ sb, offset, len, x);
+ if(x){
+ p += x;
+ offset += x;
+ len -= x;
+ }
+ assert(sb->offset == offset);
+
+ if(sb->fd > 0)
+ pwrite(sb->fd, p, len, offset - sb->r0);
+
+ if(!sb->rollback){
+ sha1(p, len, nil, &sb->state);
+ sb->offset += len;
+ return;
+ }
+
+ /* save state every 4M so we can roll back quickly */
+ o = offset - sb->r0;
+ while(len > 0){
+ n = 4*M - o%(4*M);
+ if(n > len)
+ n = len;
+ sha1(p, n, nil, &sb->state);
+ sb->offset += n;
+ o += n;
+ p += n;
+ len -= n;
+ if(o%(4*M) == 0){
+ x = o/(4*M);
+ if(x >= sb->nhist){
+ if(x != sb->nhist)
+ print("oops! x=%d nhist=%d\n", x, sb->nhist);
+ sb->nhist += 32;
+ sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist);
+ }
+ sb->hist[x] = sb->state;
+ }
+ }
+}
+
+void
+sbdiskhash(Shabuf *sb, vlong eoffset)
+{
+ static uchar dbuf[4*M];
+ int n;
+
+ while(sb->offset < eoffset){
+ n = sizeof dbuf;
+ if(sb->offset+n > eoffset)
+ n = eoffset - sb->offset;
+ readdisk(dbuf, sb->offset, n);
+ sbupdate(sb, dbuf, sb->offset, n);
+ }
+}
+
+void
+sbrollback(Shabuf *sb, vlong offset)
+{
+ int x;
+ vlong o;
+ Dir d;
+
+ if(!sb->rollback || !sb->r0){
+ print("cannot rollback sha\n");
+ return;
+ }
+ if(offset >= sb->offset)
+ return;
+ o = offset - sb->r0;
+ x = o/(4*M);
+ if(x >= sb->nhist){
+ print("cannot rollback sha\n");
+ return;
+ }
+ sb->state = sb->hist[x];
+ sb->offset = sb->r0 + x*4*M;
+ assert(sb->offset <= offset);
+
+ if(sb->fd > 0){
+ nulldir(&d);
+ d.length = sb->offset - sb->r0;
+ dirfwstat(sb->fd, &d);
+ }
+}
+
+void
+sbscore(Shabuf *sb, uchar *score)
+{
+ if(sb->hist){
+ free(sb->hist);
+ sb->hist = nil;
+ }
+ sha1(nil, 0, score, &sb->state);
+}
+
+/*
+ * If we're fixing arenas, then editing this memory edits the disk!
+ * It will be written back out as new data is paged in.
+ */
+uchar buf[4*M];
+uchar sbuf[4*M];
+vlong bufoffset;
+int buflen;
+
+static void pageout(void);
+static uchar*
+pagein(vlong offset, int len)
+{
+ pageout();
+ if(offset >= partend){
+ memset(buf, 0xFB, sizeof buf);
+ return buf;
+ }
+
+ if(offset+len > partend){
+ memset(buf, 0xFB, sizeof buf);
+ len = partend - offset;
+ }
+ bufoffset = offset;
+ buflen = len;
+ readdisk(buf, offset, len);
+ memmove(sbuf, buf, len);
+ return buf;
+}
+
+static void
+pageout(void)
+{
+ if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){
+ buflen = 0;
+ return;
+ }
+ if(writepart(part, bufoffset, buf, buflen) < 0)
+ print("disk write failed at %#llux+%#ux (%,lld+%,d)\n",
+ bufoffset, buflen, bufoffset, buflen);
+ buflen = 0;
+}
+
+static void
+zerorange(vlong offset, int len)
+{
+ int i;
+ vlong ooff;
+ int olen;
+ enum { MinBlock = 4*K, MaxBlock = 8*K };
+
+ if(0)
+ if(bufoffset <= offset && offset+len <= bufoffset+buflen){
+ memset(buf+(offset-bufoffset), 0, len);
+ return;
+ }
+
+ ooff = bufoffset;
+ olen = buflen;
+
+ i = offset%MinBlock;
+ if(i+len < MaxBlock){
+ pagein(offset-i, (len+MinBlock-1)&~(MinBlock-1));
+ memset(buf+i, 0, len);
+ }else{
+ pagein(offset-i, MaxBlock);
+ memset(buf+i, 0, MaxBlock-i);
+ offset += MaxBlock-i;
+ len -= MaxBlock-i;
+ while(len >= MaxBlock){
+ pagein(offset, MaxBlock);
+ memset(buf, 0, MaxBlock);
+ offset += MaxBlock;
+ len -= MaxBlock;
+ }
+ pagein(offset, (len+MinBlock-1)&~(MinBlock-1));
+ memset(buf, 0, len);
+ }
+ pagein(ooff, olen);
+}
+
+/*
+ * read/write integers
+ *
+static void
+p16(uchar *p, u16int u)
+{
+ p[0] = (u>>8) & 0xFF;
+ p[1] = u & 0xFF;
+}
+*/
+
+static u16int
+u16(uchar *p)
+{
+ return (p[0]<<8)|p[1];
+}
+
+static void
+p32(uchar *p, u32int u)
+{
+ p[0] = (u>>24) & 0xFF;
+ p[1] = (u>>16) & 0xFF;
+ p[2] = (u>>8) & 0xFF;
+ p[3] = u & 0xFF;
+}
+
+static u32int
+u32(uchar *p)
+{
+ return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3];
+}
+
+/*
+static void
+p64(uchar *p, u64int u)
+{
+ p32(p, u>>32);
+ p32(p, u);
+}
+*/
+
+static u64int
+u64(uchar *p)
+{
+ return ((u64int)u32(p)<<32) | u32(p+4);
+}
+
+static int
+vlongcmp(const void *va, const void *vb)
+{
+ vlong a, b;
+
+ a = *(vlong*)va;
+ b = *(vlong*)vb;
+ if(a < b)
+ return -1;
+ if(b > a)
+ return 1;
+ return 0;
+}
+
+/* D and S are in draw.h */
+#define D VD
+#define S VS
+
+enum
+{
+ D = 0x10000,
+ Z = 0x20000,
+ S = 0x30000,
+ T = 0x40000,
+ N = 0xFFFF
+};
+typedef struct Info Info;
+struct Info
+{
+ int len;
+ char *name;
+};
+
+Info partinfo[] = {
+ 4, "magic",
+ D|4, "version",
+ Z|4, "blocksize",
+ 4, "arenabase",
+ 0
+};
+
+Info headinfo4[] = {
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ Z|4, "blocksize",
+ Z|8, "size",
+ 0
+};
+
+Info headinfo5[] = {
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ Z|4, "blocksize",
+ Z|8, "size",
+ 4, "clumpmagic",
+ 0
+};
+
+Info tailinfo4[] = {
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ D|4, "clumps",
+ D|4, "cclumps",
+ T|4, "ctime",
+ T|4, "wtime",
+ D|8, "used",
+ D|8, "uncsize",
+ 1, "sealed",
+ 0
+};
+
+Info tailinfo4a[] = {
+ /* tailinfo 4 */
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ D|4, "clumps",
+ D|4, "cclumps",
+ T|4, "ctime",
+ T|4, "wtime",
+ D|8, "used",
+ D|8, "uncsize",
+ 1, "sealed",
+
+ /* mem stats */
+ 1, "extension",
+ D|4, "mem.clumps",
+ D|4, "mem.cclumps",
+ D|8, "mem.used",
+ D|8, "mem.uncsize",
+ 1, "mem.sealed",
+ 0
+};
+
+Info tailinfo5[] = {
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ D|4, "clumps",
+ D|4, "cclumps",
+ T|4, "ctime",
+ T|4, "wtime",
+ 4, "clumpmagic",
+ D|8, "used",
+ D|8, "uncsize",
+ 1, "sealed",
+ 0
+};
+
+Info tailinfo5a[] = {
+ /* tailinfo 5 */
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ D|4, "clumps",
+ D|4, "cclumps",
+ T|4, "ctime",
+ T|4, "wtime",
+ 4, "clumpmagic",
+ D|8, "used",
+ D|8, "uncsize",
+ 1, "sealed",
+
+ /* mem stats */
+ 1, "extension",
+ D|4, "mem.clumps",
+ D|4, "mem.cclumps",
+ D|8, "mem.used",
+ D|8, "mem.uncsize",
+ 1, "mem.sealed",
+ 0
+};
+
+void
+showdiffs(uchar *want, uchar *have, int len, Info *info)
+{
+ int n;
+
+ while(len > 0 && (n=info->len&N) > 0){
+ if(memcmp(have, want, n) != 0){
+ switch(info->len){
+ case 1:
+ print("\t%s: correct=%d disk=%d\n",
+ info->name, *want, *have);
+ break;
+ case 4:
+ print("\t%s: correct=%#ux disk=%#ux\n",
+ info->name, u32(want), u32(have));
+ break;
+ case D|4:
+ print("\t%s: correct=%,ud disk=%,ud\n",
+ info->name, u32(want), u32(have));
+ break;
+ case T|4:
+ print("\t%s: correct=%t\n\t\tdisk=%t\n",
+ info->name, u32(want), u32(have));
+ break;
+ case Z|4:
+ print("\t%s: correct=%z disk=%z\n",
+ info->name, (uvlong)u32(want), (uvlong)u32(have));
+ break;
+ case D|8:
+ print("\t%s: correct=%,lld disk=%,lld\n",
+ info->name, u64(want), u64(have));
+ break;
+ case Z|8:
+ print("\t%s: correct=%z disk=%z\n",
+ info->name, u64(want), u64(have));
+ break;
+ case S|ANameSize:
+ print("\t%s: correct=%s disk=%.*s\n",
+ info->name, (char*)want,
+ utfnlen((char*)have, ANameSize-1),
+ (char*)have);
+ break;
+ default:
+ print("\t%s: correct=%.*H disk=%.*H\n",
+ info->name, n, want, n, have);
+ break;
+ }
+ }
+ have += n;
+ want += n;
+ len -= n;
+ info++;
+ }
+ if(len > 0 && memcmp(have, want, len) != 0){
+ if(memcmp(want, zero, len) != 0)
+ print("!!\textra want data in showdiffs (bug in fixarenas)\n");
+ else
+ print("\tnon-zero data on disk after structure\n");
+ if(verbose > 1){
+ print("want: %.*H\n", len, want);
+ print("have: %.*H\n", len, have);
+ }
+ }
+}
+
+static int tabsizes[] = { 64*1024, 512*1024, };
+/*
+ * Poke around on the disk to guess what the ArenaPart numbers are.
+ */
+void
+guessgeometry(void)
+{
+ int i, j, n, bestn, ndiff, nhead, ntail;
+ uchar *p, *ep, *sp;
+ u64int diff[100], head[20], tail[20];
+ u64int offset, bestdiff;
+
+ ap.version = ArenaPartVersion;
+
+ if(arenasize == 0 || ap.blocksize == 0){
+ /*
+ * The ArenaPart block at offset PartBlank may be corrupt or just wrong.
+ * Instead, look for the individual arena headers and tails, which there
+ * are many of, and once we've seen enough, infer the spacing.
+ *
+ * Of course, nothing in the file format requires that arenas be evenly
+ * spaced, but fmtarenas always does that for us.
+ */
+ nhead = 0;
+ ntail = 0;
+ for(offset=PartBlank; offset<partend; offset+=4*M){
+ p = pagein(offset, 4*M);
+ for(sp=p, ep=p+4*M; p<ep; p+=K){
+ if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){
+ if(verbose)
+ print("arena head at %#llx\n", offset+(p-sp));
+ head[nhead++] = offset+(p-sp);
+ }
+ if(u32(p) == ArenaMagic && ntail < nelem(tail)){
+ tail[ntail++] = offset+(p-sp);
+ if(verbose)
+ print("arena tail at %#llx\n", offset+(p-sp));
+ }
+ }
+ if(nhead == nelem(head) && ntail == nelem(tail))
+ break;
+ }
+ if(nhead < 3 && ntail < 3)
+ sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail);
+
+ /*
+ * Arena size is likely the most common
+ * inter-head or inter-tail spacing.
+ */
+ ndiff = 0;
+ for(i=1; i<nhead; i++)
+ diff[ndiff++] = head[i] - head[i-1];
+ for(i=1; i<ntail; i++)
+ diff[ndiff++] = tail[i] - tail[i-1];
+ qsort(diff, ndiff, sizeof diff[0], vlongcmp);
+ bestn = 0;
+ bestdiff = 0;
+ for(i=1, n=1; i<=ndiff; i++, n++){
+ if(i==ndiff || diff[i] != diff[i-1]){
+ if(n > bestn){
+ bestn = n;
+ bestdiff = diff[i-1];
+ }
+ n = 0;
+ }
+ }
+ print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
+ if(arenasize != 0 && arenasize != bestdiff)
+ print("using user-specified size %z instead\n", arenasize);
+ else
+ arenasize = bestdiff;
+
+ /*
+ * The arena tail for an arena is arenasize-blocksize from the head.
+ */
+ ndiff = 0;
+ for(i=j=0; i<nhead && j<ntail; ){
+ if(tail[j] < head[i]){
+ j++;
+ continue;
+ }
+ if(tail[j] < head[i]+arenasize){
+ diff[ndiff++] = head[i]+arenasize - tail[j];
+ j++;
+ continue;
+ }
+ i++;
+ }
+ if(ndiff < 3)
+ sysfatal("too few intact arenas: %d head, tail pairs", ndiff);
+ qsort(diff, ndiff, sizeof diff[0], vlongcmp);
+ bestn = 0;
+ bestdiff = 0;
+ for(i=1, n=1; i<=ndiff; i++, n++){
+ if(i==ndiff || diff[i] != diff[i-1]){
+ if(n > bestn){
+ bestn = n;
+ bestdiff = diff[i-1];
+ }
+ n = 0;
+ }
+ }
+ print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
+ if(ap.blocksize != 0 && ap.blocksize != bestdiff)
+ print("using user-specified size %z instead\n", (vlong)ap.blocksize);
+ else
+ ap.blocksize = bestdiff;
+ if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1))
+ sysfatal("block size not a power of two");
+ if(ap.blocksize > MaxDiskBlock)
+ sysfatal("block size too big (max=%d)", MaxDiskBlock);
+
+ /*
+ * Use head/tail information to deduce arena base.
+ */
+ ndiff = 0;
+ for(i=0; i<nhead; i++)
+ diff[ndiff++] = head[i]%arenasize;
+ for(i=0; i<ntail; i++)
+ diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize;
+ qsort(diff, ndiff, sizeof diff[0], vlongcmp);
+ bestn = 0;
+ bestdiff = 0;
+ for(i=1, n=1; i<=ndiff; i++, n++){
+ if(i==ndiff || diff[i] != diff[i-1]){
+ if(n > bestn){
+ bestn = n;
+ bestdiff = diff[i-1];
+ }
+ n = 0;
+ }
+ }
+ ap.arenabase = bestdiff;
+ }
+
+ ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1);
+ /*
+ * XXX pick up table, check arenabase.
+ * XXX pick up table, record base name.
+ */
+
+ /*
+ * Somewhat standard computation.
+ * Fmtarenas used to use 64k tab, now uses 512k tab.
+ */
+ if(ap.arenabase == 0){
+ for(i=0; i<nelem(tabsizes); i++){
+ ap.arenabase = (PartBlank+HeadSize+tabsizes[i]+ap.blocksize-1)&~(ap.blocksize-1);
+ p = pagein(ap.arenabase, Block);
+ if(u32(p) == ArenaHeadMagic)
+ break;
+ }
+ }
+ p = pagein(ap.arenabase, Block);
+ print("arena base likely %z%s\n", (vlong)ap.arenabase,
+ u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : "");
+
+ ap.tabsize = ap.arenabase - ap.tabbase;
+
+}
+
+/*
+ * Check the arena partition blocks and then the arenas listed in range.
+ */
+void
+checkarenas(char *range)
+{
+ char *s, *t;
+ int i, lo, hi, narena;
+ uchar dbuf[HeadSize];
+ uchar *p;
+
+ guessgeometry();
+
+ partend -= partend%ap.blocksize;
+
+ memset(dbuf, 0, sizeof dbuf);
+ packarenapart(&ap, dbuf);
+ p = pagein(PartBlank, Block);
+ if(memcmp(p, dbuf, HeadSize) != 0){
+ print("on-disk arena part superblock incorrect\n");
+ showdiffs(dbuf, p, HeadSize, partinfo);
+ }
+ memmove(p, dbuf, HeadSize);
+
+ narena = (partend-ap.arenabase + arenasize-1)/arenasize;
+ if(range == nil){
+ for(i=0; i<narena; i++)
+ checkarena(ap.arenabase+(vlong)i*arenasize, i);
+ }else if(strcmp(range, "none") == 0){
+ /* nothing */
+ }else{
+ /* parse, e.g., -4,8-9,10- */
+ for(s=range; *s; s=t){
+ t = strchr(s, ',');
+ if(t)
+ *t++ = 0;
+ else
+ t = s+strlen(s);
+ if(*s == '-')
+ lo = 0;
+ else
+ lo = strtol(s, &s, 0);
+ hi = lo;
+ if(*s == '-'){
+ s++;
+ if(*s == 0)
+ hi = narena-1;
+ else
+ hi = strtol(s, &s, 0);
+ }
+ if(*s != 0){
+ print("bad arena range: %s\n", s);
+ continue;
+ }
+ for(i=lo; i<=hi; i++)
+ checkarena(ap.arenabase+(vlong)i*arenasize, i);
+ }
+ }
+}
+
+/*
+ * Is there a clump here at p?
+ */
+static int
+isclump(uchar *p, Clump *cl, u32int *pmagic)
+{
+ int n;
+ u32int magic;
+ uchar score[VtScoreSize], *bp;
+ Unwhack uw;
+ uchar ubuf[70*1024];
+
+ bp = p;
+ magic = u32(p);
+ if(magic == 0)
+ return 0;
+ p += U32Size;
+
+ cl->info.type = vtfromdisktype(*p);
+ if(cl->info.type == 0xFF)
+ return 0;
+ p++;
+ cl->info.size = u16(p);
+ p += U16Size;
+ cl->info.uncsize = u16(p);
+ if(cl->info.size > cl->info.uncsize)
+ return 0;
+ p += U16Size;
+ scorecp(cl->info.score, p);
+ p += VtScoreSize;
+ cl->encoding = *p;
+ p++;
+ cl->creator = u32(p);
+ p += U32Size;
+ cl->time = u32(p);
+ p += U32Size;
+
+ switch(cl->encoding){
+ case ClumpENone:
+ if(cl->info.size != cl->info.uncsize)
+ return 0;
+ scoremem(score, p, cl->info.size);
+ if(scorecmp(score, cl->info.score) != 0)
+ return 0;
+ break;
+ case ClumpECompress:
+ if(cl->info.size >= cl->info.uncsize)
+ return 0;
+ unwhackinit(&uw);
+ n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size);
+ if(n != cl->info.uncsize)
+ return 0;
+ scoremem(score, ubuf, cl->info.uncsize);
+ if(scorecmp(score, cl->info.score) != 0)
+ return 0;
+ break;
+ default:
+ return 0;
+ }
+ p += cl->info.size;
+
+ /* it all worked out in the end */
+ *pmagic = magic;
+ return p - bp;
+}
+
+/*
+ * All ClumpInfos seen in this arena.
+ * Kept in binary tree so we can look up by score.
+ */
+typedef struct Cit Cit;
+struct Cit
+{
+ int left;
+ int right;
+ vlong corrupt;
+ ClumpInfo ci;
+};
+Cit *cibuf;
+int ciroot;
+int ncibuf, mcibuf;
+
+void
+resetcibuf(void)
+{
+ ncibuf = 0;
+ ciroot = -1;
+}
+
+int*
+ltreewalk(int *p, uchar *score)
+{
+ int i;
+
+ for(;;){
+ if(*p == -1)
+ return p;
+ i = scorecmp(cibuf[*p].ci.score, score);
+ if(i == 0)
+ return p;
+ if(i < 0)
+ p = &cibuf[*p].right;
+ else
+ p = &cibuf[*p].left;
+ }
+ return nil; /* stupid 8c */
+}
+
+void
+addcibuf(ClumpInfo *ci, vlong corrupt)
+{
+ Cit *cit;
+
+ if(ncibuf == mcibuf){
+ mcibuf += 131072;
+ cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]);
+ }
+ cit = &cibuf[ncibuf];
+ cit->ci = *ci;
+ cit->left = -1;
+ cit->right = -1;
+ cit->corrupt = corrupt;
+ if(!corrupt)
+ *ltreewalk(&ciroot, ci->score) = ncibuf;
+ ncibuf++;
+}
+
+void
+addcicorrupt(vlong len)
+{
+ static ClumpInfo zci;
+
+ addcibuf(&zci, len);
+}
+
+int
+haveclump(uchar *score)
+{
+ int i;
+ int p;
+
+ p = ciroot;
+ for(;;){
+ if(p == -1)
+ return 0;
+ i = scorecmp(cibuf[p].ci.score, score);
+ if(i == 0)
+ return 1;
+ if(i < 0)
+ p = cibuf[p].right;
+ else
+ p = cibuf[p].left;
+ }
+ return 0; /* stupid 8c */
+}
+
+int
+matchci(ClumpInfo *ci, uchar *p)
+{
+ if(ci->type != vtfromdisktype(p[0]))
+ return 0;
+ if(ci->size != u16(p+1))
+ return 0;
+ if(ci->uncsize != u16(p+3))
+ return 0;
+ if(scorecmp(ci->score, p+5) != 0)
+ return 0;
+ return 1;
+}
+
+int
+sealedarena(uchar *p, int blocksize)
+{
+ int v, n;
+
+ v = u32(p+4);
+ switch(v){
+ default:
+ return 0;
+ case ArenaVersion4:
+ n = ArenaSize4;
+ break;
+ case ArenaVersion5:
+ n = ArenaSize5;
+ break;
+ }
+ if(p[n-1] != 1){
+ print("arena tail says not sealed\n");
+ return 0;
+ }
+ if(memcmp(p+n, zero, blocksize-VtScoreSize-n) != 0){
+ print("arena tail followed by non-zero data\n");
+ return 0;
+ }
+ if(memcmp(p+blocksize-VtScoreSize, zero, VtScoreSize) == 0){
+ print("arena score zero\n");
+ return 0;
+ }
+ return 1;
+}
+
+int
+okayname(char *name, int n)
+{
+ char buf[20];
+
+ if(nameok(name) < 0)
+ return 0;
+ sprint(buf, "%d", n);
+ if(strlen(name) < strlen(buf)
+ || strcmp(name+strlen(name)-strlen(buf), buf) != 0)
+ return 0;
+ return 1;
+}
+
+int
+clumpinfocmp(ClumpInfo *a, ClumpInfo *b)
+{
+ if(a->type != b->type)
+ return a->type - b->type;
+ if(a->size != b->size)
+ return a->size - b->size;
+ if(a->uncsize != b->uncsize)
+ return a->uncsize - b->uncsize;
+ return scorecmp(a->score, b->score);
+}
+
+ClumpInfo*
+loadci(vlong offset, Arena *arena, int nci)
+{
+ int i, j, per;
+ uchar *p, *sp;
+ ClumpInfo *bci, *ci;
+
+ per = arena->blocksize/ClumpInfoSize;
+ bci = vtmalloc(nci*sizeof bci[0]);
+ ci = bci;
+ offset += arena->size - arena->blocksize;
+ p = sp = nil;
+ for(i=0; i<nci; i+=per){
+ if(p == sp){
+ sp = pagein(offset-4*M, 4*M);
+ p = sp+4*M;
+ }
+ p -= arena->blocksize;
+ offset -= arena->blocksize;
+ for(j=0; j<per && i+j<nci; j++)
+ unpackclumpinfo(ci++, p+j*ClumpInfoSize);
+ }
+ return bci;
+}
+
+vlong
+writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci)
+{
+ int i, j, per;
+ uchar *p, *sp;
+
+ per = arena->blocksize/ClumpInfoSize;
+ offset += arena->size - arena->blocksize;
+ p = sp = nil;
+ for(i=0; i<nci; i+=per){
+ if(p == sp){
+ sp = pagein(offset-4*M, 4*M);
+ p = sp+4*M;
+ }
+ p -= arena->blocksize;
+ offset -= arena->blocksize;
+ memset(p, 0, arena->blocksize);
+ for(j=0; j<per && i+j<nci; j++)
+ packclumpinfo(ci++, p+j*ClumpInfoSize);
+ }
+ pageout();
+ return offset;
+}
+
+void
+loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena)
+{
+ char dname[ANameSize];
+ static char lastbase[ANameSize];
+ uchar *p;
+ Arena oarena;
+ ArenaHead ohead;
+
+ /*
+ * Fmtarenas makes all arenas the same size
+ * except the last, which may be smaller.
+ * It uses the same block size for arenas as for
+ * the arena partition blocks.
+ */
+ arena->size = arenasize;
+ if(offset0+arena->size > partend)
+ arena->size = partend - offset0;
+ head->size = arena->size;
+
+ arena->blocksize = ap.blocksize;
+ head->blocksize = arena->blocksize;
+
+ /*
+ * Look for clump magic and name in head/tail blocks.
+ * All the other info we will reconstruct just in case.
+ */
+ p = pagein(offset0, arena->blocksize);
+ memset(&ohead, 0, sizeof ohead);
+ if(unpackarenahead(&ohead, p) >= 0){
+ head->version = ohead.version;
+ head->clumpmagic = ohead.clumpmagic;
+ if(okayname(ohead.name, anum))
+ strcpy(head->name, ohead.name);
+ }
+
+ p = pagein(offset0+arena->size-arena->blocksize,
+ arena->blocksize);
+ memset(&oarena, 0, sizeof oarena);
+ if(unpackarena(&oarena, p) >= 0){
+ arena->version = oarena.version;
+ arena->clumpmagic = oarena.clumpmagic;
+ if(okayname(oarena.name, anum))
+ strcpy(arena->name, oarena.name);
+ arena->diskstats.clumps = oarena.diskstats.clumps;
+print("old arena: sealed=%d\n", oarena.diskstats.sealed);
+ arena->diskstats.sealed = oarena.diskstats.sealed;
+ }
+
+ /* Head trumps arena. */
+ if(head->version){
+ arena->version = head->version;
+ arena->clumpmagic = head->clumpmagic;
+ }
+ if(arena->version == 0)
+ arena->version = ArenaVersion5;
+ if(basename)
+ snprint(arena->name, ANameSize, "%s%d", basename, anum);
+ else if(lastbase[0])
+ snprint(arena->name, ANameSize, "%s%d", lastbase, anum);
+ else if(head->name[0])
+ strcpy(arena->name, head->name);
+ else if(arena->name[0] == 0)
+ sysfatal("cannot determine base name for arena; use -n");
+ strcpy(lastbase, arena->name);
+ sprint(dname, "%d", anum);
+ lastbase[strlen(lastbase)-strlen(dname)] = 0;
+
+ /* Was working in arena, now copy to head. */
+ head->version = arena->version;
+ memmove(head->name, arena->name, sizeof head->name);
+ head->blocksize = arena->blocksize;
+ head->size = arena->size;
+}
+
+void
+shahead(Shabuf *sb, vlong offset0, ArenaHead *head)
+{
+ uchar headbuf[MaxDiskBlock];
+
+ sb->offset = offset0;
+ memset(headbuf, 0, sizeof headbuf);
+ packarenahead(head, headbuf);
+ sbupdate(sb, headbuf, offset0, head->blocksize);
+}
+
+u32int
+newclumpmagic(int version)
+{
+ u32int m;
+
+ if(version == ArenaVersion4)
+ return _ClumpMagic;
+ do{
+ m = fastrand();
+ }while(m==0 || m == _ClumpMagic);
+ return m;
+}
+
+/*
+ * Poke around in the arena to find the clump data
+ * and compute the relevant statistics.
+ */
+void
+guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena,
+ uchar *oldscore, uchar *score)
+{
+ uchar dbuf[MaxDiskBlock];
+ int needtozero, clumps, nb1, nb2, minclumps;
+ int inbad, n, ncib, printed, sealing, smart;
+ u32int magic;
+ uchar *sp, *ep, *p;
+ vlong boffset, eoffset, lastclumpend, leaked;
+ vlong offset, toffset, totalcorrupt, v;
+ Clump cl;
+ ClumpInfo *bci, *ci, *eci, *xci;
+ Cit *bcit, *cit, *ecit;
+ Shabuf oldsha, newsha;
+
+ /*
+ * We expect to find an arena, with data, between offset
+ * and offset+arenasize. With any luck, the data starts at
+ * offset+ap.blocksize. The blocks have variable size and
+ * aren't padded at all, which doesn't give us any alignment
+ * constraints. The blocks are compressed or high entropy,
+ * but the headers are pretty low entropy (except the score):
+ *
+ * type[1] (range 0 thru 9, 13)
+ * size[2]
+ * uncsize[2] (<= size)
+ *
+ * so we can look for these. We check the scores as we go,
+ * so we can't make any wrong turns. If we find ourselves
+ * in a dead end, scan forward looking for a new start.
+ */
+
+ resetcibuf();
+ memset(head, 0, sizeof *head);
+ memset(arena, 0, sizeof *arena);
+ memset(oldscore, 0, VtScoreSize);
+ memset(score, 0, VtScoreSize);
+ memset(&oldsha, 0, sizeof oldsha);
+ memset(&newsha, 0, sizeof newsha);
+ newsha.rollback = 1;
+
+ if(0){
+ sbdebug(&oldsha, "old.sha");
+ sbdebug(&newsha, "new.sha");
+ }
+
+ loadarenabasics(offset0, anum, head, arena);
+
+ /* start the clump hunt */
+
+ clumps = 0;
+ totalcorrupt = 0;
+ sealing = 1;
+ boffset = offset0 + arena->blocksize;
+ offset = boffset;
+ eoffset = offset0+arena->size - arena->blocksize;
+ toffset = eoffset;
+ sp = pagein(offset0, 4*M);
+
+ if(arena->diskstats.sealed){
+ oldsha.offset = offset0;
+ sbupdate(&oldsha, sp, offset0, 4*M);
+ }
+ ep = sp+4*M;
+ p = sp + (boffset - offset0);
+ ncib = arena->blocksize / ClumpInfoSize; /* ci per block in index */
+ lastclumpend = offset;
+ nbad = 0;
+ inbad = 0;
+ needtozero = 0;
+ minclumps = 0;
+ while(offset < eoffset){
+ /*
+ * Shift buffer if we're running out of room.
+ */
+ if(p+70*K >= ep){
+ /*
+ * Start the post SHA1 buffer. By now we should know the
+ * clumpmagic and arena version, so we can create a
+ * correct head block to get things going.
+ */
+ if(sealing && fix && newsha.offset == 0){
+ newsha.offset = offset0;
+ if(arena->clumpmagic == 0){
+ if(arena->version == 0)
+ arena->version = ArenaVersion5;
+ arena->clumpmagic = newclumpmagic(arena->version);
+ }
+ head->clumpmagic = arena->clumpmagic;
+ shahead(&newsha, offset0, head);
+ }
+ n = 4*M-256*K;
+ if(sealing && fix){
+ sbdiskhash(&newsha, bufoffset);
+ sbupdate(&newsha, buf, bufoffset, 4*M-256*K);
+ }
+ pagein(bufoffset+n, 4*M);
+ p -= n;
+ if(arena->diskstats.sealed)
+ sbupdate(&oldsha, buf, bufoffset, 4*M);
+ }
+
+ /*
+ * Check for a clump at p, which is at offset in the disk.
+ * Duplicate clumps happen in corrupted disks
+ * (the same pattern gets written many times in a row)
+ * and should never happen during regular use.
+ */
+ if((n = isclump(p, &cl, &magic)) > 0){
+ /*
+ * If we were in the middle of some corrupted data,
+ * flush a warning about it and then add any clump
+ * info blocks as necessary.
+ */
+ if(inbad){
+ inbad = 0;
+ v = offset-lastclumpend;
+ if(needtozero){
+ zerorange(lastclumpend, v);
+ sbrollback(&newsha, lastclumpend);
+ print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n",
+ lastclumpend, v, v);
+ }
+ addcicorrupt(v);
+ totalcorrupt += v;
+ nb1 = (minclumps+ncib-1)/ncib;
+ minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize);
+ nb2 = (minclumps+ncib-1)/ncib;
+ eoffset -= (nb2-nb1)*arena->blocksize;
+ }
+
+ if(haveclump(cl.info.score))
+ print("warning: duplicate clump %d %V\n", cl.info.type, cl.info.score);
+
+ /*
+ * If clumps use different magic numbers, we don't care.
+ * We'll just use the first one we find and make the others
+ * follow suit.
+ */
+ if(arena->clumpmagic == 0){
+ print("clump type %d size %d score %V magic %x\n",
+ cl.info.type, cl.info.size, cl.info.score, magic);
+ arena->clumpmagic = magic;
+ if(magic == _ClumpMagic)
+ arena->version = ArenaVersion4;
+ else
+ arena->version = ArenaVersion5;
+ }
+ if(magic != arena->clumpmagic)
+ p32(p, arena->clumpmagic);
+ if(clumps == 0)
+ arena->ctime = cl.time;
+
+ /*
+ * Record the clump, update arena stats,
+ * grow clump info blocks if needed.
+ */
+ if(verbose > 1)
+ print("\tclump %d: %d %V at %#llux+%#ux (%d)\n",
+ clumps, cl.info.type, cl.info.score, offset, n, n);
+ addcibuf(&cl.info, 0);
+ if(minclumps%ncib == 0)
+ eoffset -= arena->blocksize;
+ minclumps++;
+ clumps++;
+ if(cl.encoding != ClumpENone)
+ arena->diskstats.cclumps++;
+ arena->diskstats.uncsize += cl.info.uncsize;
+ arena->wtime = cl.time;
+
+ /*
+ * Move to next clump.
+ */
+ offset += n;
+ p += n;
+ lastclumpend = offset;
+ }else{
+ /*
+ * Overwrite malformed clump data with zeros later.
+ * For now, just record whether it needs to be overwritten.
+ * Bad regions must be of size at least ClumpSize.
+ * Postponing the overwriting keeps us from writing past
+ * the end of the arena data (which might be directory data)
+ * with zeros.
+ */
+ if(!inbad){
+ inbad = 1;
+ needtozero = 0;
+ if(memcmp(p, zero, ClumpSize) != 0)
+ needtozero = 1;
+ p += ClumpSize;
+ offset += ClumpSize;
+ nbad++;
+ }else{
+ if(*p != 0)
+ needtozero = 1;
+ p++;
+ offset++;
+ }
+ }
+ }
+ pageout();
+
+ if(verbose)
+ print("readable clumps: %d; min. directory entries: %d\n",
+ clumps, minclumps);
+ arena->diskstats.used = lastclumpend - boffset;
+ leaked = eoffset - lastclumpend;
+ if(verbose)
+ print("used from %#llux to %#llux = %,lld (%,lld unused)\n",
+ boffset, lastclumpend, arena->diskstats.used, leaked);
+
+ /*
+ * Finish the SHA1 of the old data.
+ */
+ if(arena->diskstats.sealed){
+ sbdiskhash(&oldsha, toffset);
+ readdisk(dbuf, toffset, arena->blocksize);
+ scorecp(dbuf+arena->blocksize-VtScoreSize, zero);
+ sbupdate(&oldsha, dbuf, toffset, arena->blocksize);
+ sbscore(&oldsha, oldscore);
+ }
+
+ /*
+ * If we still don't know the clump magic, the arena
+ * must be empty. It still needs a value, so make
+ * something up.
+ */
+ if(arena->version == 0)
+ arena->version = ArenaVersion5;
+ if(arena->clumpmagic == 0){
+ if(arena->version == ArenaVersion4)
+ arena->clumpmagic = _ClumpMagic;
+ else{
+ do
+ arena->clumpmagic = fastrand();
+ while(arena->clumpmagic==_ClumpMagic
+ ||arena->clumpmagic==0);
+ }
+ head->clumpmagic = arena->clumpmagic;
+ }
+
+ /*
+ * Guess at number of clumpinfo blocks to load.
+ * If we guess high, it's no big deal. If we guess low,
+ * we'll be forced into rewriting the whole directory.
+ * Still not such a big deal.
+ */
+ if(clumps == 0 || arena->diskstats.used == totalcorrupt)
+ goto Nocib;
+ if(clumps < arena->diskstats.clumps)
+ clumps = arena->diskstats.clumps;
+ if(clumps < ncibuf)
+ clumps = ncibuf;
+ clumps += totalcorrupt/
+ ((arena->diskstats.used - totalcorrupt)/clumps);
+ clumps += totalcorrupt/2000;
+ if(clumps < minclumps)
+ clumps = minclumps;
+ clumps += ncib-1;
+ clumps -= clumps%ncib;
+
+ /*
+ * Can't write into the actual data.
+ */
+ v = offset0 + arena->size - arena->blocksize;
+ v -= (clumps+ncib-1)/ncib * arena->blocksize;
+ if(v < lastclumpend){
+ v = offset0 + arena->size - arena->blocksize;
+ clumps = (v-lastclumpend)/arena->blocksize * ncib;
+ }
+
+ if(clumps < minclumps)
+ print("cannot happen?\n");
+
+ /*
+ * Check clumpinfo blocks against directory we created.
+ * The tricky part is handling the corrupt sections of arena.
+ * If possible, we remark just the affected directory entries
+ * rather than slide everything down.
+ *
+ * Allocate clumps+1 blocks and check that we don't need
+ * the last one at the end.
+ */
+ bci = loadci(offset0, arena, clumps+1);
+ eci = bci+clumps+1;
+ bcit = cibuf;
+ ecit = cibuf+ncibuf;
+ smart = 1;
+Again:
+ nbad = 0;
+ ci = bci;
+ for(cit=bcit; cit<ecit && ci<eci; cit++){
+ if(cit->corrupt){
+ vlong n, m;
+ if(smart){
+ /*
+ * If we can, just mark existing entries as corrupt.
+ */
+ n = cit->corrupt;
+ for(xci=ci; n>0 && xci<eci; xci++)
+ n -= ClumpSize+xci->size;
+ if(n > 0 || xci >= eci)
+ goto Dumb;
+ printed = 0;
+ for(; ci<xci; ci++){
+ if(verbose && ci->type != VtCorruptType){
+ if(!printed){
+ print("marking directory %d-%d as corrupt\n",
+ (int)(ci-bci), (int)(xci-bci));
+ printed = 1;
+ }
+ print("\ttype=%d size=%d uncsize=%d score=%V\n",
+ ci->type, ci->size, ci->uncsize, ci->score);
+ }
+ ci->type = VtCorruptType;
+ }
+ }else{
+ Dumb:
+ print("\trewriting clump directory\n");
+ /*
+ * Otherwise, blaze a new trail.
+ */
+ n = cit->corrupt;
+ while(n > 0 && ci < eci){
+ if(n < ClumpSize)
+ sysfatal("bad math in clump corrupt");
+ if(n <= VtMaxLumpSize+ClumpSize)
+ m = n;
+ else{
+ m = VtMaxLumpSize+ClumpSize;
+ if(n-m < ClumpSize)
+ m -= ClumpSize;
+ }
+ ci->type = VtCorruptType;
+ ci->size = m-ClumpSize;
+ ci->uncsize = m-ClumpSize;
+ memset(ci->score, 0, VtScoreSize);
+ ci++;
+ n -= m;
+ }
+ }
+ continue;
+ }
+ if(clumpinfocmp(&cit->ci, ci) != 0){
+ if(verbose && (smart || verbose>1)){
+ print("clumpinfo %d\n", (int)(ci-bci));
+ print("\twant: %d %d %d %V\n",
+ cit->ci.type, cit->ci.size,
+ cit->ci.uncsize, cit->ci.score);
+ print("\thave: %d %d %d %V\n",
+ ci->type, ci->size,
+ ci->uncsize, ci->score);
+ }
+ *ci = cit->ci;
+ nbad++;
+ }
+ ci++;
+ }
+ if(ci >= eci || cit < ecit){
+ print("ran out of space editing existing directory; rewriting\n");
+ print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit);
+ assert(smart); /* can't happen second time thru */
+ smart = 0;
+ goto Again;
+ }
+
+ assert(ci <= eci);
+ arena->diskstats.clumps = ci-bci;
+ eoffset = writeci(offset0, arena, bci, ci-bci);
+ if(sealing && fix)
+ sbrollback(&newsha, v);
+print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal);
+ if(lastclumpend > eoffset)
+ print("arena directory overwrote blocks! cannot happen!\n");
+ free(bci);
+ if(smart && nbad)
+ print("arena directory has %d bad or missing entries\n", nbad);
+Nocib:
+ if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){
+ if(arena->diskstats.sealed)
+ print("unsealing arena\n");
+ sealing = 0;
+ memset(oldscore, 0, VtScoreSize);
+ }
+
+ /*
+ * Finish the SHA1 of the new data - only meaningful
+ * if we've been writing to disk (`fix').
+ */
+ arena->diskstats.sealed = sealing;
+ arena->memstats = arena->diskstats;
+ if(sealing && fix){
+ uchar tbuf[MaxDiskBlock];
+
+ sbdiskhash(&newsha, toffset);
+ memset(tbuf, 0, sizeof tbuf);
+ packarena(arena, tbuf);
+ sbupdate(&newsha, tbuf, toffset, arena->blocksize);
+ sbscore(&newsha, score);
+ }
+}
+
+void
+dumparena(vlong offset, int anum, Arena *arena)
+{
+ char buf[1000];
+ vlong o, e;
+ int fd, n;
+
+ snprint(buf, sizeof buf, "%s.%d", dumpbase, anum);
+ if((fd = create(buf, OWRITE, 0666)) < 0){
+ fprint(2, "create %s: %r\n", buf);
+ return;
+ }
+ e = offset+arena->size;
+ for(o=offset; o<e; o+=n){
+ n = 4*M;
+ if(o+n > e)
+ n = e-o;
+ if(pwrite(fd, pagein(o, n), n, o-offset) != n){
+ fprint(2, "write %s at %#llux: %r\n", buf, o-offset);
+ return;
+ }
+ }
+}
+
+void
+checkarena(vlong offset, int anum)
+{
+ uchar dbuf[MaxDiskBlock];
+ uchar *p, oldscore[VtScoreSize], score[VtScoreSize];
+ Arena arena, oarena;
+ ArenaHead head;
+ Info *fmt, *fmta;
+ int sz;
+
+ print("# arena %d: offset %#llux\n", anum, offset);
+
+ if(offset >= partend){
+ print("arena offset out of bounds\n");
+ return;
+ }
+
+ guessarena(offset, anum, &head, &arena, oldscore, score);
+
+ if(verbose){
+ print("#\tversion=%d name=%s blocksize=%d size=%z",
+ head.version, head.name, head.blocksize, head.size);
+ if(head.clumpmagic)
+ print(" clumpmagic=%#.8ux", head.clumpmagic);
+ print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n",
+ arena.diskstats.clumps, arena.diskstats.cclumps,
+ arena.diskstats.used, arena.diskstats.uncsize);
+ print("#\tctime=%t\n", arena.ctime);
+ print("#\twtime=%t\n", arena.wtime);
+ if(arena.diskstats.sealed)
+ print("#\tsealed score=%V\n", score);
+ }
+
+ if(dumpbase){
+ dumparena(offset, anum, &arena);
+ return;
+ }
+
+ memset(dbuf, 0, sizeof dbuf);
+ packarenahead(&head, dbuf);
+ p = pagein(offset, arena.blocksize);
+ if(memcmp(dbuf, p, arena.blocksize) != 0){
+ print("on-disk arena header incorrect\n");
+ showdiffs(dbuf, p, arena.blocksize,
+ arena.version==ArenaVersion4 ? headinfo4 : headinfo5);
+ }
+ memmove(p, dbuf, arena.blocksize);
+
+ memset(dbuf, 0, sizeof dbuf);
+ packarena(&arena, dbuf);
+ if(arena.diskstats.sealed)
+ scorecp(dbuf+arena.blocksize-VtScoreSize, score);
+ p = pagein(offset+arena.size-arena.blocksize, arena.blocksize);
+ memset(&oarena, 0, sizeof oarena);
+ unpackarena(&oarena, p);
+ if(arena.version == ArenaVersion4){
+ sz = ArenaSize4;
+ fmt = tailinfo4;
+ fmta = tailinfo4a;
+ }else{
+ sz = ArenaSize5;
+ fmt = tailinfo5;
+ fmta = tailinfo5a;
+ }
+ if(p[sz] == 1){
+ fmt = fmta;
+ if(oarena.diskstats.sealed){
+ /*
+ * some arenas were sealed with the extension
+ * before we adopted the convention that if it didn't
+ * add new information it gets dropped.
+ */
+ _packarena(&arena, dbuf, 1);
+ }
+ }
+ if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){
+ print("on-disk arena tail incorrect\n");
+ showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt);
+ }
+ if(arena.diskstats.sealed){
+ if(oarena.diskstats.sealed)
+ if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){
+ print("on-disk arena seal score incorrect\n");
+ print("\tcorrect=%V\n", oldscore);
+ print("\t disk=%V\n", p+arena.blocksize-VtScoreSize);
+ }
+ if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){
+ print("%ssealing arena%s: %V\n",
+ oarena.diskstats.sealed ? "re" : "",
+ scorecmp(oldscore, score) == 0 ?
+ "" : " after changes", score);
+ }
+ }
+ memmove(p, dbuf, arena.blocksize);
+
+ pageout();
+}
+
+AMapN*
+buildamap(void)
+{
+ uchar *p;
+ vlong o;
+ ArenaHead h;
+ AMapN *an;
+ AMap *m;
+
+ an = vtmallocz(sizeof *an);
+ for(o=ap.arenabase; o<partend; o+=arenasize){
+ p = pagein(o, Block);
+ if(unpackarenahead(&h, p) >= 0){
+ an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]);
+ m = &an->map[an->n++];
+ m->start = o;
+ m->stop = o+h.size;
+ strcpy(m->name, h.name);
+ }
+ }
+ return an;
+}
+
+void
+checkmap(void)
+{
+ char *s;
+ uchar *p;
+ int i, len;
+ AMapN *an;
+ Fmt fmt;
+
+ an = buildamap();
+ fmtstrinit(&fmt);
+ fmtprint(&fmt, "%ud\n", an->n);
+ for(i=0; i<an->n; i++)
+ fmtprint(&fmt, "%s\t%lld\t%lld\n",
+ an->map[i].name, an->map[i].start, an->map[i].stop);
+ s = fmtstrflush(&fmt);
+ len = strlen(s);
+ if(len > ap.tabsize){
+ print("arena partition map too long: need %z bytes have %z\n",
+ (vlong)len, (vlong)ap.tabsize);
+ len = ap.tabsize;
+ }
+
+ if(ap.tabsize >= 4*M){ /* can't happen - max arenas is 2000 */
+ print("arena partition map *way* too long\n");
+ return;
+ }
+
+ p = pagein(ap.tabbase, ap.tabsize);
+ if(memcmp(p, s, len) != 0){
+ print("arena partition map incorrect; rewriting.\n");
+ memmove(p, s, len);
+ }
+ pageout();
+}
+
+int mainstacksize = 512*1024;
+
+void
+threadmain(int argc, char **argv)
+{
+ int mode;
+
+ mode = OREAD;
+ readonly = 1;
+ ARGBEGIN{
+ case 'U':
+ unseal = 1;
+ break;
+ case 'a':
+ arenasize = unittoull(EARGF(usage()));
+ break;
+ case 'b':
+ ap.blocksize = unittoull(EARGF(usage()));
+ break;
+ case 'f':
+ fix = 1;
+ mode = ORDWR;
+ readonly = 0;
+ break;
+ case 'n':
+ basename = EARGF(usage());
+ break;
+ case 'v':
+ verbose++;
+ break;
+ case 'x':
+ dumpbase = EARGF(usage());
+ break;
+ default:
+ usage();
+ }ARGEND
+
+ if(argc != 1 && argc != 2)
+ usage();
+
+ file = argv[0];
+
+ ventifmtinstall();
+ fmtinstall('z', zfmt);
+ fmtinstall('t', tfmt);
+ quotefmtinstall();
+
+ part = initpart(file, mode|ODIRECT);
+ if(part == nil)
+ sysfatal("can't open %s: %r", file);
+ partend = part->size;
+
+ checkarenas(argc > 1 ? argv[1] : nil);
+ checkmap();
+ threadexitsall(nil);
+}
+
diff --git a/src/cmd/venti/srv/fns.h b/src/cmd/venti/srv/fns.h
index f35580ed..1a6f1e4b 100644
--- a/src/cmd/venti/srv/fns.h
+++ b/src/cmd/venti/srv/fns.h
@@ -24,8 +24,13 @@ void delaykickicache(void);
void delaykickround(Round*);
void delaykickroundproc(void*);
void dirtydblock(DBlock*, int);
+void diskaccess(int);
+void disksched(void);
AState diskstate(void);
void *emalloc(ulong);
+void emptydcache(void);
+void emptyicache(void);
+void emptylumpcache(void);
void *erealloc(void *, ulong);
char *estrdup(char*);
void *ezmalloc(ulong);
@@ -49,6 +54,7 @@ u32int hashbits(u8int *score, int nbits);
int httpdinit(char *address, char *webroot);
int iaddrcmp(IAddr *ia1, IAddr *ia2);
IEntry* icachedirty(u32int, u32int, u64int);
+ulong icachedirtyfrac(void);
void icacheclean(IEntry*);
int ientrycmp(const void *vie1, const void *vie2);
char *ifileline(IFile *f);
@@ -77,6 +83,7 @@ int insertscore(u8int *score, IAddr *ia, int write);
void kickdcache(void);
void kickicache(void);
void kickround(Round*, int wait);
+int loadbloom(Bloom*);
ZBlock *loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify);
DBlock *loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucket *ib);
int loadientry(Index *index, u8int *score, int type, IEntry *ie);
@@ -98,6 +105,7 @@ int okamap(AMap *am, int n, u64int start, u64int stop, char *what);
int okibucket(IBucket*, ISect*);
int outputamap(Fmt *f, AMap *am, int n);
int outputindex(Fmt *f, Index *ix);
+int _packarena(Arena *arena, u8int *buf, int);
int packarena(Arena *arena, u8int *buf);
int packarenahead(ArenaHead *head, u8int *buf);
int packarenapart(ArenaPart *as, u8int *buf);
@@ -129,6 +137,7 @@ ZBlock *readfile(char *name);
int readifile(IFile *f, char *name);
Packet *readlump(u8int *score, int type, u32int size, int *cached);
int readpart(Part *part, u64int addr, u8int *buf, u32int n);
+int resetbloom(Bloom*);
int runconfig(char *config, Config*);
int scorecmp(u8int *, u8int *);
void scoremem(u8int *score, u8int *buf, int size);
diff --git a/src/cmd/venti/srv/graph.c b/src/cmd/venti/srv/graph.c
index 647c74b2..9c906ad7 100644
--- a/src/cmd/venti/srv/graph.c
+++ b/src/cmd/venti/srv/graph.c
@@ -55,7 +55,11 @@ ginit(void)
first = 0;
memimageinit();
+#ifdef PLAN9PORT
smallfont = openmemsubfont(unsharp("#9/font/lucsans/lstr.10"));
+#else
+ smallfont = openmemsubfont("/lib/font/bit/lucidasans/lstr.10");
+#endif
black = memblack;
blue = allocrepl(DBlue);
red = allocrepl(DRed);
@@ -121,7 +125,7 @@ statgraph(Graph *g)
if(g->wid > nelem(bin))
g->wid = nelem(bin);
if(g->fill < 0)
- g->fill = ((uint)(uintptr)g->arg>>8)%nelem(lofill);
+ g->fill = ((uint)g->arg>>8)%nelem(lofill);
if(g->fill > nelem(lofill))
g->fill %= nelem(lofill);
@@ -151,7 +155,7 @@ statgraph(Graph *g)
qlock(&memdrawlock);
ginit();
if(smallfont==nil || black==nil || blue==nil || red==nil || hifill==nil || lofill==nil){
- werrstr("graphics initialization failed");
+ werrstr("graphics initialization failed: %r");
qunlock(&memdrawlock);
return nil;
}
@@ -186,12 +190,12 @@ statgraph(Graph *g)
if(0)
if(lastlo != -1){
if(lastlo < lo)
- memimagedraw(m, Rect(x-1, lastlo, x, lo), hifill[g->fill], ZP, memopaque, ZP, S);
+ memimagedraw(m, Rect(x-1, lastlo, x, lo), hifill[g->fill%nelem(hifill)], ZP, memopaque, ZP, S);
else if(lastlo > lo)
- memimagedraw(m, Rect(x-1, lo, x, lastlo), hifill[g->fill], ZP, memopaque, ZP, S);
+ memimagedraw(m, Rect(x-1, lo, x, lastlo), hifill[g->fill%nelem(hifill)], ZP, memopaque, ZP, S);
}
- memimagedraw(m, Rect(x, hi, x+1,lo), hifill[g->fill], ZP, memopaque, ZP, S);
- memimagedraw(m, Rect(x, lo, x+1, r.max.y), lofill[g->fill], ZP, memopaque, ZP, S);
+ memimagedraw(m, Rect(x, hi, x+1,lo), hifill[g->fill%nelem(hifill)], ZP, memopaque, ZP, S);
+ memimagedraw(m, Rect(x, lo, x+1, r.max.y), lofill[g->fill%nelem(lofill)], ZP, memopaque, ZP, S);
lastlo = lo;
}
diff --git a/src/cmd/venti/srv/httpd.c b/src/cmd/venti/srv/httpd.c
index ad7222dd..04d19d9d 100644
--- a/src/cmd/venti/srv/httpd.c
+++ b/src/cmd/venti/srv/httpd.c
@@ -9,7 +9,7 @@ extern QLock memdrawlock;
enum
{
ObjNameSize = 64,
- MaxObjs = 16
+ MaxObjs = 64
};
struct HttpObj
@@ -28,6 +28,12 @@ static int dindex(HConnect *c);
static int xindex(HConnect *c);
static int xlog(HConnect *c);
static int sindex(HConnect *c);
+static int hempty(HConnect *c);
+static int hlcacheempty(HConnect *c);
+static int hdcacheempty(HConnect *c);
+static int hicacheempty(HConnect *c);
+static int hicachekick(HConnect *c);
+static int hdcachekick(HConnect *c);
static int hicacheflush(HConnect *c);
static int hdcacheflush(HConnect *c);
static int notfound(HConnect *c);
@@ -53,10 +59,17 @@ httpdinit(char *address, char *dir)
httpdobj("/xindex", xindex);
httpdobj("/flushicache", hicacheflush);
httpdobj("/flushdcache", hdcacheflush);
+ httpdobj("/kickicache", hicachekick);
+ httpdobj("/kickdcache", hdcachekick);
httpdobj("/graph/", xgraph);
+ httpdobj("/set", xset);
httpdobj("/set/", xset);
httpdobj("/log", xlog);
httpdobj("/log/", xlog);
+ httpdobj("/empty", hempty);
+ httpdobj("/emptyicache", hicacheempty);
+ httpdobj("/emptylumpcache", hlcacheempty);
+ httpdobj("/emptydcache", hdcacheempty);
if(vtproc(listenproc, address) < 0)
return -1;
@@ -105,8 +118,6 @@ listenproc(void *vaddress)
char *address, ndir[NETPATHLEN], dir[NETPATHLEN];
int ctl, nctl, data;
-/*sleep(1000); // let strace find us */
-
address = vaddress;
ctl = announce(address, dir);
if(ctl < 0){
@@ -148,7 +159,6 @@ httpproc(void *v)
HConnect *c;
int ok, i, n;
-/*sleep(1000); // let strace find us */
c = v;
for(;;){
@@ -182,7 +192,7 @@ httpproc(void *v)
}
static int
-percent(long v, long total)
+percent(ulong v, ulong total)
{
if(total == 0)
total = 1;
@@ -240,6 +250,31 @@ preqtext(HConnect *c)
}
static int
+herror(HConnect *c)
+{
+ int n;
+ Hio *hout;
+
+ hout = &c->hout;
+ n = snprint(c->xferbuf, HBufSize, "<html><head><title>Error</title></head>\n<body><h1>Error</h1>\n<pre>%r</pre>\n</body></html>");
+ hprint(hout, "%s %s\r\n", hversion, "400 Bad Request");
+ hprint(hout, "Date: %D\r\n", time(nil));
+ hprint(hout, "Server: Venti\r\n");
+ hprint(hout, "Content-Type: text/html\r\n");
+ hprint(hout, "Content-Length: %d\r\n", n);
+ if(c->head.closeit)
+ hprint(hout, "Connection: close\r\n");
+ else if(!http11(c))
+ hprint(hout, "Connection: Keep-Alive\r\n");
+ hprint(hout, "\r\n");
+
+ if(c->req.meth == nil || strcmp(c->req.meth, "HEAD") != 0)
+ hwrite(hout, c->xferbuf, n);
+
+ return hflush(hout);
+}
+
+static int
notfound(HConnect *c)
{
int r;
@@ -325,21 +360,53 @@ static struct
"logging", &ventilogging,
"stats", &collectstats,
"icachesleeptime", &icachesleeptime,
+ "minicachesleeptime", &minicachesleeptime,
"arenasumsleeptime", &arenasumsleeptime,
+ "l0quantum", &l0quantum,
+ "l1quantum", &l1quantum,
+ "manualscheduling", &manualscheduling,
+ "ignorebloom", &ignorebloom,
+ "syncwrites", &syncwrites,
+ "icacheprefetch", &icacheprefetch,
0
};
static int
+xsetlist(HConnect *c)
+{
+ int i;
+
+ if(preqtype(c, "text/plain") < 0)
+ return -1;
+ for(i=0; namedints[i].name; i++)
+ print("%s = %d\n", namedints[i].name, *namedints[i].p);
+ hflush(&c->hout);
+ return 0;
+}
+
+
+
+static int
xset(HConnect *c)
{
int i, nf, r;
char *f[10], *s;
+ if(strcmp(c->req.uri, "/set") == 0 || strcmp(c->req.uri, "/set/") == 0)
+ return xsetlist(c);
+
s = estrdup(c->req.uri);
nf = getfields(s+strlen("/set/"), f, nelem(f), 1, "/");
- if(nf < 1)
- return notfound(c);
+ if(nf < 1){
+ r = preqtext(c);
+ if(r < 0)
+ return r;
+ for(i=0; namedints[i].name; i++)
+ hprint(&c->hout, "%s = %d\n", namedints[i].name, *namedints[i].p);
+ hflush(&c->hout);
+ return 0;
+ }
for(i=0; namedints[i].name; i++){
if(strcmp(f[0], namedints[i].name) == 0){
if(nf >= 2)
@@ -495,6 +562,108 @@ darena(Hio *hout, Arena *arena)
}
static int
+hempty(HConnect *c)
+{
+ Hio *hout;
+ int r;
+
+ r = preqtext(c);
+ if(r < 0)
+ return r;
+ hout = &c->hout;
+
+ emptylumpcache();
+ emptydcache();
+ emptyicache();
+ hprint(hout, "emptied all caches\n");
+ hflush(hout);
+ return 0;
+}
+
+static int
+hlcacheempty(HConnect *c)
+{
+ Hio *hout;
+ int r;
+
+ r = preqtext(c);
+ if(r < 0)
+ return r;
+ hout = &c->hout;
+
+ emptylumpcache();
+ hprint(hout, "emptied lumpcache\n");
+ hflush(hout);
+ return 0;
+}
+
+static int
+hicacheempty(HConnect *c)
+{
+ Hio *hout;
+ int r;
+
+ r = preqtext(c);
+ if(r < 0)
+ return r;
+ hout = &c->hout;
+
+ emptyicache();
+ hprint(hout, "emptied icache\n");
+ hflush(hout);
+ return 0;
+}
+
+static int
+hdcacheempty(HConnect *c)
+{
+ Hio *hout;
+ int r;
+
+ r = preqtext(c);
+ if(r < 0)
+ return r;
+ hout = &c->hout;
+
+ emptydcache();
+ hprint(hout, "emptied dcache\n");
+ hflush(hout);
+ return 0;
+}
+static int
+hicachekick(HConnect *c)
+{
+ Hio *hout;
+ int r;
+
+ r = preqtext(c);
+ if(r < 0)
+ return r;
+ hout = &c->hout;
+
+ kickicache();
+ hprint(hout, "kicked icache\n");
+ hflush(hout);
+ return 0;
+}
+
+static int
+hdcachekick(HConnect *c)
+{
+ Hio *hout;
+ int r;
+
+ r = preqtext(c);
+ if(r < 0)
+ return r;
+ hout = &c->hout;
+
+ kickdcache();
+ hprint(hout, "kicked dcache\n");
+ hflush(hout);
+ return 0;
+}
+static int
hicacheflush(HConnect *c)
{
Hio *hout;
@@ -569,6 +738,7 @@ rawgraph(Stats *s, Stats *t, void *va)
{
Arg *a;
+ USED(s);
a = va;
return t->n[a->index];
}
@@ -587,6 +757,7 @@ pctgraph(Stats *s, Stats *t, void *va)
{
Arg *a;
+ USED(s);
a = va;
return percent(t->n[a->index], t->n[a->index2]);
}
@@ -722,7 +893,7 @@ static char* graphname[] =
"isectwritebyte",
"sumread",
- "sumreadbyte"
+ "sumreadbyte",
};
static int
@@ -733,7 +904,6 @@ findname(char *s)
for(i=0; i<nelem(graphname); i++)
if(strcmp(graphname[i], s) == 0)
return i;
-fprint(2, "no name '%s'\n", s);
return -1;
}
@@ -769,10 +939,14 @@ xgraph(HConnect *c)
if(0) fprint(2, "graph %s\n" ,s);
memset(&g, 0, sizeof g);
nf = getfields(s+strlen("/graph/"), f, nelem(f), 1, "/");
- if(nf < 1)
- goto notfound;
- if((arg.index = findname(f[0])) == -1 && strcmp(f[0], "*") != 0)
- goto notfound;
+ if(nf < 1){
+ werrstr("bad syntax -- not enough fields");
+ goto error;
+ }
+ if((arg.index = findname(f[0])) == -1 && strcmp(f[0], "*") != 0){
+ werrstr("unknown name %s", f[0]);
+ goto error;
+ }
g.arg = &arg;
g.t0 = -120;
g.t1 = 0;
@@ -793,14 +967,18 @@ if(0) fprint(2, "graph %s\n" ,s);
else if(strncmp(f[i], "max=", 4) == 0)
g.max = atoi(f[i]+4);
else if(strncmp(f[i], "pct=", 4) == 0){
- if((arg.index2 = findname(f[i]+4)) == -1)
- goto notfound;
+ if((arg.index2 = findname(f[i]+4)) == -1){
+ werrstr("unknown name %s", f[i]+4);
+ goto error;
+ }
g.fn = pctgraph;
g.min = 0;
g.max = 100;
}else if(strncmp(f[i], "pctdiff=", 8) == 0){
- if((arg.index2 = findname(f[i]+8)) == -1)
- goto notfound;
+ if((arg.index2 = findname(f[i]+8)) == -1){
+ werrstr("unknown name %s", f[i]+8);
+ goto error;
+ }
g.fn = pctdiffgraph;
g.min = 0;
g.max = 100;
@@ -830,7 +1008,7 @@ if(0) fprint(2, "graph %s\n" ,s);
m = statgraph(&g);
if(m == nil)
- goto notfound;
+ goto error;
if(preqtype(c, "image/png") < 0)
return -1;
@@ -843,9 +1021,9 @@ if(0) fprint(2, "graph %s\n" ,s);
free(s);
return 0;
-notfound:
+error:
free(s);
- return notfound(c);
+ return herror(c);
}
static int
@@ -944,7 +1122,6 @@ vtloghdump(Hio *h, VtLog *l)
name = l ? l->name : "&lt;nil&gt;";
-fprint(2, "hdump xfer %d\n", h->xferenc);
hprint(h, "<html><head>\n");
hprint(h, "<title>Venti Server Log: %s</title>\n", name);
hprint(h, "</head><body>\n");
diff --git a/src/cmd/venti/srv/icache.c b/src/cmd/venti/srv/icache.c
index 46d411e5..49f741e7 100644
--- a/src/cmd/venti/srv/icache.c
+++ b/src/cmd/venti/srv/icache.c
@@ -11,6 +11,7 @@ struct ICache
int bits; /* bits to use for indexing heads */
u32int size; /* number of heads; == 1 << bits, should be < entries */
IEntry *base; /* all allocated hash table entries */
+ IEntry *free;
u32int entries; /* elements in base */
IEntry *dirty; /* chain of dirty elements */
u32int ndirty;
@@ -23,6 +24,8 @@ struct ICache
int nlast;
};
+int icacheprefetch = 1;
+
static ICache icache;
static IEntry *icachealloc(IAddr *ia, u8int *score);
@@ -45,6 +48,12 @@ initicache(int bits, int depth)
setstat(StatIcacheSize, icache.entries);
}
+ulong
+icachedirtyfrac(void)
+{
+ return (vlong)icache.ndirty*IcacheFrac / icache.entries;
+}
+
u32int
hashbits(u8int *sc, int bits)
{
@@ -141,14 +150,16 @@ lookupscore(u8int *score, int type, IAddr *ia, int *rac)
* load the table of contents for that arena into the cache.
*/
ie = icachealloc(&d.ia, score);
- icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa);
- aa = ie->ia.addr - aa; /* compute base addr of arena */
- for(i=0; i<nelem(icache.last); i++)
- if(icache.last[i] != icache.last[0])
- break;
- if(i==nelem(icache.last) && icache.lastload != icache.last[0]){
- load = icache.last[0];
- icache.lastload = load;
+ if(icacheprefetch){
+ icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa);
+ aa = ie->ia.addr - aa; /* compute base addr of arena */
+ for(i=0; i<nelem(icache.last); i++)
+ if(icache.last[i] != icache.last[0])
+ break;
+ if(i==nelem(icache.last) && icache.lastload != icache.last[0]){
+ load = icache.last[0];
+ icache.lastload = load;
+ }
}
found:
@@ -249,6 +260,11 @@ icachealloc(IAddr *ia, u8int *score)
trace(TraceLump, "icachealloc unused");
goto Found;
}
+
+ if((ie = icache.free) != nil){
+ icache.free = ie->next;
+ goto Found;
+ }
h = icache.stolen;
for(i=0;; i++){
@@ -346,3 +362,21 @@ icacheclean(IEntry *ie)
trace(TraceProc, "icachedirty exit");
}
+void
+emptyicache(void)
+{
+ int i;
+ IEntry *ie, **lie;
+
+ qlock(&icache.lock);
+ for(i=0; i<icache.size; i++)
+ for(lie=&icache.heads[i]; (ie=*lie); ){
+ if(ie->dirty == 0){
+ *lie = ie->next;
+ ie->next = icache.free;
+ icache.free = ie;
+ }else
+ lie = &ie->next;
+ }
+ qunlock(&icache.lock);
+}
diff --git a/src/cmd/venti/srv/icachewrite.c b/src/cmd/venti/srv/icachewrite.c
index 9c36ba2c..003abb18 100644
--- a/src/cmd/venti/srv/icachewrite.c
+++ b/src/cmd/venti/srv/icachewrite.c
@@ -12,6 +12,7 @@ static void icachewritecoord(void*);
static IEntry *iesort(IEntry*);
int icachesleeptime = 1000; /* milliseconds */
+int minicachesleeptime = 50;
enum
{
@@ -74,7 +75,7 @@ nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *paddr, uint *pnbuf)
static int
icachewritesect(Index *ix, ISect *is, u8int *buf)
{
- int err, h, bsize;
+ int err, h, bsize, t;
u32int lo, hi;
u64int addr, naddr;
uint nbuf, off;
@@ -96,7 +97,14 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
err = 0;
while(iedirty){
- sleep(icachesleeptime);
+ disksched();
+ while((t=icachesleeptime) == SleepForever){
+ sleep(1000);
+ disksched();
+ }
+ if(t < minicachesleeptime)
+ t = minicachesleeptime;
+ sleep(t);
trace(TraceProc, "icachewritesect nextchunk");
chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf);
@@ -146,12 +154,15 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
break;
}
packibucket(&ib, buf+off, is->bucketmagic);
+ /* XXX not right - must update cache after writepart */
if((b = _getdblock(is->part, naddr, ORDWR, 0)) != nil){
memmove(b->data, buf+off, bsize);
putdblock(b);
}
}
+ diskaccess(1);
+
trace(TraceProc, "icachewritesect writepart", addr, nbuf);
if(writepart(is->part, addr, buf, nbuf) < 0){
/* XXX */
@@ -171,6 +182,7 @@ icachewritesect(Index *ix, ISect *is, u8int *buf)
static void
icachewriteproc(void *v)
{
+ int ret;
uint bsize;
ISect *is;
Index *ix;
@@ -188,17 +200,17 @@ icachewriteproc(void *v)
trace(TraceProc, "icachewriteproc recv");
recv(is->writechan, 0);
trace(TraceWork, "start");
- icachewritesect(ix, is, buf);
+ ret = icachewritesect(ix, is, buf);
trace(TraceProc, "icachewriteproc send");
trace(TraceWork, "finish");
- send(is->writedonechan, 0);
+ sendul(is->writedonechan, ret);
}
}
static void
icachewritecoord(void *v)
{
- int i;
+ int i, err;
Index *ix;
AState as;
@@ -216,9 +228,9 @@ icachewritecoord(void *v)
as = diskstate();
if(as.arena==iwrite.as.arena && as.aa==iwrite.as.aa){
/* will not be able to do anything more than last flush - kick disk */
- trace(TraceProc, "icachewritecoord flush dcache");
+ trace(TraceProc, "icachewritecoord kick dcache");
kickdcache();
- trace(TraceProc, "icachewritecoord flushed dcache");
+ trace(TraceProc, "icachewritecoord kicked dcache");
}
iwrite.as = as;
@@ -229,13 +241,15 @@ icachewritecoord(void *v)
if(ix->bloom)
send(ix->bloom->writechan, 0);
+ err = 0;
for(i=0; i<ix->nsects; i++)
- recv(ix->sects[i]->writedonechan, 0);
+ err |= recvul(ix->sects[i]->writedonechan);
if(ix->bloom)
- recv(ix->bloom->writedonechan, 0);
+ err |= recvul(ix->bloom->writedonechan);
- trace(TraceProc, "icachewritecoord donewrite");
- setatailstate(&iwrite.as);
+ trace(TraceProc, "icachewritecoord donewrite err=%d", err);
+ if(err == 0)
+ setatailstate(&iwrite.as);
}
icacheclean(nil); /* wake up anyone waiting */
trace(TraceWork, "finish");
diff --git a/src/cmd/venti/srv/index.c b/src/cmd/venti/srv/index.c
index 8cff4180..c69192a7 100644
--- a/src/cmd/venti/srv/index.c
+++ b/src/cmd/venti/srv/index.c
@@ -23,17 +23,11 @@
#include "dat.h"
#include "fns.h"
-/*static int bucklook(u8int *score, int type, u8int *data, int n); */
-/*static int writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b); */
-/*static int okibucket(IBucket *ib, ISect *is); */
static int initindex1(Index*);
static ISect *initisect1(ISect *is);
-/*static int splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib); */
#define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0)
-/*static QLock indexlock; //ZZZ */
-
static char IndexMagic[] = "venti index configuration";
Index*
@@ -375,6 +369,8 @@ initisect(Part *part)
seterr(EAdmin, "can't read index section header: %r");
return nil;
}
+print("read %s at %d: %.2ux %.2ux %.2ux %.2ux\n",
+ part->name, PartBlank, b->data[0], b->data[1], b->data[2], b->data[3]);
is = MKZ(ISect);
if(is == nil){
@@ -457,9 +453,10 @@ initisect1(ISect *is)
v = is->part->size & ~(u64int)(is->blocksize - 1);
if(is->blockbase + (u64int)is->blocks * is->blocksize != v){
seterr(ECorrupt, "invalid blocks in index section %s", is->name);
-/*ZZZZZZZZZ */
-/* freeisect(is); */
-/* return nil; */
+ /* ZZZ what to do?
+ freeisect(is);
+ return nil;
+ */
}
if(is->stop - is->start > is->blocks){
@@ -482,9 +479,10 @@ wbisect(ISect *is)
ZBlock *b;
b = alloczblock(HeadSize, 1, 0);
- if(b == nil)
-/*ZZZ set error? */
+ if(b == nil){
+ /* ZZZ set error? */
return -1;
+ }
if(packisect(is, b->data) < 0){
seterr(ECorrupt, "can't make index section header: %r");
@@ -789,7 +787,7 @@ loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, in
/*
* find the number of the index section holding score
*/
-static int
+int
indexsect1(Index *ix, u8int *score)
{
return indexsect0(ix, hashbits(score, 32) / ix->div);
diff --git a/src/cmd/venti/srv/lump.c b/src/cmd/venti/srv/lump.c
index 1fe3cf5c..13e6fe6a 100644
--- a/src/cmd/venti/srv/lump.c
+++ b/src/cmd/venti/srv/lump.c
@@ -2,6 +2,7 @@
#include "dat.h"
#include "fns.h"
+int syncwrites = 0;
int queuewrites = 0;
int writestodevnull = 0;
@@ -45,7 +46,7 @@ readlump(u8int *score, int type, u32int size, int *cached)
*cached = 0;
if(lookupscore(score, type, &ia, &rac) < 0){
- /*ZZZ place to check for someone trying to guess scores */
+ /* ZZZ place to check for someone trying to guess scores */
seterr(EOk, "no block with score %V/%d exists", score, type);
putlump(u);
@@ -92,7 +93,15 @@ writelump(Packet *p, u8int *score, int type, u32int creator, uint ms)
if(u->data != nil){
ok = 0;
if(packetcmp(p, u->data) != 0){
- seterr(EStrange, "score collision");
+ uchar nscore[VtScoreSize];
+
+ packetsha1(u->data, nscore);
+ if(scorecmp(u->score, score) != 0)
+ seterr(EStrange, "lookuplump returned bad score %V not %V", u->score, score);
+ else if(scorecmp(u->score, nscore) != 0)
+ seterr(EStrange, "lookuplump returned bad data %V not %V", nscore, u->score);
+ else
+ seterr(EStrange, "score collision %V", score);
ok = -1;
}
packetfree(p);
@@ -138,7 +147,13 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
if(old != nil){
ok = 0;
if(packetcmp(p, old) != 0){
- seterr(EStrange, "score collision");
+ uchar nscore[VtScoreSize];
+
+ packetsha1(old, nscore);
+ if(scorecmp(u->score, nscore) != 0)
+ seterr(EStrange, "readilump returned bad data %V not %V", nscore, u->score);
+ else
+ seterr(EStrange, "score collision %V", u->score);
ok = -1;
}
packetfree(p);
@@ -160,6 +175,12 @@ writeqlump(Lump *u, Packet *p, int creator, uint ms)
insertlump(u, p);
else
packetfree(p);
+
+ if(syncwrites){
+ flushdcache();
+ flushicache();
+ flushdcache();
+ }
ms = msec() - ms;
addstat2(StatRpcWriteNew, 1, StatRpcWriteNewTime, ms);
diff --git a/src/cmd/venti/srv/lumpcache.c b/src/cmd/venti/srv/lumpcache.c
index f183e128..b989c3cb 100644
--- a/src/cmd/venti/srv/lumpcache.c
+++ b/src/cmd/venti/srv/lumpcache.c
@@ -11,7 +11,7 @@ enum
{
HashLog = 9,
HashSize = 1<<HashLog,
- HashMask = HashSize - 1
+ HashMask = HashSize - 1,
};
struct LumpCache
@@ -175,7 +175,6 @@ again:
* remove it from the heap, and fix up the heap.
*/
size = packetasize(p);
-/*ZZZ */
while(lumpcache.avail < size){
trace(TraceLump, "insertlump bump");
CHECK(checklumpcache());
@@ -277,6 +276,15 @@ bumplump(void)
return b;
}
+void
+emptylumpcache(void)
+{
+ qlock(&lumpcache.lock);
+ while(bumplump())
+ ;
+ qunlock(&lumpcache.lock);
+}
+
/*
* delete an arbitrary block from the heap
*/
@@ -415,3 +423,4 @@ checklumpcache(void)
if(lumpcache.nheap + nfree + refed != lumpcache.nblocks)
sysfatal("lc: missing blocks: %d %d %d %d", lumpcache.nheap, refed, nfree, lumpcache.nblocks);
}
+
diff --git a/src/cmd/venti/srv/lumpqueue.c b/src/cmd/venti/srv/lumpqueue.c
index 1b03f41c..869eaeae 100644
--- a/src/cmd/venti/srv/lumpqueue.c
+++ b/src/cmd/venti/srv/lumpqueue.c
@@ -58,22 +58,6 @@ initlumpqueues(int nq)
seterr(EOk, "can't start write queue slave: %r");
return -1;
}
- if(vtproc(queueproc, q) < 0){
- seterr(EOk, "can't start write queue slave: %r");
- return -1;
- }
- if(vtproc(queueproc, q) < 0){
- seterr(EOk, "can't start write queue slave: %r");
- return -1;
- }
- if(vtproc(queueproc, q) < 0){
- seterr(EOk, "can't start write queue slave: %r");
- return -1;
- }
- if(vtproc(queueproc, q) < 0){
- seterr(EOk, "can't start write queue slave: %r");
- return -1;
- }
}
return 0;
diff --git a/src/cmd/venti/srv/mirrorarenas.c b/src/cmd/venti/srv/mirrorarenas.c
new file mode 100644
index 00000000..253b4edb
--- /dev/null
+++ b/src/cmd/venti/srv/mirrorarenas.c
@@ -0,0 +1,464 @@
+/*
+ * Mirror one arena partition onto another.
+ * Be careful to copy only new data.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+Channel *writechan;
+
+typedef struct Write Write;
+struct Write
+{
+ uchar *p;
+ int n;
+ uvlong o;
+ int error;
+};
+
+Part *src;
+Part *dst;
+int force;
+int verbose;
+char *status;
+uvlong astart, aend;
+
+void
+usage(void)
+{
+ fprint(2, "usage: mirrorarenas [-v] src dst [ranges]\n");
+ threadexitsall("usage");
+}
+
+int
+ereadpart(Part *p, u64int offset, u8int *buf, u32int count)
+{
+ if(readpart(p, offset, buf, count) != count){
+ print("%T readpart %s at %#llux+%ud: %r\n", p->name, offset, count);
+ return -1;
+ }
+ return 0;
+}
+
+int
+ewritepart(Part *p, u64int offset, u8int *buf, u32int count)
+{
+ if(writepart(p, offset, buf, count) != count){
+ print("%T writepart %s at %#llux+%ud: %r\n", p->name, offset, count);
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Extra proc to do writes to dst, so that we can overlap reading
+ * src with writing dst during copy. This is an easy factor of two
+ * (almost) in performance.
+ */
+static void
+writeproc(void *v)
+{
+ Write *w;
+
+ USED(v);
+ while((w = recvp(writechan)) != nil){
+ if(w->n == 0)
+ continue;
+ if(ewritepart(dst, w->o, w->p, w->n) < 0)
+ w->error = 1;
+ }
+}
+
+int
+copy(uvlong start, uvlong end, char *what, DigestState *ds)
+{
+ int i, n;
+ uvlong o;
+ static uchar tmp[2][1024*1024];
+ Write w[2];
+
+ assert(start <= end);
+ assert(astart <= start && start < aend);
+ assert(astart <= end && end <= aend);
+
+ if(verbose && start != end)
+ print("%T copy %,llud-%,llud %s\n", start, end, what);
+
+ i = 0;
+ memset(w, 0, sizeof w);
+ for(o=start; o<end; o+=n){
+ if(w[i].error)
+ goto error;
+ n = sizeof tmp[i];
+ if(o+n > end)
+ n = end - o;
+ if(ereadpart(src, o, tmp[i], n) < 0)
+ goto error;
+ w[i].p = tmp[i];
+ w[i].o = o;
+ w[i].n = n;
+ w[i].error = 0;
+ sendp(writechan, &w[i]);
+ if(ds)
+ sha1(tmp[i], n, nil, ds);
+ i = 1-i;
+ }
+ if(w[i].error)
+ goto error;
+
+ /*
+ * wait for queued write to finish
+ */
+ w[i].p = nil;
+ w[i].o = 0;
+ w[i].n = 0;
+ w[i].error = 0;
+ sendp(writechan, &w[i]);
+ i = 1-i;
+ if(w[i].error)
+ return -1;
+ return 0;
+
+error:
+ /*
+ * sync with write proc
+ */
+ w[i].p = nil;
+ w[i].o = 0;
+ w[i].n = 0;
+ w[i].error = 0;
+ sendp(writechan, &w[i]);
+ return -1;
+}
+
+/* single-threaded, for reference */
+int
+copy1(uvlong start, uvlong end, char *what, DigestState *ds)
+{
+ int n;
+ uvlong o;
+ static uchar tmp[1024*1024];
+
+ assert(start <= end);
+ assert(astart <= start && start < aend);
+ assert(astart <= end && end <= aend);
+
+ if(verbose && start != end)
+ print("%T copy %,llud-%,llud %s\n", start, end, what);
+
+ for(o=start; o<end; o+=n){
+ n = sizeof tmp;
+ if(o+n > end)
+ n = end - o;
+ if(ereadpart(src, o, tmp, n) < 0)
+ return -1;
+ if(ds)
+ sha1(tmp, n, nil, ds);
+ if(ewritepart(dst, o, tmp, n) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+int
+asha1(Part *p, uvlong start, uvlong end, DigestState *ds)
+{
+ int n;
+ uvlong o;
+ static uchar tmp[1024*1024];
+
+ if(start == end)
+ return 0;
+ assert(start < end);
+
+ if(verbose)
+ print("%T sha1 %,llud-%,llud\n", start, end);
+
+ for(o=start; o<end; o+=n){
+ n = sizeof tmp;
+ if(o+n > end)
+ n = end - o;
+ if(ereadpart(p, o, tmp, n) < 0)
+ return -1;
+ sha1(tmp, n, nil, ds);
+ }
+ return 0;
+}
+
+uvlong
+rdown(uvlong a, int b)
+{
+ return a-a%b;
+}
+
+uvlong
+rup(uvlong a, int b)
+{
+ if(a%b == 0)
+ return a;
+ return a+b-a%b;
+}
+
+void
+mirror(Arena *sa, Arena *da)
+{
+ vlong v, si, di, end;
+ int clumpmax, blocksize;
+ static uchar buf[MaxIoSize];
+ ArenaHead h;
+ DigestState xds, *ds;
+ vlong shaoff, base;
+
+ base = sa->base;
+ blocksize = sa->blocksize;
+ end = sa->base + sa->size;
+
+ astart = base - blocksize;
+ aend = end + blocksize;
+
+ shaoff = 0;
+
+ if(force){
+ copy(astart, aend, "all", nil);
+ return;
+ }
+
+ if(verbose)
+ print("%T %s (%,llud-%,llud)\n", sa->name, astart, aend);
+
+ if(sa->diskstats.sealed && da->diskstats.sealed && scorecmp(da->score, zeroscore) != 0){
+ if(scorecmp(sa->score, da->score) == 0)
+ return;
+ print("%T arena %s: sealed score mismatch %V vs %V\n", sa->name, sa->score, da->score);
+ status = "errors";
+ return;
+ }
+ if(da->diskstats.sealed && scorecmp(da->score, zeroscore) != 0){
+ print("%T arena %s: dst is sealed, src is not\n", sa->name);
+ status = "errors";
+ return;
+ }
+ if(sa->diskstats.used < da->diskstats.used){
+ print("%T arena %s: src used %,lld < dst used %,lld\n", sa->name, sa->diskstats.used, da->diskstats.used);
+ status = "errors";
+ return;
+ }
+
+ if(da->clumpmagic != sa->clumpmagic){
+ /*
+ * Write this now to reduce the window in which
+ * the head and tail disagree about clumpmagic.
+ */
+ da->clumpmagic = sa->clumpmagic;
+ memset(buf, 0, sizeof buf);
+ packarena(da, buf);
+ if(ewritepart(dst, end, buf, blocksize) < 0)
+ return;
+ }
+
+ memset(&h, 0, sizeof h);
+ h.version = da->version;
+ strcpy(h.name, da->name);
+ h.blocksize = da->blocksize;
+ h.size = da->size + 2*da->blocksize;
+ h.clumpmagic = da->clumpmagic;
+ memset(buf, 0, sizeof buf);
+ packarenahead(&h, buf);
+ if(ewritepart(dst, base - blocksize, buf, blocksize) < 0)
+ return;
+
+ ds = nil;
+ if(sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0){
+ /* start sha1 state with header */
+ memset(&xds, 0, sizeof xds);
+ ds = &xds;
+ sha1(buf, blocksize, nil, ds);
+ shaoff = base;
+ }
+
+ if(sa->diskstats.used != da->diskstats.used){
+ di = base+rdown(da->diskstats.used, blocksize);
+ si = base+rup(sa->diskstats.used, blocksize);
+ if(ds && asha1(dst, shaoff, di, ds) < 0)
+ return;
+ if(copy(di, si, "data", ds) < 0)
+ return;
+ shaoff = si;
+ }
+
+ clumpmax = sa->clumpmax;
+ di = end - da->diskstats.clumps/clumpmax * blocksize;
+ si = end - (sa->diskstats.clumps+clumpmax-1)/clumpmax * blocksize;
+
+ if(sa->diskstats.sealed){
+ /*
+ * might be a small hole between the end of the
+ * data and the beginning of the directory.
+ */
+ v = base+rup(sa->diskstats.used, blocksize);
+ if(ds && asha1(dst, shaoff, v, ds) < 0)
+ return;
+ if(copy(v, si, "hole", ds) < 0)
+ return;
+ shaoff = si;
+ }
+
+ if(da->diskstats.clumps != sa->diskstats.clumps){
+ if(ds && asha1(dst, shaoff, si, ds) < 0)
+ return;
+ if(copy(si, di, "directory", ds) < 0) /* si < di because clumpinfo blocks grow down */
+ return;
+ shaoff = di;
+ }
+
+ da->ctime = sa->ctime;
+ da->wtime = sa->wtime;
+ da->diskstats = sa->diskstats;
+ da->diskstats.sealed = 0;
+
+ memset(buf, 0, sizeof buf);
+ packarena(da, buf);
+ if(ewritepart(dst, end, buf, blocksize) < 0)
+ return;
+
+ if(ds){
+ asha1(dst, shaoff, end, ds);
+ da->diskstats.sealed = 1;
+ memset(buf, 0, sizeof buf);
+ packarena(da, buf);
+ sha1(buf, blocksize, da->score, ds);
+ if(scorecmp(sa->score, da->score) == 0){
+ if(verbose)
+ print("%T arena %s: %V\n", sa->name, da->score);
+ scorecp(buf+blocksize-VtScoreSize, da->score);
+ if(ewritepart(dst, end, buf, blocksize) < 0)
+ return;
+ }else{
+ print("%T arena %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score);
+ memset(&xds, 0, sizeof xds);
+ asha1(dst, base-blocksize, end, &xds);
+ sha1(buf, blocksize, da->score, &xds);
+ print("%T reseal: %V\n", da->score);
+ status = "errors";
+ }
+ }
+}
+
+void
+mirrormany(ArenaPart *sp, ArenaPart *dp, char *range)
+{
+ int i, lo, hi;
+ char *s, *t;
+ Arena *sa, *da;
+
+ if(range == nil){
+ for(i=0; i<sp->narenas; i++){
+ sa = sp->arenas[i];
+ da = dp->arenas[i];
+ mirror(sa, da);
+ }
+ return;
+ }
+ if(strcmp(range, "none") == 0)
+ return;
+
+ for(s=range; *s; s=t){
+ t = strchr(s, ',');
+ if(t)
+ *t++ = 0;
+ else
+ t = s+strlen(s);
+ if(*s == '-')
+ lo = 0;
+ else
+ lo = strtol(s, &s, 0);
+ hi = lo;
+ if(*s == '-'){
+ s++;
+ if(*s == 0)
+ hi = sp->narenas-1;
+ else
+ hi = strtol(s, &s, 0);
+ }
+ if(*s != 0){
+ print("%T bad arena range: %s\n", s);
+ continue;
+ }
+ for(i=lo; i<=hi; i++){
+ sa = sp->arenas[i];
+ da = dp->arenas[i];
+ mirror(sa, da);
+ }
+ }
+}
+
+
+void
+threadmain(int argc, char **argv)
+{
+ int i;
+ Arena *sa, *da;
+ ArenaPart *s, *d;
+ char *ranges;
+
+ ventifmtinstall();
+
+ ARGBEGIN{
+ case 'F':
+ force = 1;
+ break;
+ case 'v':
+ verbose++;
+ break;
+ default:
+ usage();
+ }ARGEND
+
+ if(argc != 2 && argc != 3)
+ usage();
+ ranges = nil;
+ if(argc == 3)
+ ranges = argv[2];
+
+ if((src = initpart(argv[0], OREAD)) == nil)
+ sysfatal("initpart %s: %r", argv[0]);
+ if((dst = initpart(argv[1], ORDWR)) == nil)
+ sysfatal("initpart %s: %r", argv[1]);
+ if((s = initarenapart(src)) == nil)
+ sysfatal("initarenapart %s: %r", argv[0]);
+ for(i=0; i<s->narenas; i++)
+ delarena(s->arenas[i]);
+ if((d = initarenapart(dst)) == nil)
+ sysfatal("loadarenapart %s: %r", argv[1]);
+ for(i=0; i<d->narenas; i++)
+ delarena(d->arenas[i]);
+
+ /*
+ * The arena geometries must match or all bets are off.
+ */
+ if(s->narenas != d->narenas)
+ sysfatal("arena count mismatch: %d vs %d", s->narenas, d->narenas);
+ for(i=0; i<s->narenas; i++){
+ sa = s->arenas[i];
+ da = d->arenas[i];
+ if(sa->version != da->version)
+ sysfatal("arena %d: version mismatch: %d vs %d", i, sa->version, da->version);
+ if(sa->blocksize != da->blocksize)
+ sysfatal("arena %d: blocksize mismatch: %d vs %d", i, sa->blocksize, da->blocksize);
+ if(sa->size != da->size)
+ sysfatal("arena %d: size mismatch: %,lld vs %,lld", i, sa->size, da->size);
+ if(strcmp(sa->name, da->name) != 0)
+ sysfatal("arena %d: name mismatch: %s vs %s", i, sa->name, da->name);
+ }
+
+ /*
+ * Mirror one arena at a time.
+ */
+ writechan = chancreate(sizeof(void*), 0);
+ vtproc(writeproc, nil);
+ mirrormany(s, d, ranges);
+ sendp(writechan, nil);
+ threadexitsall(status);
+}
diff --git a/src/cmd/venti/srv/mkfile b/src/cmd/venti/srv/mkfile
index c50539fc..2f9a89da 100644
--- a/src/cmd/venti/srv/mkfile
+++ b/src/cmd/venti/srv/mkfile
@@ -11,6 +11,7 @@ LIBOFILES=\
config.$O\
conv.$O\
dcache.$O\
+ disksched.$O\
dump.$O\
graph.$O\
httpd.$O\
@@ -52,11 +53,13 @@ TARG=\
fmtbloom\
fmtisect\
fmtindex\
+ fixarenas\
buildindex\
checkarenas\
checkindex\
clumpstats\
findscore\
+ mirrorarenas\
rdarena\
wrarena\
syncindex\
diff --git a/src/cmd/venti/srv/part.c b/src/cmd/venti/srv/part.c
index ff5c98e5..1f8238c4 100644
--- a/src/cmd/venti/srv/part.c
+++ b/src/cmd/venti/srv/part.c
@@ -145,8 +145,6 @@ initpart(char *name, int mode)
if(hi == 0)
hi = dir->length;
part->size = hi - part->offset;
-fprint(2, "part %s: file %s offset %,lld size %,lld\n",
- name, file, part->offset, part->size);
#ifdef CANBLOCKSIZE
{
struct statfs sfs;
@@ -203,10 +201,32 @@ prwb(char *name, int fd, int isread, u64int offset, void *vbuf, u32int count, u3
u32int c, delta, icount, opsize;
int r;
+ icount = count;
buf = vbuf;
+
+#ifndef PLAN9PORT
+ op = isread ? "read" : "write";
+ dst = buf;
+ freetmp = nil;
+ while(count > 0){
+ opsize = min(count, 131072 /* blocksize */);
+ if(isread)
+ r = pread(fd, dst, opsize, offset);
+ else
+ r = pwrite(fd, dst, opsize, offset);
+ if(r <= 0)
+ goto Error;
+ offset += r;
+ count -= r;
+ dst += r;
+ if(r != opsize)
+ goto Error;
+ }
+ return icount;
+#endif
+
tmp = nil;
freetmp = nil;
- icount = count;
opsize = blocksize;
if(count == 0){
@@ -313,7 +333,7 @@ print("FAILED isread=%d r=%d count=%d blocksize=%d\n", isread, r, count, blocksi
memmove(buf, tmp, count);
else{
memmove(tmp, buf, count);
- if(pwrite(fd, tmp, blocksize, offset) != blocksize){
+ if(pwrite(fd, tmp, opsize, offset) != blocksize){
dst = tmp;
op = "write";
goto Error;
@@ -332,9 +352,16 @@ Error:
return -1;
}
+#ifndef PLAN9PORT
+static int sdreset(Part*);
+static int reopen(Part*);
+static int threadspawnl(int[3], char*, char*, ...);
+#endif
+
int
rwpart(Part *part, int isread, u64int offset, u8int *buf, u32int count)
{
+ int n, try;
u32int blocksize;
trace(TraceDisk, "%s %s %ud at 0x%llx",
@@ -351,9 +378,33 @@ rwpart(Part *part, int isread, u64int offset, u8int *buf, u32int count)
if(blocksize == 0)
blocksize = 4096;
- return prwb(part->filename, part->fd, isread, part->offset+offset, buf, count, blocksize);
-}
+ for(try=0;; try++){
+ n = prwb(part->filename, part->fd, isread, part->offset+offset, buf, count, blocksize);
+ if(n >= 0 || try > 10)
+ break;
+#ifndef PLAN9PORT
+ {
+ char err[ERRMAX];
+ /*
+ * This happens with the sdmv disks frustratingly often.
+ * Try to fix things up and continue.
+ */
+ rerrstr(err, sizeof err);
+ if(strstr(err, "i/o timeout") || strstr(err, "i/o error")){
+ if(sdreset(part) >= 0)
+ reopen(part);
+ continue;
+ }else if(strstr(err, "partition has changed")){
+ reopen(part);
+ continue;
+ }
+ }
+#endif
+ break;
+ }
+ return n;
+}
int
readpart(Part *part, u64int offset, u8int *buf, u32int count)
{
@@ -391,3 +442,200 @@ readfile(char *name)
return b;
}
+
+
+
+
+
+
+
+#ifndef PLAN9PORT
+static int
+sdreset(Part *part)
+{
+ char *name, *p;
+ int i, fd, xfd[3], rv;
+ static QLock resetlk;
+ Dir *d, *dd;
+
+ fprint(2, "sdreset %s\n", part->name);
+ name = emalloc(strlen(part->filename)+20);
+ strcpy(name, part->filename);
+ p = strrchr(name, '/');
+ if(p)
+ p++;
+ else
+ p = name;
+
+ strcpy(p, "ctl");
+ d = dirstat(name);
+ if(d == nil){
+ free(name);
+ return -1;
+ }
+
+ /*
+ * We don't need multiple people resetting the disk.
+ */
+ qlock(&resetlk);
+ if((fd = open(name, OWRITE)) < 0)
+ goto error;
+ dd = dirfstat(fd);
+ if(d && dd && d->qid.vers != dd->qid.vers){
+ fprint(2, "sdreset %s: got scooped\n", part->name);
+ /* Someone else got here first. */
+ if(access(part->filename, AEXIST) >= 0)
+ goto ok;
+ goto error;
+ }
+
+ /*
+ * Write "reset" to the ctl file to cause the chipset
+ * to reinitialize itself (specific to sdmv driver).
+ * Ignore error in case using other disk.
+ */
+ fprint(2, "sdreset %s: reset ctl\n", part->name);
+ write(fd, "reset", 5);
+
+ if(access(part->filename, AEXIST) >= 0)
+ goto ok;
+
+ /*
+ * Re-run fdisk and prep. Don't use threadwaitchan
+ * to avoid coordinating for it. Reopen ctl because
+ * we reset the disk.
+ */
+ strcpy(p, "ctl");
+ close(fd);
+ if((fd = open(name, OWRITE)) < 0)
+ goto error;
+ strcpy(p, "data");
+ xfd[0] = open("/dev/null", OREAD);
+ xfd[1] = dup(fd, -1);
+ xfd[2] = dup(2, -1);
+ fprint(2, "sdreset %s: run fdisk %s\n", part->name, name);
+ if(threadspawnl(xfd, "/bin/disk/fdisk", "disk/fdisk", "-p", name, nil) < 0){
+ close(xfd[0]);
+ close(xfd[1]);
+ close(xfd[2]);
+ goto error;
+ }
+ strcpy(p, "plan9");
+ for(i=0; i<=20; i++){
+ sleep(i*100);
+ if(access(part->filename, AEXIST) >= 0)
+ goto ok;
+ if(access(name, AEXIST) >= 0)
+ goto prep;
+ }
+ goto error;
+
+prep:
+ strcpy(p, "ctl");
+ close(fd);
+ if((fd = open(name, OWRITE)) < 0)
+ goto error;
+ strcpy(p, "plan9");
+ xfd[0] = open("/dev/null", OREAD);
+ xfd[1] = dup(fd, -1);
+ xfd[2] = dup(2, -1);
+ fprint(2, "sdreset %s: run prep\n", part->name);
+ if(threadspawnl(xfd, "/bin/disk/prep", "disk/prep", "-p", name, nil) < 0){
+ close(xfd[0]);
+ close(xfd[1]);
+ close(xfd[2]);
+ goto error;
+ }
+ for(i=0; i<=20; i++){
+ sleep(i*100);
+ if(access(part->filename, AEXIST) >= 0)
+ goto ok;
+ }
+
+error:
+ fprint(2, "sdreset %s: error: %r\n", part->name);
+ rv = -1;
+ if(fd >= 0)
+ close(fd);
+ goto out;
+
+ok:
+ fprint(2, "sdreset %s: all okay\n", part->name);
+ rv = 0;
+ goto out;
+
+out:
+ free(name);
+ qunlock(&resetlk);
+ return rv;
+}
+
+static int
+reopen(Part *part)
+{
+ int fd;
+
+ fprint(2, "reopen %s\n", part->filename);
+ if((fd = open(part->filename, ORDWR)) < 0){
+ fprint(2, "reopen %s: %r\n", part->filename);
+ return -1;
+ }
+ if(fd != part->fd){
+ dup(fd, part->fd);
+ close(fd);
+ }
+ return 0;
+}
+
+typedef struct Spawn Spawn;
+struct Spawn
+{
+ Channel *c;
+ int fd[3];
+ char *file;
+ char **argv;
+};
+
+static void
+spawnproc(void *v)
+{
+ int i, *fd;
+ Spawn *s;
+
+ rfork(RFFDG);
+ s = v;
+ fd = s->fd;
+ for(i=0; i<3; i++)
+ dup(fd[i], i);
+ if(fd[0] > 2)
+ close(fd[0]);
+ if(fd[1] > 2 && fd[1] != fd[0])
+ close(fd[1]);
+ if(fd[2] > 2 && fd[2] != fd[1] && fd[2] != fd[0])
+ close(fd[2]);
+ procexec(s->c, s->file, s->argv);
+}
+
+static int
+threadspawnl(int fd[3], char *file, char *argv0, ...)
+{
+ int pid;
+ Spawn s;
+
+ s.c = chancreate(sizeof(void*), 0);
+ memmove(s.fd, fd, sizeof(s.fd));
+ s.file = file;
+ s.argv = &argv0;
+ vtproc(spawnproc, &s);
+ pid = recvul(s.c);
+ if(pid < 0)
+ return -1;
+ close(fd[0]);
+ if(fd[1] != fd[0])
+ close(fd[1]);
+ if(fd[2] != fd[1] && fd[2] != fd[0])
+ close(fd[2]);
+ return pid;
+}
+
+#endif
diff --git a/src/cmd/venti/srv/printarenapart.c b/src/cmd/venti/srv/printarenapart.c
new file mode 100644
index 00000000..25418beb
--- /dev/null
+++ b/src/cmd/venti/srv/printarenapart.c
@@ -0,0 +1,160 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+uchar buf[64*1024];
+
+void
+usage(void)
+{
+ fprint(2, "usage: printarenapart arenafile [offset]\n");
+ threadexitsall("usage");
+}
+
+static void
+rdarena(Arena *arena, u64int offset)
+{
+ u64int a, aa, e;
+ u32int magic;
+ Clump cl;
+ uchar score[VtScoreSize];
+ ZBlock *lump;
+
+ printarena(2, arena);
+
+ a = arena->base;
+ e = arena->base + arena->size;
+ if(offset != ~(u64int)0) {
+ if(offset >= e-a)
+ sysfatal("bad offset %llud >= %llud\n",
+ offset, e-a);
+ aa = offset;
+ } else
+ aa = 0;
+
+ for(; aa < e; aa += ClumpSize+cl.info.size) {
+ magic = clumpmagic(arena, aa);
+ if(magic == ClumpFreeMagic)
+ break;
+ if(magic != arena->clumpmagic) {
+ fprint(2, "illegal clump magic number %#8.8ux offset %llud\n",
+ magic, aa);
+ break;
+ }
+ lump = loadclump(arena, aa, 0, &cl, score, 0);
+ if(lump == nil) {
+ fprint(2, "clump %llud failed to read: %r\n", aa);
+ break;
+ }
+ if(cl.info.type != VtCorruptType) {
+ scoremem(score, lump->data, cl.info.uncsize);
+ if(scorecmp(cl.info.score, score) != 0) {
+ fprint(2, "clump %llud has mismatched score\n", aa);
+ break;
+ }
+ if(vttypevalid(cl.info.type) < 0) {
+ fprint(2, "clump %llud has bad type %d\n", aa, cl.info.type);
+ break;
+ }
+ }
+ print("%22llud %V %3d %5d\n", aa, score, cl.info.type, cl.info.uncsize);
+ freezblock(lump);
+ }
+ print("end offset %llud\n", aa);
+}
+
+void
+threadmain(int argc, char *argv[])
+{
+ char *file, *p, *name;
+ char *table;
+ u64int offset;
+ Part *part;
+ ArenaPart ap;
+ ArenaHead head;
+ Arena tail;
+ char ct[40], mt[40];
+
+ readonly = 1; /* for part.c */
+ ARGBEGIN{
+ default:
+ usage();
+ break;
+ }ARGEND
+
+ switch(argc) {
+ default:
+ usage();
+ case 1:
+ file = argv[0];
+ }
+
+ ventifmtinstall();
+ statsinit();
+
+ part = initpart(file, OREAD|ODIRECT);
+ if(part == nil)
+ sysfatal("can't open file %s: %r", file);
+ if(readpart(part, PartBlank, buf, sizeof buf) < 0)
+ sysfatal("can't read file %s: %r", file);
+
+ if(unpackarenapart(&ap, buf) < 0)
+ sysfatal("corrupted arena part header: %r");
+
+ print("# arena part version=%d blocksize=%d arenabase=%d\n",
+ ap.version, ap.blocksize, ap.arenabase);
+ ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1);
+ ap.tabsize = ap.arenabase - ap.tabbase;
+
+print("A");
+ table = malloc(ap.tabsize+1);
+ if(readpart(part, ap.tabbase, (uchar*)table, ap.tabsize) < 0)
+ sysfatal("read %s: %r", file);
+ table[ap.tabsize] = 0;
+
+print("A");
+ partblocksize(part, ap.blocksize);
+ initdcache(8 * MaxDiskBlock);
+
+print("A");
+/* XXX - read the number of arenas from the first line */
+ for(p=table; p && *p; p=strchr(p, '\n')){
+ if(*p == '\n')
+ p++;
+ name = p;
+ p = strpbrk(p, " \t");
+ if(p == nil){
+ fprint(2, "bad line: %s\n", name);
+ break;
+ }
+print("%p\n", p);
+ offset = strtoull(p, nil, 0);
+ if(readpart(part, offset, buf, sizeof buf) < 0){
+ fprint(2, "%s: read %s: %r\n", argv0, file);
+ continue;
+ }
+ if(unpackarenahead(&head, buf) < 0){
+ fprint(2, "%s: unpackarenahead: %r\n", argv0);
+ continue;
+ }
+ if(readpart(part, offset+head.size-head.blocksize, buf, head.blocksize) < 0){
+ fprint(2, "%s: read %s: %r\n", argv0, file);
+ continue;
+ }
+ if(unpackarena(&tail, buf) < 0){
+ fprint(2, "%s: unpackarena: %r\n", argv0);
+ continue;
+ }
+ print("arena %s %lld clumps=%,d cclumps=%,d used=%,lld uncsize=%,lld%s\n",
+ tail.name, offset,
+ tail.diskstats.clumps, tail.diskstats.cclumps,
+ tail.diskstats.used, tail.diskstats.uncsize,
+ tail.diskstats.sealed ? " sealed" : "");
+ strcpy(ct, ctime(tail.ctime));
+ ct[28] = 0;
+ strcpy(mt, ctime(tail.wtime));
+ mt[28] = 0;
+ print("\tctime=%s\n\tmtime=%s\n", ct, mt);
+ }
+ threadexitsall(0);
+}
diff --git a/src/cmd/venti/srv/printarenas.c b/src/cmd/venti/srv/printarenas.c
index 90c74ccc..111db018 100644
--- a/src/cmd/venti/srv/printarenas.c
+++ b/src/cmd/venti/srv/printarenas.c
@@ -36,7 +36,7 @@ shoulddump(char *name, int argc, char **argv)
enum
{
- ClumpChunks = 32*1024
+ ClumpChunks = 32*1024,
};
void
diff --git a/src/cmd/venti/srv/sortientry.c b/src/cmd/venti/srv/sortientry.c
index 7ed9ba3a..fc5e85e7 100644
--- a/src/cmd/venti/srv/sortientry.c
+++ b/src/cmd/venti/srv/sortientry.c
@@ -61,7 +61,7 @@ sortrawientries(Index *ix, Part *tmp, u64int *base, Bloom *bloom)
u32int n;
int i, ok;
-/*ZZZ should allow configuration of bits, bucket size */
+/* ZZZ should allow configuration of bits, bucket size */
ib = initiebucks(tmp, 8, 64*1024);
if(ib == nil){
seterr(EOk, "can't create sorting buckets: %r");
@@ -116,10 +116,7 @@ readarenainfo(IEBucks *ib, Arena *arena, u64int a, Bloom *b)
ClumpInfo *ci, *cis;
u32int clump;
int i, n, ok, nskip;
-/* static Biobuf bout; */
-/*ZZZ remove fprint? */
-/*fprint(2, "ra %s %d %d\n", arena->name, arena->memstats.clumps, arena->diskstats.clumps); */
if(arena->memstats.clumps)
fprint(2, "\tarena %s: %d entries\n", arena->name, arena->memstats.clumps);
else
@@ -129,7 +126,6 @@ readarenainfo(IEBucks *ib, Arena *arena, u64int a, Bloom *b)
ok = 0;
nskip = 0;
memset(&ie, 0, sizeof(IEntry));
-/* Binit(&bout, 1, OWRITE); */
for(clump = 0; clump < arena->memstats.clumps; clump += n){
n = ClumpChunks;
if(n > arena->memstats.clumps - clump)
@@ -148,18 +144,15 @@ readarenainfo(IEBucks *ib, Arena *arena, u64int a, Bloom *b)
a += ci->size + ClumpSize;
ie.ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
scorecp(ie.score, ci->score);
- /* Bprint(&bout, "%22lld %V %3d %5d\n", */
- /* ie.ia.addr, ie.score, ie.ia.type, ie.ia.size); */
if(ci->type == VtCorruptType){
- /* print("! %V %22lld %3d %5d %3d\n", */
- /* ie.score, ie.ia.addr, ie.ia.type, ie.ia.size, ie.ia.blocks); */
+ if(0) print("! %V %22lld %3d %5d %3d\n",
+ ie.score, ie.ia.addr, ie.ia.type, ie.ia.size, ie.ia.blocks);
nskip++;
}else
sprayientry(ib, &ie);
markbloomfilter(b, ie.score);
}
}
-/* Bterm(&bout); */
free(cis);
if(ok < 0)
return TWID32;
@@ -358,8 +351,8 @@ readiebuck(IEBucks *ib, int b)
m = ib->bucks[b].used;
if(m == 0)
m = ib->usable;
-/* if(ib->bucks[b].total) */
-/* fprint(2, "\tbucket %d: %d entries\n", b, ib->bucks[b].total/IEntrySize); */
+ if(0) if(ib->bucks[b].total)
+ fprint(2, "\tbucket %d: %d entries\n", b, ib->bucks[b].total/IEntrySize);
while(head != TWID32){
if(readpart(ib->part, (u64int)head * ib->size, &ib->buf[n], m+U32Size) < 0){
seterr(EOk, "can't read index sort bucket: %r");
diff --git a/src/cmd/venti/srv/stats.c b/src/cmd/venti/srv/stats.c
index f578860a..874f7d27 100644
--- a/src/cmd/venti/srv/stats.c
+++ b/src/cmd/venti/srv/stats.c
@@ -80,7 +80,7 @@ Statdesc statdesc[NStat] =
{ "isect block write bytes", },
{ "sum reads", },
- { "sum read bytes", }
+ { "sum read bytes", },
};
QLock statslock;
diff --git a/src/cmd/venti/srv/syncarena.c b/src/cmd/venti/srv/syncarena.c
index 7a5d6f9d..89546f55 100644
--- a/src/cmd/venti/srv/syncarena.c
+++ b/src/cmd/venti/srv/syncarena.c
@@ -30,12 +30,11 @@ syncarena(Arena *arena, u64int start, u32int n, int zok, int fix)
ZBlock *lump;
Clump cl;
ClumpInfo ci;
- static ClumpInfo zci = { -1 };
+ static ClumpInfo zci = { .type = -1 };
u8int score[VtScoreSize];
u64int uncsize, used, aa;
u32int clump, clumps, cclumps, magic;
int err, flush, broken;
- AState as;
used = arena->memstats.used;
clumps = arena->memstats.clumps;
@@ -133,19 +132,21 @@ syncarena(Arena *arena, u64int start, u32int n, int zok, int fix)
flushdcache();
}
+fprint(2, "arena %s: start=%lld fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n",
+ arena->name,
+ start,
+ fix,
+ flush,
+ used, arena->memstats.used,
+ clumps, arena->memstats.clumps,
+ cclumps, arena->memstats.cclumps,
+ uncsize, arena->memstats.uncsize);
+
if(used != arena->memstats.used
|| clumps != arena->memstats.clumps
|| cclumps != arena->memstats.cclumps
|| uncsize != arena->memstats.uncsize)
err |= SyncHeader;
- if(start && (err&SyncHeader)){
- trace(TraceProc, "syncarena setdcachestate");
- as.arena = arena;
- as.aa = start+arena->memstats.used;
- as.stats = arena->memstats;
- setdcachestate(&as);
- }
-
return err;
}
diff --git a/src/cmd/venti/srv/syncindex.c b/src/cmd/venti/srv/syncindex.c
index 56bf1527..72d45f18 100644
--- a/src/cmd/venti/srv/syncindex.c
+++ b/src/cmd/venti/srv/syncindex.c
@@ -48,6 +48,8 @@ threadmain(int argc, char *argv[])
ventifmtinstall();
if(initventi(argv[0], &conf) < 0)
sysfatal("can't init venti: %r");
+ if(mainindex->bloom && loadbloom(mainindex->bloom) < 0)
+ sysfatal("can't load bloom filter: %r");
if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16))
bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16);
diff --git a/src/cmd/venti/srv/syncindex0.c b/src/cmd/venti/srv/syncindex0.c
index 12b69ed2..e214d712 100644
--- a/src/cmd/venti/srv/syncindex0.c
+++ b/src/cmd/venti/srv/syncindex0.c
@@ -121,6 +121,7 @@ int
syncindex(Index *ix, int fix, int mustflush, int check)
{
Arena *arena;
+ AState as;
u64int a;
u32int clump;
int i, e, e1, ok, ok1, flush;
@@ -130,7 +131,12 @@ syncindex(Index *ix, int fix, int mustflush, int check)
for(i = 0; i < ix->narenas; i++){
trace(TraceProc, "syncindex start %d", i);
arena = ix->arenas[i];
- clump = arena->memstats.clumps;
+ /*
+ * Syncarena will scan through the arena looking for blocks
+ * that have been forgotten. It will update arena->memstats.used,
+ * so save the currenct copy as the place to start the
+ * syncarenaindex scan.
+ */
a = arena->memstats.used;
e = syncarena(arena, ix->amap[i].start, TWID32, fix, fix);
e1 = e;
@@ -138,15 +144,23 @@ syncindex(Index *ix, int fix, int mustflush, int check)
e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr);
if(e1 == SyncHeader)
fprint(2, "arena %s: header is out-of-date\n", arena->name);
+ clump = arena->diskstats.clumps;
if(e1)
ok = -1;
else{
ok1 = syncarenaindex(ix, arena, clump, a + ix->amap[i].start, fix, &flush, check);
if(ok1 < 0)
fprint(2, "syncarenaindex: %r\n");
+fprint(2, "arena %s: wbarena in syncindex\n", arena->name);
if(fix && ok1==0 && (e & SyncHeader) && wbarena(arena) < 0)
fprint(2, "arena=%s header write failed: %r\n", arena->name);
ok |= ok1;
+
+fprint(2, "arena %s: setdcachestate\n", arena->name);
+ as.arena = arena;
+ as.aa = ix->amap[i].start + arena->memstats.used;
+ as.stats = arena->memstats;
+ setdcachestate(&as);
}
}
if(missing || wrong)
diff --git a/src/cmd/venti/srv/unwhack.c b/src/cmd/venti/srv/unwhack.c
index 587046cc..5530bd07 100644
--- a/src/cmd/venti/srv/unwhack.c
+++ b/src/cmd/venti/srv/unwhack.c
@@ -23,7 +23,7 @@ static uchar lenval[1 << (DBigLenBits - 1)] =
static uchar lenbits[] =
{
0, 0, 0,
- 2, 3, 5, 5
+ 2, 3, 5, 5,
};
static uchar offbits[16] =
diff --git a/src/cmd/venti/srv/utils.c b/src/cmd/venti/srv/utils.c
index 03fd9065..0fd0f04f 100644
--- a/src/cmd/venti/srv/utils.c
+++ b/src/cmd/venti/srv/utils.c
@@ -148,6 +148,7 @@ emalloc(ulong n)
sysfatal("out of memory allocating %lud", n);
}
memset(p, 0xa5, n);
+ setmalloctag(p, getcallerpc(&n));
if(0)print("emalloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&n));
return p;
}
@@ -164,6 +165,7 @@ ezmalloc(ulong n)
sysfatal("out of memory allocating %lud", n);
}
memset(p, 0, n);
+ setmalloctag(p, getcallerpc(&n));
if(0)print("ezmalloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&n));
return p;
}
@@ -177,6 +179,7 @@ erealloc(void *p, ulong n)
abort();
sysfatal("out of memory allocating %lud", n);
}
+ setrealloctag(p, getcallerpc(&p));
if(0)print("erealloc %p-%p by %lux\n", p, (char*)p+n, getcallerpc(&p));
return p;
}
@@ -190,6 +193,7 @@ estrdup(char *s)
n = strlen(s) + 1;
t = emalloc(n);
memmove(t, s, n);
+ setmalloctag(t, getcallerpc(&s));
if(0)print("estrdup %p-%p by %lux\n", t, (char*)t+n, getcallerpc(&s));
return t;
}
@@ -231,6 +235,7 @@ ventifmtinstall(void)
fmtinstall('F', vtfcallfmt);
fmtinstall('H', encodefmt);
fmtinstall('I', ientryfmt);
+ fmtinstall('T', vttimefmt);
fmtinstall('V', vtscorefmt);
}
diff --git a/src/cmd/venti/srv/venti.c b/src/cmd/venti/srv/venti.c
index 1e924aeb..e9ca0536 100644
--- a/src/cmd/venti/srv/venti.c
+++ b/src/cmd/venti/srv/venti.c
@@ -105,6 +105,8 @@ threadmain(int argc, char *argv[])
fprint(2, "conf...");
if(initventi(configfile, &config) < 0)
sysfatal("can't init server: %r");
+ if(mainindex->bloom && loadbloom(mainindex->bloom) < 0)
+ sysfatal("can't load bloom filter: %r");
if(mem == 0)
mem = config.mem;
@@ -210,8 +212,8 @@ ventiserver(void *v)
trace(TraceRpc, "<- %F", &r->tx);
r->rx.msgtype = r->tx.msgtype+1;
addstat(StatRpcTotal, 1);
- /* print("req (arenas[0]=%p sects[0]=%p) %F\n", */
- /* mainindex->arenas[0], mainindex->sects[0], &r->tx); */
+ if(0) print("req (arenas[0]=%p sects[0]=%p) %F\n",
+ mainindex->arenas[0], mainindex->sects[0], &r->tx);
switch(r->tx.msgtype){
default:
vtrerror(r, "unknown request");
diff --git a/src/cmd/venti/srv/verifyarena.c b/src/cmd/venti/srv/verifyarena.c
index 5236c093..2cdb7ba0 100644
--- a/src/cmd/venti/srv/verifyarena.c
+++ b/src/cmd/venti/srv/verifyarena.c
@@ -3,65 +3,102 @@
#include "fns.h"
static int verbose;
+static int fd;
+static uchar *data;
+static int blocksize;
+static int sleepms;
void
usage(void)
{
- fprint(2, "usage: verifyarena [-v]\n");
+ fprint(2, "usage: verifyarena [-b blocksize] [-s ms] [-v] [arenapart [name...]]\n");
threadexitsall(0);
}
-static void
+static int
+preadblock(uchar *buf, int n, vlong off)
+{
+ int nr, m;
+
+ for(nr = 0; nr < n; nr += m){
+ m = n - nr;
+ m = pread(fd, &buf[nr], m, off+nr);
+ if(m <= 0){
+ if(m == 0)
+ werrstr("early eof");
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int
readblock(uchar *buf, int n)
{
int nr, m;
for(nr = 0; nr < n; nr += m){
m = n - nr;
- m = read(0, &buf[nr], m);
- if(m <= 0)
- sysfatal("can't read arena from standard input: %r");
+ m = read(fd, &buf[nr], m);
+ if(m <= 0){
+ if(m == 0)
+ werrstr("early eof");
+ return -1;
+ }
}
+ return 0;
}
static void
-verifyarena(void)
+verifyarena(char *name, vlong len)
{
Arena arena;
ArenaHead head;
- ZBlock *b;
DigestState s;
u64int n, e;
u32int bs;
u8int score[VtScoreSize];
- fprint(2, "verify arena from standard input\n");
+ fprint(2, "verify %s\n", name);
memset(&arena, 0, sizeof arena);
memset(&s, 0, sizeof s);
/*
- * read the little bit, which will included the header
+ * read a little bit, which will include the header
*/
- bs = MaxIoSize;
- b = alloczblock(bs, 0, 0);
- readblock(b->data, HeadSize);
- sha1(b->data, HeadSize, nil, &s);
- if(unpackarenahead(&head, b->data) < 0)
- sysfatal("corrupted arena header: %r");
+ if(readblock(data, HeadSize) < 0){
+ fprint(2, "%s: reading header: %r\n", name);
+ return;
+ }
+ sha1(data, HeadSize, nil, &s);
+ if(unpackarenahead(&head, data) < 0){
+ fprint(2, "%s: corrupt arena header: %r\n", name);
+ return;
+ }
if(head.version != ArenaVersion4 && head.version != ArenaVersion5)
- fprint(2, "warning: unknown arena version %d\n", head.version);
+ fprint(2, "%s: warning: unknown arena version %d\n", name, head.version);
+ if(len != 0 && len != head.size)
+ fprint(2, "%s: warning: unexpected length %lld != %lld\n", name, head.size, len);
+ if(strcmp(name, "<stdin>") != 0 && strcmp(head.name, name) != 0)
+ fprint(2, "%s: warning: unexpected name %s\n", name, head.name);
/*
* now we know how much to read
* read everything but the last block, which is special
*/
e = head.size - head.blocksize;
+ bs = blocksize;
for(n = HeadSize; n < e; n += bs){
if(n + bs > e)
bs = e - n;
- readblock(b->data, bs);
- sha1(b->data, bs, nil, &s);
+ if(readblock(data, bs) < 0){
+ fprint(2, "%s: read data: %r\n", name);
+ return;
+ }
+ sha1(data, bs, nil, &s);
+ if(sleepms)
+ sleep(sleepms);
}
/*
@@ -69,8 +106,11 @@ verifyarena(void)
* the sum is calculated assuming the slot for the sum is zero.
*/
bs = head.blocksize;
- readblock(b->data, bs);
- sha1(b->data, bs-VtScoreSize, nil, &s);
+ if(readblock(data, bs) < 0){
+ fprint(2, "%s: read last block: %r\n", name);
+ return;
+ }
+ sha1(data, bs-VtScoreSize, nil, &s);
sha1(zeroscore, VtScoreSize, nil, &s);
sha1(nil, 0, score, &s);
@@ -78,37 +118,73 @@ verifyarena(void)
* validity check on the trailer
*/
arena.blocksize = head.blocksize;
- if(unpackarena(&arena, b->data) < 0)
- sysfatal("corrupted arena trailer: %r");
- scorecp(arena.score, &b->data[arena.blocksize - VtScoreSize]);
-
- if(namecmp(arena.name, head.name) != 0)
- sysfatal("arena header and trailer names clash: %s vs. %s\n", head.name, arena.name);
- if(arena.version != head.version)
- sysfatal("arena header and trailer versions clash: %d vs. %d\n", head.version, arena.version);
+ if(unpackarena(&arena, data) < 0){
+ fprint(2, "%s: corrupt arena trailer: %r\n", name);
+ return;
+ }
+ scorecp(arena.score, &data[arena.blocksize - VtScoreSize]);
+
+ if(namecmp(arena.name, head.name) != 0){
+ fprint(2, "%s: wrong name in trailer: %s vs. %s\n",
+ name, head.name, arena.name);
+ return;
+ }
+ if(arena.version != head.version){
+ fprint(2, "%s: wrong version in trailer: %d vs. %d\n",
+ name, head.version, arena.version);
+ return;
+ }
arena.size = head.size - 2 * head.blocksize;
/*
* check for no checksum or the same
*/
- if(scorecmp(score, arena.score) != 0){
- if(scorecmp(zeroscore, arena.score) != 0)
- fprint(2, "warning: mismatched checksums for arena=%s, found=%V calculated=%V",
- arena.name, arena.score, score);
- scorecp(arena.score, score);
- }else
- fprint(2, "matched score\n");
-
+ if(scorecmp(score, arena.score) == 0)
+ fprint(2, "%s: verified score\n", name);
+ else if(scorecmp(zeroscore, arena.score) == 0)
+ fprint(2, "%s: unsealed\n", name);
+ else{
+ fprint(2, "%s: mismatch checksum - found=%V calculated=%V\n",
+ name, arena.score, score);
+ return;
+ }
printarena(2, &arena);
}
+static int
+shouldcheck(char *name, char **s, int n)
+{
+ int i;
+
+ if(n == 0)
+ return 1;
+
+ for(i=0; i<n; i++){
+ if(s[i] && strcmp(name, s[i]) == 0){
+ s[i] = nil;
+ return 1;
+ }
+ }
+ return 0;
+}
+
void
threadmain(int argc, char *argv[])
{
+ int i, nline;
+ char *p, *q, *table, *f[10], line[256];
+ vlong start, stop;
+ ArenaPart ap;
+
ventifmtinstall();
- statsinit();
-
+ blocksize = MaxIoSize;
ARGBEGIN{
+ case 'b':
+ blocksize = unittoull(EARGF(usage()));
+ break;
+ case 's':
+ sleepms = atoi(EARGF(usage()));
+ break;
case 'v':
verbose++;
break;
@@ -117,11 +193,69 @@ threadmain(int argc, char *argv[])
break;
}ARGEND
- readonly = 1;
+ data = vtmalloc(blocksize);
+ if(argc == 0){
+ fd = 0;
+ verifyarena("<stdin>", 0);
+ threadexitsall(nil);
+ }
+
+ if((fd = open(argv[0], OREAD)) < 0)
+ sysfatal("open %s: %r", argv[0]);
- if(argc != 0)
- usage();
+ if(preadblock(data, 8192, PartBlank) < 0)
+ sysfatal("read arena part header: %r");
+ if(unpackarenapart(&ap, data) < 0)
+ sysfatal("corrupted arena part header: %r");
+ fprint(2, "# arena part version=%d blocksize=%d arenabase=%d\n",
+ ap.version, ap.blocksize, ap.arenabase);
+ ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1);
+ ap.tabsize = ap.arenabase - ap.tabbase;
+ table = malloc(ap.tabsize+1);
+ if(preadblock((uchar*)table, ap.tabsize, ap.tabbase) < 0)
+ sysfatal("reading arena part directory: %r");
+ table[ap.tabsize] = 0;
+
+ nline = atoi(table);
+ p = strchr(table, '\n');
+ if(p)
+ p++;
+ for(i=0; i<nline; i++){
+ if(p == nil){
+ fprint(2, "warning: unexpected arena table end\n");
+ break;
+ }
+ q = strchr(p, '\n');
+ if(q)
+ *q++ = 0;
+ if(strlen(p) >= sizeof line){
+ fprint(2, "warning: long arena table line: %s\n", p);
+ p = q;
+ continue;
+ }
+ strcpy(line, p);
+ memset(f, 0, sizeof f);
+ if(tokenize(line, f, nelem(f)) < 3){
+ fprint(2, "warning: bad arena table line: %s\n", p);
+ p = q;
+ continue;
+ }
+ p = q;
+ if(shouldcheck(f[0], argv+1, argc-1)){
+ start = strtoull(f[1], 0, 0);
+ stop = strtoull(f[2], 0, 0);
+ if(stop <= start){
+ fprint(2, "%s: bad start,stop %lld,%lld\n", f[0], stop, start);
+ continue;
+ }
+ if(seek(fd, start, 0) < 0)
+ fprint(2, "%s: seek to start: %r\n", f[0]);
+ verifyarena(f[0], stop - start);
+ }
+ }
+ for(i=1; i<argc; i++)
+ if(argv[i] != 0)
+ fprint(2, "%s: did not find arena\n", argv[i]);
- verifyarena();
- threadexitsall(0);
+ threadexitsall(nil);
}
diff --git a/src/cmd/venti/srv/wrarena.c b/src/cmd/venti/srv/wrarena.c
index 8a2e9299..a0d57f2e 100644
--- a/src/cmd/venti/srv/wrarena.c
+++ b/src/cmd/venti/srv/wrarena.c
@@ -83,8 +83,8 @@ rdarena(Arena *arena, u64int offset)
if(magic == ClumpFreeMagic)
break;
if(magic != arena->clumpmagic) {
- /* fprint(2, "illegal clump magic number %#8.8ux offset %llud\n", */
- /* magic, aa); */
+ if(0) fprint(2, "illegal clump magic number %#8.8ux offset %llud\n",
+ magic, aa);
break;
}
lump = loadclump(arena, aa, 0, &cl, score, 0);
diff --git a/src/cmd/venti/srv/zblock.c b/src/cmd/venti/srv/zblock.c
index 4cc96dd4..4aa11f45 100644
--- a/src/cmd/venti/srv/zblock.c
+++ b/src/cmd/venti/srv/zblock.c
@@ -5,11 +5,13 @@
void
fmtzbinit(Fmt *f, ZBlock *b)
{
- memset(f, 0, sizeof *f);
- fmtlocaleinit(f, nil, nil, nil);
+ f->runes = 0;
f->start = b->data;
f->to = f->start;
f->stop = (char*)f->start + b->len;
+ f->flush = nil;
+ f->farg = nil;
+ f->nfmt = 0;
}
#define ROUNDUP(p, n) ((void*)(((uintptr)(p)+(n)-1)&~(uintptr)((n)-1)))
diff --git a/src/cmd/venti/srv/zeropart.c b/src/cmd/venti/srv/zeropart.c
index 70c25b73..7602627c 100644
--- a/src/cmd/venti/srv/zeropart.c
+++ b/src/cmd/venti/srv/zeropart.c
@@ -10,10 +10,6 @@ zeropart(Part *part, int blocksize)
int w;
fprint(2, "clearing the partition\n");
-/*fprint(2, "NOT!\n"); */
-/*return; */
-/*b=alloczblock(MaxIoSize, 1, blocksize); */
-/*freezblock(b); */
b = alloczblock(MaxIoSize, 1, blocksize);
w = 0;