aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/cmd/venti/buildbuck.c2
-rw-r--r--src/cmd/venti/buildindex.c12
-rw-r--r--src/cmd/venti/checkindex.c9
-rw-r--r--src/cmd/venti/clump.c2
-rw-r--r--src/cmd/venti/conv.c4
-rw-r--r--src/cmd/venti/dat.h15
-rw-r--r--src/cmd/venti/dcache.c34
-rw-r--r--src/cmd/venti/fns.h2
-rw-r--r--src/cmd/venti/httpd.c17
-rw-r--r--src/cmd/venti/icache.c1
-rw-r--r--src/cmd/venti/index.c700
-rw-r--r--src/cmd/venti/lump.c1
-rw-r--r--src/cmd/venti/mkfile4
-rw-r--r--src/cmd/venti/part.c6
-rw-r--r--src/cmd/venti/venti.c4
15 files changed, 450 insertions, 363 deletions
diff --git a/src/cmd/venti/buildbuck.c b/src/cmd/venti/buildbuck.c
index 4232bb47..e5aed260 100644
--- a/src/cmd/venti/buildbuck.c
+++ b/src/cmd/venti/buildbuck.c
@@ -80,7 +80,7 @@ buildbucket(Index *ix, IEStream *ies, IBucket *ib)
buck = TWID32;
ib->n = 0;
- ib->next = 0;
+ ib->depth = 0;
while(ies->n){
b = peekientry(ies);
if(b == nil)
diff --git a/src/cmd/venti/buildindex.c b/src/cmd/venti/buildindex.c
index 952e75dd..8058ba09 100644
--- a/src/cmd/venti/buildindex.c
+++ b/src/cmd/venti/buildindex.c
@@ -7,15 +7,9 @@ writebucket(Index *ix, u32int buck, IBucket *ib, ZBlock *b)
{
ISect *is;
- is = findisect(ix, buck);
- if(is == nil){
- seterr(EAdmin, "bad math in writebucket");
+ is = findibucket(ix, buck, &buck);
+ if(is == nil)
return -1;
- }
- if(buck < is->start || buck >= is->stop)
- seterr(EAdmin, "index write out of bounds: %d not in [%d,%d)\n",
- buck, is->start, is->stop);
- buck -= is->start;
qlock(&stats.lock);
stats.indexwrites++;
qunlock(&stats.lock);
@@ -47,7 +41,7 @@ buildindex(Index *ix, Part *part, u64int off, u64int clumps, int zero)
ib.data = b->data + IBucketSize;
zib.data = z->data + IBucketSize;
zib.n = 0;
- zib.next = 0;
+ zib.depth = 0;
for(;;){
buck = buildbucket(ix, ies, &ib);
found += ib.n;
diff --git a/src/cmd/venti/checkindex.c b/src/cmd/venti/checkindex.c
index fa6f5efc..34edb370 100644
--- a/src/cmd/venti/checkindex.c
+++ b/src/cmd/venti/checkindex.c
@@ -11,12 +11,7 @@ checkbucket(Index *ix, u32int buck, IBucket *ib)
IEntry ie, eie;
int i, ei, ok, c;
- is = findisect(ix, buck);
- if(is == nil){
- seterr(EAdmin, "bad math in checkbuckets");
- return -1;
- }
- buck -= is->start;
+ is = findibucket(ix, buck, &buck);
eb = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
if(eb == nil)
return -1;
@@ -87,7 +82,7 @@ u64int found = 0;
ib.data = b->data;
zib.data = z->data;
zib.n = 0;
- zib.next = 0;
+ zib.depth = 0;
for(;;){
buck = buildbucket(ix, ies, &ib);
found += ib.n;
diff --git a/src/cmd/venti/clump.c b/src/cmd/venti/clump.c
index 272d7aec..33f5950a 100644
--- a/src/cmd/venti/clump.c
+++ b/src/cmd/venti/clump.c
@@ -60,7 +60,7 @@ if(0)print("whackedblock %08x %p\n", mainindex->arenas[0], &cl);
a = writeiclump(ix, &cl, cb->data);
freezblock(cb);
- if(a == 0)
+ if(a == TWID64)
return -1;
qlock(&stats.lock);
diff --git a/src/cmd/venti/conv.c b/src/cmd/venti/conv.c
index ae89baa7..4688b076 100644
--- a/src/cmd/venti/conv.c
+++ b/src/cmd/venti/conv.c
@@ -488,7 +488,7 @@ void
unpackibucket(IBucket *b, u8int *buf)
{
b->n = U16GET(buf);
- b->next = U32GET(&buf[U16Size]);
+ b->depth = U32GET(&buf[U16Size]);
b->data = buf + IBucketSize;
}
@@ -496,5 +496,5 @@ void
packibucket(IBucket *b, u8int *buf)
{
U16PUT(buf, b->n);
- U32PUT(&buf[U16Size], b->next);
+ U32PUT(&buf[U16Size], b->depth);
}
diff --git a/src/cmd/venti/dat.h b/src/cmd/venti/dat.h
index d6d7d1b0..49c8b38b 100644
--- a/src/cmd/venti/dat.h
+++ b/src/cmd/venti/dat.h
@@ -79,7 +79,8 @@ enum
ArenaPartVersion = 3,
ArenaVersion = 4,
- IndexVersion = 1,
+ IndexVersion1 = 1,
+ IndexVersion2 = 2,
ISectVersion = 1,
/*
@@ -116,11 +117,14 @@ enum
MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
/*
- * dirty flags
+ * dirty flags - order controls disk write order
*/
DirtyArena = 1,
+ DirtyIndexSplit,
DirtyIndex,
+ DirtyIndexBitmap,
DirtyArenaCib,
+ DirtyMax,
VentiZZZZZZZZ
};
@@ -367,6 +371,11 @@ struct Index
u32int buckets; /* last bucket used in disk hash table */
u32int blocksize;
u32int tabsize; /* max. bytes in index config */
+ u32int bitblocks;
+ u32int maxdepth;
+ u32int bitkeylog;
+ u32int bitkeymask;
+
int mapalloc; /* first arena to check when adding a lump */
Arena **arenas; /* arenas in the mapping */
ISect **sects; /* sections which hold the buckets */
@@ -440,7 +449,7 @@ struct IEntry
struct IBucket
{
u16int n; /* number of active indices */
- u32int next; /* overflow bucket */
+ u32int depth; /* depth in version 2 (was overflow in v1) */
u8int *data;
};
diff --git a/src/cmd/venti/dcache.c b/src/cmd/venti/dcache.c
index c99e79b3..dcb47bcf 100644
--- a/src/cmd/venti/dcache.c
+++ b/src/cmd/venti/dcache.c
@@ -230,7 +230,6 @@ dirtydblock(DBlock *b, int dirty)
int odirty;
Part *p;
-fprint(2, "dirty %p\n", b);
rlock(&dcache.dirtylock);
assert(b->ref != 0);
assert(b->dirtying == 0);
@@ -242,8 +241,16 @@ fprint(2, "dirty %p\n", b);
stats.dirtydblocks++;
qunlock(&stats.lock);
+ /*
+ * In general, shouldn't mark any block as more than one
+ * type, except that split index blocks are a subcase of index
+ * blocks. Only clean blocks ever get marked DirtyIndexSplit,
+ * though, so we don't need the opposite conjunction here.
+ */
if(b->dirty)
- assert(b->dirty == dirty);
+ assert(b->dirty == dirty
+ || (b->dirty==DirtyIndexSplit && dirty==DirtyIndex));
+
odirty = b->dirty;
b->dirty = dirty;
p = b->part;
@@ -533,7 +540,7 @@ flushtimerproc(void *v)
static void
flushproc(void *v)
{
- int i, n;
+ int i, j, n;
DBlock *b, **write;
USED(v);
@@ -575,25 +582,10 @@ flushproc(void *v)
qsort(write, n, sizeof(write[0]), writeblockcmp);
- /*
- * At the beginning of the array are the arena blocks.
- */
- fprint(2, "flushproc: write arena blocks\n");
+ /* Write each stage of blocks out. */
i = 0;
- i += parallelwrites(write+i, write+n, DirtyArena);
-
- /*
- * Next are the index blocks.
- */
- fprint(2, "flushproc: write index blocks\n");
- i += parallelwrites(write+i, write+n, DirtyIndex);
-
- /*
- * Finally, the arena clump info blocks.
- */
- fprint(2, "flushproc: write cib blocks\n");
- i += parallelwrites(write+i, write+n, DirtyArenaCib);
-
+ for(j=1; j<DirtyMax; j++)
+ i += parallelwrites(write+i, write+n, j);
assert(i == n);
fprint(2, "flushproc: update dirty bits\n");
diff --git a/src/cmd/venti/fns.h b/src/cmd/venti/fns.h
index b3d158f6..f8274397 100644
--- a/src/cmd/venti/fns.h
+++ b/src/cmd/venti/fns.h
@@ -20,7 +20,6 @@ void *erealloc(void *, ulong);
char *estrdup(char*);
void *ezmalloc(ulong);
Arena *findarena(char *name);
-ISect *findisect(Index *ix, u32int buck);
int flushciblocks(Arena *arena);
void flushdcache(void);
void flushqueue(void);
@@ -57,6 +56,7 @@ int initventi(char *config);
void insertlump(Lump *lump, Packet *p);
int insertscore(u8int *score, IAddr *ia, int write);
ZBlock *loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify);
+DBlock *loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucket *ib);
int loadientry(Index *index, u8int *score, int type, IEntry *ie);
void logerr(int severity, char *fmt, ...);
Lump *lookuplump(u8int *score, int type);
diff --git a/src/cmd/venti/httpd.c b/src/cmd/venti/httpd.c
index 8215ccb6..859c106e 100644
--- a/src/cmd/venti/httpd.c
+++ b/src/cmd/venti/httpd.c
@@ -137,14 +137,12 @@ httpproc(void *v)
c = v;
for(t = 15*60*1000; ; t = 15*1000){
-fprint(2, "httpd: get headers\n");
if(hparsereq(c, t) < 0)
break;
ok = -1;
for(i = 0; i < MaxObjs && objs[i].name[0]; i++){
if(strcmp(c->req.uri, objs[i].name) == 0){
-fprint(2, "httpd: call function %p\n", objs[i].f);
ok = (*objs[i].f)(c);
break;
}
@@ -158,9 +156,7 @@ fprint(2, "httpd: call function %p\n", objs[i].f);
if(ok < 0)
break;
}
-print("httpd cleanup %d\n", c->hin.fd);
hreqcleanup(c);
-print("close %d\n", c->hin.fd);
close(c->hin.fd);
free(c);
}
@@ -239,12 +235,9 @@ estats(HConnect *c)
r = preqtext(c);
if(r < 0)
-{
-fprint(2, "preqtext failed\n");
return r;
-}
-fprint(2, "write stats\n");
+
hout = &c->hout;
hprint(hout, "lump writes=%,ld\n", stats.lumpwrites);
hprint(hout, "lump reads=%,ld\n", stats.lumpreads);
@@ -277,21 +270,21 @@ fprint(2, "write stats\n");
hprint(hout, "disk cache misses=%,ld\n", stats.pcmiss);
hprint(hout, "disk cache reads=%,ld\n", stats.pcreads);
hprint(hout, "disk cache bytes read=%,lld\n", stats.pcbreads);
-fprint(2, "write new stats\n");
+
hprint(hout, "disk cache writes=%,ld\n", stats.dirtydblocks);
hprint(hout, "disk cache writes absorbed=%,ld %d%%\n", stats.absorbedwrites,
percent(stats.absorbedwrites, stats.dirtydblocks));
-fprint(2, "back to old stats\n");
+ hprint(hout, "disk cache flushes=%,ld\n", stats.dcacheflushes);
+ hprint(hout, "disk cache flush writes=%,ld (%,ld per flush)\n",
+ stats.dcacheflushwrites, stats.dcacheflushwrites/stats.dcacheflushes);
hprint(hout, "disk writes=%,ld\n", stats.diskwrites);
hprint(hout, "disk bytes written=%,lld\n", stats.diskbwrites);
hprint(hout, "disk reads=%,ld\n", stats.diskreads);
hprint(hout, "disk bytes read=%,lld\n", stats.diskbreads);
-fprint(2, "hflush stats\n");
hflush(hout);
-fprint(2, "done with stats\n");
return 0;
}
diff --git a/src/cmd/venti/icache.c b/src/cmd/venti/icache.c
index 04f1134e..fc1d32e7 100644
--- a/src/cmd/venti/icache.c
+++ b/src/cmd/venti/icache.c
@@ -58,7 +58,6 @@ lookupscore(u8int *score, int type, IAddr *ia, int *rac)
IEntry d, *ie, *last;
u32int h;
-fprint(2, "lookupscore %V %d\n", score, type);
qlock(&stats.lock);
stats.iclookups++;
qunlock(&stats.lock);
diff --git a/src/cmd/venti/index.c b/src/cmd/venti/index.c
index 7fc15fd2..168b13f6 100644
--- a/src/cmd/venti/index.c
+++ b/src/cmd/venti/index.c
@@ -1,101 +1,110 @@
/*
- * Index, mapping scores to log positions. The log data mentioned in
- * the index _always_ goes out to disk before the index blocks themselves.
- * A counter in the arena tail records which logged blocks have been
- * successfully indexed. The ordering of dirtydcache calls along with
- * the flags passed to dirtydcache ensure the proper write ordering.
+ * Index, mapping scores to log positions.
+ *
+ * The index is made up of some number of index sections, each of
+ * which is typically stored on a different disk. The blocks in all the
+ * index sections are logically numbered, with each index section
+ * responsible for a range of blocks. Blocks are typically 8kB.
+ *
+ * Index Version 1:
*
- * For historical reasons, there are two indexing schemes. In both,
- * the index is broken up into some number of fixed-size (say, 8kB)
- * buckets holding index entries. An index entry is about 40 bytes.
- * The index can be spread across many disks, although in small
- * configurations it is not uncommon for the index and arenas to be
- * on the same disk.
- * *
- * In the first scheme, the many buckets are treated as a giant on-disk
- * hash table. If there are N buckets, then the top 32 bits of the
- * score are used as an index into the hash table, with each bucket
- * holding 2^32 / N of the index space. The index must be sized so
- * that a bucket can't ever overflow. Assuming that a typical compressed
- * data block is about 4000 bytes, the index size is expected to be
- * about 1% of the total data size. Since scores are essentially
- * random, they will be distributed evenly among the buckets, so all
- * buckets should be about the same fullness. A factor of 5 gives us
- * a wide comfort boundary, so the index storage is suggested to be
- * 5% of the total data storage.
+ * The N index blocks are treated as a giant hash table. The top 32 bits
+ * of score are used as the key for a lookup. Each index block holds
+ * one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
*
- * Unfortunately, this very sparse index does not make good use of the
- * disk -- most of it is empty, and disk reads, which are costly because
- * of the random seek to get to an arbitrary bucket, tend to bring in
- * only a few entries, making them hardly cost effective. The second
- * scheme is a variation on the first scheme that tries to lay out the
- * index in a denser format on the disk. In this scheme, the index
- * buckets are organized in a binary tree, with all data at the leaf
- * nodes. Bucket numbers are easiest to treat in binary. In the
- * beginning, there is a single bucket with 0-bit number "". When a
- * bucket with number x fills, it splits into buckets 0x and 1x. Since
- * x and 0x are the same number, this means that half the bucket space
- * is assigned to a new bucket, 1x. So "" splits into 0 and 1, 1
- * splits into 01 and 11, and so on. The bucket number determines the
- * placement on disk, and the bucket header includes the number of
- * bits represented by the bucket. To find the right bucket for a
- * given score with top 32-bits x, read bucket "" off disk and check
- * its depth. If it is zero, we're done. If x doesn't match the
- * number of bits in 0's header, we know that the block has split, so
- * we use the last 1 bit of x to load a new block (perhaps the same
- * one) and repeat, using successively more bits of x until we find
- * the block responsible for x. Note that we're using bits from the
- * _right_ not the left. This gives the "split into 0x and 1x" property
- * needed by the tree and is easier than using the reversal of the
- * bits on the left.
- * *
- * At the moment, this second scheme sounds worse than the first --
- * there are log n disk reads to find a block instead of just 1. But
- * we can keep the tree structure in memory, using 1 bit per block to
- * keep track of whether that block has been allocated. Want to know
- * whether block x has been split? Check whether 1x is allocated. 1
- * bit per 8kB gives us an in-use bitmap 1/65536 the size of the index.
- * The index data is 1/100 the size of the arena data, explained above.
- * In this scheme, after the first block split, the index is always
- * at least half full (proof by induction), so it is at most 2x the
- * size of the index data. This gives a bitmap size of 2/6,553,600
- * of the data size. Let's call that one millionth. So each terabyte
- * of storage requires one megabyte of free bitmap. The bitmap is
- * going to be accessed so much that it will be effectively pinned in
- * the cache. So it still only takes one disk read to find the block
- * -- the tree walking can be done by consulting the in-core bitmap
- * describing the tree structure.
- * *
- * Now we have to worry about write ordering, though. What if the
- * bitmap ends up out of sync with the index blocks? When block x
- * splits into 0x and 1x, causing an update to bitmap block b, the
- * dcache flushing code makes sure that the writes happen in this
- * order: first 1x, then 0x, then the bitmap. Writing 1x before 0x
- * makes sure we write the split-off entries to disk before we discard
- * them from 0x. Writing the bitmap after both simplifies the following
- * case analysis.
+ * The index is sized so that a particular bucket is extraordinarily
+ * unlikely to overflow: assuming compressed data blocks are 4kB
+ * on disk, and assuming each block has a 40 byte index entry,
+ * the index data will be 1% of the total data. Since scores are essentially
+ * random, all buckets should be about the same fullness.
+ * A factor of 5 gives us a wide comfort boundary to account for
+ * random variation. So the index disk space should be 5% of the arena disk space.
+ *
+ * Problems with Index Version 1:
*
- * If Venti is interrupted while flushing blocks to disk, the state
- * of the disk upon next startup can be one of the following:
- * *
-
- * (a) none of 0x, 1x, and b are written
- * Looks like nothing happened - use as is.
+ * Because the index size is chosen to handle the worst case load,
+ * the index is very sparse, especially when the Venti server is mostly empty.
+ * This has a few bad properties.
+ *
+ * Loading an index block (which typically requires a random disk seek)
+ * is a very expensive operation, yet it yields only a couple index entries.
+ * We're not making efficient use of the disk arm.
+ *
+ * Writing a block requires first checking to see if the block already
+ * exists on the server, which in turn requires an index lookup. When
+ * writing fresh data, these lookups will fail. The index entry cache
+ * cannot serve these, so they must go to disk, which is expensive.
+ *
+ * Thus both the reading and the writing of blocks are held back by
+ * the expense of accessing the index.
+ *
+ * Index Version 2:
+ *
+ * The index is sized to be exactly 2^M blocks. The index blocks are
+ * logically arranged in a (not exactly balanced) binary tree of depth at
+ * most M. The nodes are named by bit strings describing the path from
+ * the root to the node. The root is . (dot). The left child of the root is .0,
+ * the right child .1. The node you get to by starting at the root and going
+ * left right right left is .0110. At the beginning, there is only the root block.
+ * When a block with name .xxx fills, it splits into .xxx0 and .xxx1.
+ * All the index data is kept in the leaves of the tree.
+ *
+ * Index leaf blocks are laid out on disk by interpreting the bitstring as a
+ * binary fraction and multiplying by 2^M -- .0 is the first block, .1 is
+ * the block halfway into the index, .0110 is at position 6/16, and
+ * .xxx and .xxx0 map to the same block (but only one can be a leaf
+ * node at any given time, so this is okay!). A cheap implementation of
+ * this is to append zeros to the bit string to make it M bits long. That's
+ * the integer index block number.
+ *
+ * To find the leaf block that should hold a score, use the bits of the
+ * score one at a time to walk down the tree to a leaf node. If the tree
+ * has leaf nodes .0, .10, and .11, then score 0110101... ends up in bucket
+ * .0 while score 101110101... ends up in bucket .10. There is no leaf node
+ * .1 because it has split into .10 and .11.
+ *
+ * If we know which disk blocks are in use, we can reconstruct the interior
+ * of the tree: if .xxx1 is in use, then .xxx has been split. We keep an in-use
+ * bitmap of all index disk blocks to aid in reconstructing the interior of the
+ * tree. At one bit per index block, the bitmap is small enough to be kept
+ * in memory even on the largest of Venti servers.
+ *
+ * After the root block splits, the index blocks being used will always be
+ * at least half full (averaged across the entire index). So unlike Version 1,
+ * Index Version 2 is quite dense, alleviating the two problems above.
+ * Index block reads now return many index entries. More importantly,
+ * small servers can keep most of the index in the disk cache, making them
+ * more likely to handle negative lookups without going to disk.
+ *
+ * As the index becomes more full, Index Version 2's performance
+ * degrades gracefully into Index Version 1. V2 is really an optimization
+ * for little servers.
+ *
+ * Write Ordering for Index Version 2:
+ *
+ * Unlike Index Version 1, Version 2 must worry about write ordering
+ * within the index. What happens if the in-use bitmap is out of sync
+ * with the actual leaf nodes? What happens if .xxx splits into .xxx0 and
+ * .xxx1 but only one of the new blocks gets written to disk?
+ *
+ * We arrange that when .xxx splits, the .xxx1 block is written first,
+ * then the .xxx0 block, and finally the in-use bitmap entry for .xxx1.
+ * The split is committed by the writing of .xxx0. This ordering leaves
+ * two possible partial disk writes:
*
- * (b) 1x is written
- * Since 0x hasn't been rewritten and the bitmap doesn't record 1x
- * as being in use, it's like this never happened. See (a).
- * This does mean that the bitmap trumps actual disk contents:
- * no need to zero the index disks anymore.
+ * (a) If .xxx1 is written but .xxx0 and the bitmap are not, then it's as if
+ * the split never happened -- we won't think .xxx1 is in use, and we
+ * won't go looking for it.
*
- * (c) 0x and 1x are written, but not the bitmap
- * Writing 0x commits the change. When we next encounter
- * 0x or 1x on a lookup (we can't get to 1x except through x==0x),
- * the bitmap will direct us to x, we'll load the block and find out
- * that its now 0x, so we update the bitmap.
+ * (b) If .xxx1 and .xxx0 are written but the bitmap is not, then the first
+ * time we try to load .xxx, we'll get .xxx0 instead, realize the bitmap is
+ * out of date, and update the bitmap.
*
- * (d) 0x, 1x, and b are written.
- * Great - just use as is.
+ * Backwards Compatibility
+ *
+ * Because there are large Venti servers in use with Index V1, this code
+ * will read either index version, but when asked to create a new index,
+ * will only create V2.
*/
#include "stdinc.h"
@@ -105,6 +114,7 @@
static int bucklook(u8int *score, int type, u8int *data, int n);
static int writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b);
static int okibucket(IBucket *ib, ISect *is);
+static int initindex1(Index*);
static ISect *initisect1(ISect *is);
//static QLock indexlock; //ZZZ
@@ -118,7 +128,7 @@ initindex(char *name, ISect **sects, int n)
Index *ix;
ISect *is;
u32int last, blocksize, tabsize;
- int i, nbits;
+ int i;
if(n <= 0){
seterr(EOk, "no index sections to initialize index");
@@ -171,21 +181,7 @@ initindex(char *name, ISect **sects, int n)
ix->tabsize = tabsize;
ix->buckets = last;
- /* compute number of buckets used for in-use map */
- nbits = blocksize*8;
- ix->bitbuckets = (ix->buckets+nbits-1)/nbits;
-
- last -= ix->bitbuckets;
- /*
- * compute log of max. power of two not greater than
- * number of remaining buckets.
- */
- for(nbits=0; last>>=1; nbits++)
- ;
- ix->maxdepth = nbits;
-
- if((1UL<<ix->maxdepth) > ix->buckets-ix->bitbuckets){
- seterr(ECorrupt, "inconsistent math for buckets in %s", ix->name);
+ if(initindex1(ix) < 0){
freeindex(ix);
return nil;
}
@@ -195,9 +191,37 @@ initindex(char *name, ISect **sects, int n)
freeindex(ix);
return nil;
}
+
return ix;
}
+static int
+initindex1(Index *ix)
+{
+ u32int buckets;
+
+ switch(ix->version){
+ case IndexVersion1:
+ ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
+ buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
+ if(buckets != ix->buckets){
+ seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
+ return -1;
+ }
+ break;
+
+ case IndexVersion2:
+ buckets = ix->buckets - ix->bitblocks;
+ if(ix->buckets < ix->bitblocks || (buckets&(buckets-1)))
+ seterr(ECorrupt, "bucket count not a power of two in %s", ix->name);
+ ix->maxdepth = u64log2(buckets);
+ ix->bitkeylog = u64log2(ix->blocksize*8);
+ ix->bitkeymask = (1<<ix->bitkeylog)-1;
+ break;
+ }
+ return 0;
+}
+
int
wbindex(Index *ix)
{
@@ -237,7 +261,7 @@ wbindex(Index *ix)
}
/*
- * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' sections arenas
+ * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
* version, blocksize: u32int
* name: max. ANameSize string
* sections, arenas: AMap
@@ -246,6 +270,7 @@ int
outputindex(Fmt *f, Index *ix)
{
if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
+ || (ix->version==IndexVersion2 && fmtprint(f, "%ud\n", ix->bitblocks) < 0)
|| outputamap(f, ix->smap, ix->nsects) < 0
|| outputamap(f, ix->amap, ix->narenas) < 0)
return -1;
@@ -276,7 +301,7 @@ parseindex(IFile *f, Index *ix)
return -1;
}
ix->version = v;
- if(ix->version != IndexVersion){
+ if(ix->version != IndexVersion1 && ix->version != IndexVersion2){
seterr(ECorrupt, "bad version number in %s", f->name);
return -1;
}
@@ -293,11 +318,22 @@ parseindex(IFile *f, Index *ix)
* block size
*/
if(ifileu32int(f, &v) < 0){
- seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
+ seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
return -1;
}
ix->blocksize = v;
+ if(ix->version == IndexVersion2){
+ /*
+ * free bitmap size
+ */
+ if(ifileu32int(f, &v) < 0){
+ seterr(ECorrupt, "syntax error: bad bitmap size in %s", f->name);
+ return -1;
+ }
+ ix->bitblocks = v;
+ }
+
if(parseamap(f, &amn) < 0)
return -1;
ix->nsects = amn.n;
@@ -320,8 +356,10 @@ newindex(char *name, ISect **sects, int n)
Index *ix;
AMap *smap;
u64int nb;
- u32int div, ub, xb, start, stop, blocksize, tabsize;
- int i, j;
+ u32int div, ub, xb, fb, start, stop, blocksize, tabsize;
+ int i, j, version;
+
+ version = IndexVersion2;
if(n < 1){
seterr(EOk, "creating index with no index sections");
@@ -368,16 +406,27 @@ newindex(char *name, ISect **sects, int n)
seterr(EBug, "index too large");
return nil;
}
- div = (((u64int)1 << 32) + nb - 1) / nb;
- ub = (((u64int)1 << 32) - 1) / div + 1;
- if(div < 100){
- seterr(EBug, "index divisor too coarse");
- return nil;
+
+ div = 0;
+ fb = 0;
+ if(version == IndexVersion1){
+ div = (((u64int)1 << 32) + nb - 1) / nb;
+ ub = (((u64int)1 << 32) - 1) / div + 1;
+ if(div < 100){
+ seterr(EBug, "index divisor too coarse");
+ return nil;
+ }
+ }else{
+ fb = (nb+blocksize*8-1)/(blocksize*8);
+ for(ub=1; ub<=((nb-fb)>>1); ub<<=1)
+ ;
+ ub += fb;
}
if(ub > nb){
seterr(EBug, "index initialization math wrong");
return nil;
}
+ xb = nb - ub;
/*
* initialize each of the index sections
@@ -388,7 +437,6 @@ newindex(char *name, ISect **sects, int n)
seterr(EOk, "can't create new index: out of memory");
return nil;
}
- xb = nb - ub;
start = 0;
for(i = 0; i < n; i++){
stop = start + sects[i]->blocks - xb / n;
@@ -413,15 +461,22 @@ newindex(char *name, ISect **sects, int n)
free(smap);
return nil;
}
- ix->version = IndexVersion;
+ ix->version = version;
namecp(ix->name, name);
ix->sects = sects;
ix->smap = smap;
ix->nsects = n;
ix->blocksize = blocksize;
- ix->div = div;
ix->buckets = ub;
ix->tabsize = tabsize;
+ ix->div = div;
+ ix->bitblocks = fb;
+
+ if(initindex1(ix) < 0){
+ free(smap);
+ return nil;
+ }
+
return ix;
}
@@ -489,7 +544,7 @@ newisect(Part *part, char *name, u32int blocksize, u32int tabsize)
}
/*
- * initialize the computed paramaters for an index
+ * initialize the computed parameters for an index
*/
static ISect*
initisect1(ISect *is)
@@ -606,7 +661,7 @@ writeiclump(Index *ix, Clump *c, u8int *clbuf)
}
/*
- * convert an arena index to an relative address address
+ * convert an arena index to an relative arena address
*/
Arena*
amapitoa(Index *ix, u64int a, u64int *aa)
@@ -665,20 +720,15 @@ loadientry(Index *ix, u8int *score, int type, IEntry *ie)
u32int buck;
int h, ok;
- buck = hashbits(score, 32) / ix->div;
ok = -1;
qlock(&stats.lock);
stats.indexreads++;
qunlock(&stats.lock);
- is = findibucket(ix, buck, &buck);
- if(is == nil)
- return -1;
- b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
+ b = loadibucket(ix, score, &is, &buck, &ib);
if(b == nil)
return -1;
- unpackibucket(&ib, b->data);
if(okibucket(&ib, is) < 0)
goto out;
@@ -707,19 +757,16 @@ storeientry(Index *ix, IEntry *ie)
u32int buck;
int h, ok;
- buck = hashbits(ie->score, 32) / ix->div;
ok = 0;
qlock(&stats.lock);
stats.indexwreads++;
qunlock(&stats.lock);
- is = findibucket(ix, buck, &buck);
- b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1);
+ b = loadibucket(ix, ie->score, &is, &buck, &ib);
if(b == nil)
return -1;
- unpackibucket(&ib, b->data);
if(okibucket(&ib, is) < 0)
goto out;
@@ -765,177 +812,14 @@ writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b)
return 0;
}
-/*
- * find the number of the index section holding score
- */
-int
-indexsect(Index *ix, u8int *score)
-{
- u32int buck;
- int r, l, m;
-
- buck = hashbits(score, 32) / ix->div;
- l = 1;
- r = ix->nsects - 1;
- while(l <= r){
- m = (r + l) >> 1;
- if(ix->sects[m]->start <= buck)
- l = m + 1;
- else
- r = m - 1;
- }
- return l - 1;
-}
-
-/*
- * find the index section which holds bucket #buck.
- */
-static ISect*
-findisect(Index *ix, u32int buck, u32int *ibuck)
-{
- ISect *is;
- int r, l, m;
-
- l = 1;
- r = ix->nsects - 1;
- while(l <= r){
- m = (r + l) >> 1;
- if(ix->sects[m]->start <= buck)
- l = m + 1;
- else
- r = m - 1;
- }
- is = ix->sects[l - 1];
- if(is->start <= buck && is->stop > buck){
- *ibuck = buck - is->start;
- return is;
- }
- seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
- return nil;
-}
-
-static DBlock*
-loadisectblock(Index *ix, u32int buck, int read)
-{
- ISect *is;
-
- if((is = findisect(ix, buck, &buck)) == nil)
- return nil;
- return getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), read);
-}
-
-/*
- * find the index section which holds the logical bucket #buck
- */
-static DBlock*
-loadibucket(Index *ix, u32int buck, IBucket *ib)
-{
- int d, i, times;
- u32int ino;
- DBlock *b;
- u32int bbuck;
- IBucket eib;
-
- times = 0;
-
-top:
- if(times++ > 2*ix->maxdepth){
- seterr(EAdmin, "bucket bitmap tree never converges with buckets");
- return nil;
- }
-
- bbuck = -1;
- b = nil;
-
- /*
- * consider the bits of buck, one at a time, to make the bucket number.
- */
-
- /*
- * walk down the bucket tree using the bitmap, which is used so
- * often it's almost certain to be in cache.
- */
- ino = 0;
- for(d=0; d<ix->maxdepth; d++){
- /* fetch the bitmap that says whether ino has been split */
- if(bbuck != (ino>>ix->bitlog)){
- if(b)
- putdblock(b);
- bbuck = (ino>>ix->bitlog);
- if((b = loadisectblock(ix, bbuck, 1)) == nil)
- return nil;
- }
- /* has it been split yet? */
- if((((u32int*)b->data)[(ino&(ix->bitmask))>>5] & (1<<(ino&31))) == 0){
- /* no. we're done */
- break;
- }
- }
- putdblock(b);
-
- /*
- * continue walking down (or up!) the bucket tree, which may not
- * be completely in sync with the bitmap. we could continue the loop
- * here, but it's easiest just to start over once we correct the bitmap.
- * corrections should only happen when things get out of sync because
- * a crash keeps some updates from making it to disk, so it's not too
- * frequent. we should converge after 2x the max depth, at the very worst
- * (up and back down the tree).
- */
- if((b = loadisectblock(ix, ix->bitbuckets+bucketno(buck, d), 1)) == nil)
- return nil;
- unpackibucket(&eib, b->data);
- if(eib.depth > d){
- /* the bitmap thought this block hadn't split */
- putdblock(b);
- if(markblocksplit(buck, d) < 0)
- return nil;
- goto top;
- }
- if(eib.depth < d){
- /* the bitmap thought this block had split */
- putdblock(b);
- if(markblockunsplit(ix, buck, d) < 0)
- return nil;
- goto top;
- }
- *ib = eib;
- return b;
-}
-
-static int
-markblocksplit(Index *ix, u32int buck, int d)
-{
- u32int ino;
-
- ino = bucketno(buck, d);
- if((b = loadisectblock(ix, ino>>ix->bitlog, 1)) == nil)
- return -1;
- dirtydblock(b, DirtyIndex);
- (((u32int*)b->data)[(ino&(ix->bitmask))>>5] |= (1<<(ino&31));
- putdblock(b);
- return 0;
-}
-
-static int
-markblockunsplit(Index *ix, u32int buck, int d)
-{
- /*
- * Let's
- u32int ino;
-
- ino = bucketno(buck, d);
-
-}
-
static int
okibucket(IBucket *ib, ISect *is)
{
- if(ib->n <= is->buckmax && (ib->next == 0 || ib->next >= is->start && ib->next < is->stop))
+ if(ib->n <= is->buckmax)
return 0;
- seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, next=%lud range=[%lud,%lud)",
- ib->n, is->buckmax, ib->next, is->start, is->stop);
+ seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, depth=%lud range=[%lud,%lud)",
+ ib->n, is->buckmax, ib->depth, is->start, is->stop);
return -1;
}
@@ -1010,3 +894,223 @@ ientrycmp(const void *vie1, const void *vie2)
}
return -1;
}
+
+/*
+ * find the number of the index section holding bucket #buck
+ */
+static int
+indexsect0(Index *ix, u32int buck)
+{
+ int r, l, m;
+
+ l = 1;
+ r = ix->nsects - 1;
+ while(l <= r){
+ m = (r + l) >> 1;
+ if(ix->sects[m]->start <= buck)
+ l = m + 1;
+ else
+ r = m - 1;
+ }
+ return l - 1;
+}
+
+/*
+ * load the index block at bucket #buck
+ */
+static DBlock*
+loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+ ISect *is;
+ DBlock *b;
+
+ is = ix->sects[indexsect0(ix, buck)];
+ if(buck < is->start || is->stop <= buck){
+ seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
+ return nil;
+ }
+
+ buck -= is->start;
+ if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), 1)) == nil)
+ return nil;
+
+ *pis = is;
+ *pbuck = buck;
+ unpackibucket(ib, b->data);
+ return b;
+}
+
+/*
+ * find the number of the index section holding score
+ */
+static int
+indexsect1(Index *ix, u8int *score)
+{
+ return indexsect0(ix, hashbits(score, 32) / ix->div);
+}
+
+/*
+ * load the index block responsible for score.
+ */
+static DBlock*
+loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+ return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib);
+}
+
+static u32int
+keytobuck(Index *ix, u32int key, int d)
+{
+ /* clear all but top d bits */
+ if(d != 32)
+ key &= ~((1<<(32-d))-1);
+
+ /* truncate to maxdepth bits */
+ if(ix->maxdepth != 32)
+ key >>= 32 - ix->maxdepth;
+
+ return ix->bitblocks + key;
+}
+
+/*
+ * to check whether .xxx has split, check whether .xxx1 is in use.
+ * it might be worth caching the block for future lookups, but for now
+ * let's assume the dcache is good enough.
+ */
+static int
+bitmapop(Index *ix, u32int key, int d, int set)
+{
+ DBlock *b;
+ ISect *is;
+ IBucket ib;
+ u32int buck;
+ int inuse;
+
+ if(d >= ix->maxdepth)
+ return 0;
+
+ /* construct .xxx1 in bucket number format */
+ key = keytobuck(ix, key, d) | (1<<(ix->maxdepth-d-1));
+
+ /* check whether key (now the bucket number for .xxx1) is in use */
+
+ if((b = loadibucket0(ix, key >> ix->bitkeylog, &is, &buck, &ib)) == nil){
+ seterr(ECorrupt, "cannot load in-use bitmap block");
+ return -1;
+ }
+ inuse = ((u32int*)b->data)[(key & ix->bitkeymask)>>5] & (1<<(key&31));
+ if(set && !inuse){
+ dirtydblock(b, DirtyIndexBitmap);
+ ((u32int*)b->data)[(key & ix->bitkeymask)>>5] |= (1<<(key&31));
+ }
+ putdblock(b);
+ return inuse;
+}
+
+static int
+issplit(Index *ix, u32int key, int d)
+{
+ return bitmapop(ix, key, d, 0);
+}
+
+static int
+marksplit(Index *ix, u32int key, int d)
+{
+ return bitmapop(ix, key, d, 1);
+}
+
+/*
+ * find the number of the index section holding score.
+ * it's not terrible to be wrong once in a while, so we just
+ * do what the bitmap tells us and don't worry about the
+ * bitmap being out of date.
+ */
+static int
+indexsect2(Index *ix, u8int *score)
+{
+ u32int key;
+ int d, is;
+
+ key = hashbits(score, 32);
+ for(d=0; d<=ix->maxdepth; d++){
+ is = issplit(ix, key, d);
+ if(is == -1)
+ return 0; /* must return something valid! */
+ if(!is)
+ break;
+ }
+
+ if(d > ix->maxdepth){
+ seterr(EBug, "index bitmap inconsistent with maxdepth");
+ return 0; /* must return something valid! */
+ }
+
+ return indexsect0(ix, keytobuck(ix, key, d));
+}
+
+/*
+ * load the index block responsible for score.
+ * (unlike indexsect2, must be correct always.)
+ */
+static DBlock*
+loadibucket2(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+ u32int key;
+ int d, try, is;
+ DBlock *b;
+
+ for(try=0; try<32; try++){
+ key = hashbits(score, 32);
+ for(d=0; d<=ix->maxdepth; d++){
+ is = issplit(ix, key, d);
+ if(is == -1)
+ return nil;
+ if(!is)
+ break;
+ }
+ if(d > ix->maxdepth){
+ seterr(EBug, "index bitmap inconsistent with maxdepth");
+ return nil;
+ }
+
+ if((b = loadibucket0(ix, keytobuck(ix, key, d), pis, pbuck, ib)) == nil)
+ return nil;
+
+ if(ib->depth == d)
+ return b;
+
+ if(ib->depth < d){
+ seterr(EBug, "index block has smaller depth than expected -- cannot happen");
+ putdblock(b);
+ return nil;
+ }
+
+ /*
+ * ib->depth > d, meaning the bitmap was out of date.
+ * fix the bitmap and try again.
+ */
+ putdblock(b);
+ if(marksplit(ix, key, d) < 0)
+ return nil;
+ }
+ seterr(EBug, "loadibucket2 failed to sync bitmap with disk!");
+ return nil;
+}
+
+int
+indexsect(Index *ix, u8int *score)
+{
+ if(ix->version == IndexVersion1)
+ return indexsect1(ix, score);
+ return indexsect2(ix, score);
+}
+
+DBlock*
+loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+ if(ix->version == IndexVersion1)
+ return loadibucket1(ix, score, pis, pbuck, ib);
+ return loadibucket2(ix, score, pis, pbuck, ib);
+}
+
+
diff --git a/src/cmd/venti/lump.c b/src/cmd/venti/lump.c
index c449f022..746a89ca 100644
--- a/src/cmd/venti/lump.c
+++ b/src/cmd/venti/lump.c
@@ -81,7 +81,6 @@ writelump(Packet *p, u8int *score, int type, u32int creator)
return ok;
}
-print("writelump %08x\n", mainindex->arenas[0]);
if(queuewrites)
return queuewrite(u, p, creator);
diff --git a/src/cmd/venti/mkfile b/src/cmd/venti/mkfile
index 2ef1ed21..99d8c98f 100644
--- a/src/cmd/venti/mkfile
+++ b/src/cmd/venti/mkfile
@@ -44,9 +44,9 @@ TARG=\
fmtarenas\
fmtisect\
fmtindex\
- buildindex\
+# buildindex\
checkarenas\
- checkindex\
+# checkindex\
clumpstats\
findscore\
rdarena\
diff --git a/src/cmd/venti/part.c b/src/cmd/venti/part.c
index 55cac6bb..8a162958 100644
--- a/src/cmd/venti/part.c
+++ b/src/cmd/venti/part.c
@@ -2,6 +2,8 @@
#include "dat.h"
#include "fns.h"
+#define trace 0
+
u32int maxblocksize;
int readonly;
@@ -75,7 +77,7 @@ writepart(Part *part, u64int addr, u8int *buf, u32int n)
seterr(ECorrupt, "out of bounds write to partition='%s'", part->name);
return -1;
}
- print("write %s %lud at %llud\n", part->name, n, addr);
+if(trace) print("write %s %lud at %llud\n", part->name, n, addr);
for(nn = 0; nn < n; nn += m){
mm = n - nn;
if(mm > MaxIo)
@@ -107,7 +109,7 @@ readpart(Part *part, u64int addr, u8int *buf, u32int n)
seterr(ECorrupt, "out of bounds read from partition='%s': addr=%lld n=%d size=%lld", part->name, addr, n, part->size);
return -1;
}
- print("read %s %lud at %llud\n", part->name, n, addr);
+if(trace) print("read %s %lud at %llud\n", part->name, n, addr);
for(nn = 0; nn < n; nn += m){
mm = n - nn;
if(mm > MaxIo)
diff --git a/src/cmd/venti/venti.c b/src/cmd/venti/venti.c
index fc0a75b9..4c533cc6 100644
--- a/src/cmd/venti/venti.c
+++ b/src/cmd/venti/venti.c
@@ -147,8 +147,8 @@ ventiserver(char *addr)
while((r = vtgetreq(s)) != nil){
r->rx.type = r->tx.type+1;
- print("req (arenas[0]=%p sects[0]=%p) %F\n",
- mainindex->arenas[0], mainindex->sects[0], &r->tx);
+ // print("req (arenas[0]=%p sects[0]=%p) %F\n",
+ // mainindex->arenas[0], mainindex->sects[0], &r->tx);
switch(r->tx.type){
default:
vtrerror(r, "unknown request");