diff options
author | rsc <devnull@localhost> | 2005-07-12 15:23:36 +0000 |
---|---|---|
committer | rsc <devnull@localhost> | 2005-07-12 15:23:36 +0000 |
commit | a0d146edd7a7de6236a0d60baafeeb59f8452aae (patch) | |
tree | b55baa526d9f5adfc73246e6ee2fadf455e0b7a2 /src/cmd/venti/srv/index.c | |
parent | 88bb285e3d87ec2508840af33f7e0af53ec3c13c (diff) | |
download | plan9port-a0d146edd7a7de6236a0d60baafeeb59f8452aae.tar.gz plan9port-a0d146edd7a7de6236a0d60baafeeb59f8452aae.tar.bz2 plan9port-a0d146edd7a7de6236a0d60baafeeb59f8452aae.zip |
return of venti
Diffstat (limited to 'src/cmd/venti/srv/index.c')
-rw-r--r-- | src/cmd/venti/srv/index.c | 819 |
1 files changed, 819 insertions, 0 deletions
diff --git a/src/cmd/venti/srv/index.c b/src/cmd/venti/srv/index.c new file mode 100644 index 00000000..46bf91e2 --- /dev/null +++ b/src/cmd/venti/srv/index.c @@ -0,0 +1,819 @@ +/* + * Index, mapping scores to log positions. + * + * The index is made up of some number of index sections, each of + * which is typically stored on a different disk. The blocks in all the + * index sections are logically numbered, with each index section + * responsible for a range of blocks. Blocks are typically 8kB. + * + * The N index blocks are treated as a giant hash table. The top 32 bits + * of score are used as the key for a lookup. Each index block holds + * one hash bucket, which is responsible for ceil(2^32 / N) of the key space. + * + * The index is sized so that a particular bucket is extraordinarily + * unlikely to overflow: assuming compressed data blocks are 4kB + * on disk, and assuming each block has a 40 byte index entry, + * the index data will be 1% of the total data. Since scores are essentially + * random, all buckets should be about the same fullness. + * A factor of 5 gives us a wide comfort boundary to account for + * random variation. So the index disk space should be 5% of the arena disk space. + */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +//static int bucklook(u8int *score, int type, u8int *data, int n); +//static int writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b); +//static int okibucket(IBucket *ib, ISect *is); +static int initindex1(Index*); +static ISect *initisect1(ISect *is); +//static int splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib); + +#define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0) + +//static QLock indexlock; //ZZZ + +static char IndexMagic[] = "venti index configuration"; + +Index* +initindex(char *name, ISect **sects, int n) +{ + IFile f; + Index *ix; + ISect *is; + u32int last, blocksize, tabsize; + int i; + + if(n <= 0){ +fprint(2, "bad n\n"); + seterr(EOk, "no index sections to initialize index"); + return nil; + } + ix = MKZ(Index); + if(ix == nil){ +fprint(2, "no mem\n"); + seterr(EOk, "can't initialize index: out of memory"); + freeindex(ix); + return nil; + } + + tabsize = sects[0]->tabsize; + if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0) + return nil; + if(parseindex(&f, ix) < 0){ + freeifile(&f); + freeindex(ix); + return nil; + } + freeifile(&f); + if(namecmp(ix->name, name) != 0){ + seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name); + return nil; + } + if(ix->nsects != n){ + seterr(ECorrupt, "mismatched number index sections: found %d expected %d", n, ix->nsects); + freeindex(ix); + return nil; + } + ix->sects = sects; + last = 0; + blocksize = ix->blocksize; + for(i = 0; i < ix->nsects; i++){ + is = sects[i]; + if(namecmp(ix->name, is->index) != 0 + || is->blocksize != blocksize + || is->tabsize != tabsize + || namecmp(is->name, ix->smap[i].name) != 0 + || is->start != ix->smap[i].start + || is->stop != ix->smap[i].stop + || last != is->start + || is->start > is->stop){ + seterr(ECorrupt, "inconsistent index sections in %s", ix->name); + freeindex(ix); + return nil; + } + last = is->stop; + } + ix->tabsize = tabsize; + ix->buckets = last; + + if(initindex1(ix) < 0){ + freeindex(ix); + return nil; + } + + ix->arenas = MKNZ(Arena*, ix->narenas); + if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){ + freeindex(ix); + return nil; + } + + return ix; +} + +static int +initindex1(Index *ix) +{ + u32int buckets; + + ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets; + buckets = (((u64int)1 << 32) - 1) / ix->div + 1; + if(buckets != ix->buckets){ + seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name); + return -1; + } + + return 0; +} + +int +wbindex(Index *ix) +{ + Fmt f; + ZBlock *b; + int i; + + if(ix->nsects == 0){ + seterr(EOk, "no sections in index %s", ix->name); + return -1; + } + b = alloczblock(ix->tabsize, 1, ix->blocksize); + if(b == nil){ + seterr(EOk, "can't write index configuration: out of memory"); + return -1; + } + fmtzbinit(&f, b); + if(outputindex(&f, ix) < 0){ + seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize); + freezblock(b); + return -1; + } + for(i = 0; i < ix->nsects; i++){ + if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0){ + seterr(EOk, "can't write index: %r"); + freezblock(b); + return -1; + } + } + freezblock(b); + + for(i = 0; i < ix->nsects; i++) + if(wbisect(ix->sects[i]) < 0) + return -1; + + return 0; +} + +/* + * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas + * version, blocksize: u32int + * name: max. ANameSize string + * sections, arenas: AMap + */ +int +outputindex(Fmt *f, Index *ix) +{ + if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0 + || outputamap(f, ix->smap, ix->nsects) < 0 + || outputamap(f, ix->amap, ix->narenas) < 0) + return -1; + return 0; +} + +int +parseindex(IFile *f, Index *ix) +{ + AMapN amn; + u32int v; + char *s; + + /* + * magic + */ + s = ifileline(f); + if(s == nil || strcmp(s, IndexMagic) != 0){ + seterr(ECorrupt, "bad index magic for %s", f->name); + return -1; + } + + /* + * version + */ + if(ifileu32int(f, &v) < 0){ + seterr(ECorrupt, "syntax error: bad version number in %s", f->name); + return -1; + } + ix->version = v; + if(ix->version != IndexVersion){ + seterr(ECorrupt, "bad version number in %s", f->name); + return -1; + } + + /* + * name + */ + if(ifilename(f, ix->name) < 0){ + seterr(ECorrupt, "syntax error: bad index name in %s", f->name); + return -1; + } + + /* + * block size + */ + if(ifileu32int(f, &v) < 0){ + seterr(ECorrupt, "syntax error: bad block size number in %s", f->name); + return -1; + } + ix->blocksize = v; + + if(parseamap(f, &amn) < 0) + return -1; + ix->nsects = amn.n; + ix->smap = amn.map; + + if(parseamap(f, &amn) < 0) + return -1; + ix->narenas = amn.n; + ix->amap = amn.map; + + return 0; +} + +/* + * initialize an entirely new index + */ +Index * +newindex(char *name, ISect **sects, int n) +{ + Index *ix; + AMap *smap; + u64int nb; + u32int div, ub, xb, fb, start, stop, blocksize, tabsize; + int i, j; + + if(n < 1){ + seterr(EOk, "creating index with no index sections"); + return nil; + } + + /* + * compute the total buckets available in the index, + * and the total buckets which are used. + */ + nb = 0; + blocksize = sects[0]->blocksize; + tabsize = sects[0]->tabsize; + for(i = 0; i < n; i++){ + if(sects[i]->start != 0 || sects[i]->stop != 0 + || sects[i]->index[0] != '\0'){ + seterr(EOk, "creating new index using non-empty section %s", sects[i]->name); + return nil; + } + if(blocksize != sects[i]->blocksize){ + seterr(EOk, "mismatched block sizes in index sections"); + return nil; + } + if(tabsize != sects[i]->tabsize){ + seterr(EOk, "mismatched config table sizes in index sections"); + return nil; + } + nb += sects[i]->blocks; + } + + /* + * check for duplicate names + */ + for(i = 0; i < n; i++){ + for(j = i + 1; j < n; j++){ + if(namecmp(sects[i]->name, sects[j]->name) == 0){ + seterr(EOk, "duplicate section name %s for index %s", sects[i]->name, name); + return nil; + } + } + } + + if(nb >= ((u64int)1 << 32)){ + seterr(EBug, "index too large"); + return nil; + } + + fb = 0; + div = (((u64int)1 << 32) + nb - 1) / nb; + ub = (((u64int)1 << 32) - 1) / div + 1; + if(div < 100){ + seterr(EBug, "index divisor too coarse"); + return nil; + } + if(ub > nb){ + seterr(EBug, "index initialization math wrong"); + return nil; + } + xb = nb - ub; + + /* + * initialize each of the index sections + * and the section map table + */ + smap = MKNZ(AMap, n); + if(smap == nil){ + seterr(EOk, "can't create new index: out of memory"); + return nil; + } + start = 0; + for(i = 0; i < n; i++){ + stop = start + sects[i]->blocks - xb / n; + if(i == n - 1) + stop = ub; + sects[i]->start = start; + sects[i]->stop = stop; + namecp(sects[i]->index, name); + + smap[i].start = start; + smap[i].stop = stop; + namecp(smap[i].name, sects[i]->name); + start = stop; + } + + /* + * initialize the index itself + */ + ix = MKZ(Index); + if(ix == nil){ + seterr(EOk, "can't create new index: out of memory"); + free(smap); + return nil; + } + ix->version = IndexVersion; + namecp(ix->name, name); + ix->sects = sects; + ix->smap = smap; + ix->nsects = n; + ix->blocksize = blocksize; + ix->buckets = ub; + ix->tabsize = tabsize; + ix->div = div; + ix->bitblocks = fb; + + if(initindex1(ix) < 0){ + free(smap); + return nil; + } + + return ix; +} + +ISect* +initisect(Part *part) +{ + ISect *is; + ZBlock *b; + int ok; + + b = alloczblock(HeadSize, 0, 0); + if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){ + seterr(EAdmin, "can't read index section header: %r"); + return nil; + } + + is = MKZ(ISect); + if(is == nil){ + freezblock(b); + return nil; + } + is->part = part; + ok = unpackisect(is, b->data); + freezblock(b); + if(ok < 0){ + seterr(ECorrupt, "corrupted index section header: %r"); + freeisect(is); + return nil; + } + + if(is->version != ISectVersion1 && is->version != ISectVersion2){ + seterr(EAdmin, "unknown index section version %d", is->version); + freeisect(is); + return nil; + } + + return initisect1(is); +} + +ISect* +newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize) +{ + ISect *is; + u32int tabbase; + + is = MKZ(ISect); + if(is == nil) + return nil; + + namecp(is->name, name); + is->version = vers; + is->part = part; + is->blocksize = blocksize; + is->start = 0; + is->stop = 0; + tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1); + is->blockbase = (tabbase + tabsize + blocksize - 1) & ~(blocksize - 1); + is->blocks = is->part->size / blocksize - is->blockbase / blocksize; + is->bucketmagic = 0; + if(is->version == ISectVersion2){ + do{ + is->bucketmagic = fastrand(); + }while(is->bucketmagic==0); + } + is = initisect1(is); + if(is == nil) + return nil; + + return is; +} + +/* + * initialize the computed parameters for an index + */ +static ISect* +initisect1(ISect *is) +{ + u64int v; + + is->buckmax = (is->blocksize - IBucketSize) / IEntrySize; + is->blocklog = u64log2(is->blocksize); + if(is->blocksize != (1 << is->blocklog)){ + seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize); + freeisect(is); + return nil; + } + partblocksize(is->part, is->blocksize); + is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1); + if(is->tabbase >= is->blockbase){ + seterr(ECorrupt, "index section config table overlaps bucket storage"); + freeisect(is); + return nil; + } + is->tabsize = is->blockbase - is->tabbase; + v = is->part->size & ~(u64int)(is->blocksize - 1); + if(is->blockbase + (u64int)is->blocks * is->blocksize != v){ + seterr(ECorrupt, "invalid blocks in index section %s", is->name); +//ZZZZZZZZZ +// freeisect(is); +// return nil; + } + + if(is->stop - is->start > is->blocks){ + seterr(ECorrupt, "index section overflows available space"); + freeisect(is); + return nil; + } + if(is->start > is->stop){ + seterr(ECorrupt, "invalid index section range"); + freeisect(is); + return nil; + } + + return is; +} + +int +wbisect(ISect *is) +{ + ZBlock *b; + + b = alloczblock(HeadSize, 1, 0); + if(b == nil) +//ZZZ set error? + return -1; + + if(packisect(is, b->data) < 0){ + seterr(ECorrupt, "can't make index section header: %r"); + freezblock(b); + return -1; + } + if(writepart(is->part, PartBlank, b->data, HeadSize) < 0){ + seterr(EAdmin, "can't write index section header: %r"); + freezblock(b); + return -1; + } + freezblock(b); + + return 0; +} + +void +freeisect(ISect *is) +{ + if(is == nil) + return; + free(is); +} + +void +freeindex(Index *ix) +{ + int i; + + if(ix == nil) + return; + free(ix->amap); + free(ix->arenas); + if(ix->sects) + for(i = 0; i < ix->nsects; i++) + freeisect(ix->sects[i]); + free(ix->sects); + free(ix->smap); + free(ix); +} + +/* + * write a clump to an available arena in the index + * and return the address of the clump within the index. +ZZZ question: should this distinguish between an arena +filling up and real errors writing the clump? + */ +u64int +writeiclump(Index *ix, Clump *c, u8int *clbuf, u64int *pa) +{ + u64int a; + int i; + + trace(TraceLump, "writeiclump enter"); + for(i = ix->mapalloc; i < ix->narenas; i++){ + a = writeaclump(ix->arenas[i], c, clbuf, ix->amap[i].start, pa); + if(a != TWID64){ + ix->mapalloc = i; /* assuming write is atomic, race is okay */ + trace(TraceLump, "writeiclump exit"); + return a; + } + } + + seterr(EAdmin, "no space left in arenas"); + trace(TraceLump, "writeiclump failed"); + return TWID64; +} + +/* + * convert an arena index to an relative arena address + */ +Arena* +amapitoa(Index *ix, u64int a, u64int *aa) +{ + int i, r, l, m; + + l = 1; + r = ix->narenas - 1; + while(l <= r){ + m = (r + l) / 2; + if(ix->amap[m].start <= a) + l = m + 1; + else + r = m - 1; + } + l--; + + if(a > ix->amap[l].stop){ +for(i=0; i<ix->narenas; i++) + print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop); +print("want arena %d for %llux\n", l, a); + seterr(ECrash, "unmapped address passed to amapitoa"); + return nil; + } + + if(ix->arenas[l] == nil){ + seterr(ECrash, "unmapped arena selected in amapitoa"); + return nil; + } + *aa = a - ix->amap[l].start; + return ix->arenas[l]; +} + +int +iaddrcmp(IAddr *ia1, IAddr *ia2) +{ + return ia1->type != ia2->type + || ia1->size != ia2->size + || ia1->blocks != ia2->blocks + || ia1->addr != ia2->addr; +} + +/* + * lookup the score in the partition + * + * nothing needs to be explicitly locked: + * only static parts of ix are used, and + * the bucket is locked by the DBlock lock. + */ +int +loadientry(Index *ix, u8int *score, int type, IEntry *ie) +{ + ISect *is; + DBlock *b; + IBucket ib; + u32int buck; + int h, ok; + + ok = -1; + + trace(TraceLump, "loadientry enter"); + + /* + qlock(&stats.lock); + stats.indexreads++; + qunlock(&stats.lock); + */ + + if(!inbloomfilter(mainindex->bloom, score)){ + trace(TraceLump, "loadientry bloomhit"); + return -1; + } + + trace(TraceLump, "loadientry loadibucket"); + b = loadibucket(ix, score, &is, &buck, &ib); + trace(TraceLump, "loadientry loadedibucket"); + if(b == nil) + return -1; + + if(okibucket(&ib, is) < 0){ + trace(TraceLump, "loadientry badbucket"); + goto out; + } + + h = bucklook(score, type, ib.data, ib.n); + if(h & 1){ + h ^= 1; + trace(TraceLump, "loadientry found"); + unpackientry(ie, &ib.data[h]); + ok = 0; + goto out; + } + trace(TraceLump, "loadientry notfound"); + addstat(StatBloomFalseMiss, 1); +out: + putdblock(b); + trace(TraceLump, "loadientry exit"); + return ok; +} + +int +okibucket(IBucket *ib, ISect *is) +{ + if(ib->n <= is->buckmax) + return 0; + + seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, range=[%lud,%lud)", + ib->n, is->buckmax, is->start, is->stop); + return -1; +} + +/* + * look for score within data; + * return 1 | byte index of matching index, + * or 0 | index of least element > score + */ +int +bucklook(u8int *score, int otype, u8int *data, int n) +{ + int i, r, l, m, h, c, cc, type; + + type = vttodisktype(otype); + l = 0; + r = n - 1; + while(l <= r){ + m = (r + l) >> 1; + h = m * IEntrySize; + for(i = 0; i < VtScoreSize; i++){ + c = score[i]; + cc = data[h + i]; + if(c != cc){ + if(c > cc) + l = m + 1; + else + r = m - 1; + goto cont; + } + } + cc = data[h + IEntryTypeOff]; + if(type != cc){ + if(type > cc) + l = m + 1; + else + r = m - 1; + goto cont; + } + return h | 1; + cont:; + } + + return l * IEntrySize; +} + +/* + * compare two IEntries; consistent with bucklook + */ +int +ientrycmp(const void *vie1, const void *vie2) +{ + u8int *ie1, *ie2; + int i, v1, v2; + + ie1 = (u8int*)vie1; + ie2 = (u8int*)vie2; + for(i = 0; i < VtScoreSize; i++){ + v1 = ie1[i]; + v2 = ie2[i]; + if(v1 != v2){ + if(v1 < v2) + return -1; + return 1; + } + } + v1 = ie1[IEntryTypeOff]; + v2 = ie2[IEntryTypeOff]; + if(v1 != v2){ + if(v1 < v2) + return -1; + return 1; + } + return 0; +} + +/* + * find the number of the index section holding bucket #buck + */ +int +indexsect0(Index *ix, u32int buck) +{ + int r, l, m; + + l = 1; + r = ix->nsects - 1; + while(l <= r){ + m = (r + l) >> 1; + if(ix->sects[m]->start <= buck) + l = m + 1; + else + r = m - 1; + } + return l - 1; +} + +/* + * load the index block at bucket #buck + */ +static DBlock* +loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int mode) +{ + ISect *is; + DBlock *b; + + is = ix->sects[indexsect0(ix, buck)]; + if(buck < is->start || is->stop <= buck){ + seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck); + return nil; + } + + buck -= is->start; + if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), mode)) == nil) + return nil; + + if(pis) + *pis = is; + if(pbuck) + *pbuck = buck; + if(ib) + unpackibucket(ib, b->data, is->bucketmagic); + return b; +} + +/* + * find the number of the index section holding score + */ +static int +indexsect1(Index *ix, u8int *score) +{ + return indexsect0(ix, hashbits(score, 32) / ix->div); +} + +/* + * load the index block responsible for score. + */ +static DBlock* +loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib) +{ + return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, OREAD); +} + +int +indexsect(Index *ix, u8int *score) +{ + return indexsect1(ix, score); +} + +DBlock* +loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib) +{ + return loadibucket1(ix, score, pis, pbuck, ib); +} + + |