Diffstat (limited to 'src/cmd/fossil/cache.c')
-rw-r--r--   src/cmd/fossil/cache.c   2114
1 file changed, 2114 insertions, 0 deletions
diff --git a/src/cmd/fossil/cache.c b/src/cmd/fossil/cache.c
new file mode 100644
index 00000000..e812f6b6
--- /dev/null
+++ b/src/cmd/fossil/cache.c
@@ -0,0 +1,2114 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "error.h"
+
+#include "9.h" /* for cacheFlush */
+
+typedef struct FreeList FreeList;
+typedef struct BAddr BAddr;
+
+enum {
+ BadHeap = ~0,
+};
+
+/*
+ * Data is stored in the memory cache in c->size blocks, zero-extended
+ * to fill them out. When a block is written to Venti, it is zero
+ * truncated. The walker also checks that the block fits within psize
+ * or dsize, as the case may be.
+ */
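+
+/*
+ * For illustration: cacheGlobal() below pads a block fetched from Venti
+ * out to c->size with vtZeroExtend(); the write path (elsewhere in fossil)
+ * truncates the trailing zeros again, so what is stored on Venti, and
+ * hence its score, does not depend on the in-memory block size.
+ */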
+
+struct Cache
+{
+ VtLock *lk;
+ int ref;
+ int mode;
+
+ Disk *disk;
+ int size; /* block size */
+ int ndmap; /* size of per-block dirty pointer map used in blockWrite */
+ VtSession *z;
+ u32int now; /* ticks for usage timestamps */
+ Block **heads; /* hash table for finding address */
+ int nheap; /* number of available victims */
+ Block **heap; /* heap for locating victims */
+ long nblocks; /* number of blocks allocated */
+ Block *blocks; /* array of block descriptors */
+ u8int *mem; /* memory for all block data & blists */
+
+ BList *blfree;
+ VtRendez *blrend;
+
+ int ndirty; /* number of dirty blocks in the cache */
+ int maxdirty; /* max number of dirty blocks */
+ u32int vers;
+
+ long hashSize;
+
+ FreeList *fl;
+
+ VtRendez *die; /* daemon threads should die when != nil */
+
+ VtRendez *flush;
+ VtRendez *flushwait;
+ VtRendez *heapwait;
+ BAddr *baddr;
+ int bw, br, be;
+ int nflush;
+
+ Periodic *sync;
+
+ /* unlink daemon */
+ BList *uhead;
+ BList *utail;
+ VtRendez *unlink;
+
+ /* block counts */
+ int nused;
+ int ndisk;
+};
+
+struct BList {
+ int part;
+ u32int addr;
+ uchar type;
+ u32int tag;
+ u32int epoch;
+ u32int vers;
+
+ int recurse; /* for block unlink */
+
+ /* for roll back */
+ int index; /* -1 indicates not valid */
+ union {
+ uchar score[VtScoreSize];
+ uchar entry[VtEntrySize];
+ } old;
+ BList *next;
+};
+
+struct BAddr {
+ int part;
+ u32int addr;
+ u32int vers;
+};
+
+struct FreeList {
+ VtLock *lk;
+ u32int last; /* last block allocated */
+ u32int end; /* end of data partition */
+ u32int nused; /* number of used blocks */
+ u32int epochLow; /* low epoch when last updated nused */
+};
+
+static FreeList *flAlloc(u32int end);
+static void flFree(FreeList *fl);
+
+static Block *cacheBumpBlock(Cache *c);
+static void heapDel(Block*);
+static void heapIns(Block*);
+static void cacheCheck(Cache*);
+static void unlinkThread(void *a);
+static void flushThread(void *a);
+static void unlinkBody(Cache *c);
+static int cacheFlushBlock(Cache *c);
+static void cacheSync(void*);
+static BList *blistAlloc(Block*);
+static void blistFree(Cache*, BList*);
+static void doRemoveLink(Cache*, BList*);
+
+/*
+ * Mapping from local block type to Venti type
+ */
+int vtType[BtMax] = {
+ VtDataType, /* BtData | 0 */
+ VtPointerType0, /* BtData | 1 */
+ VtPointerType1, /* BtData | 2 */
+ VtPointerType2, /* BtData | 3 */
+ VtPointerType3, /* BtData | 4 */
+ VtPointerType4, /* BtData | 5 */
+ VtPointerType5, /* BtData | 6 */
+ VtPointerType6, /* BtData | 7 */
+ VtDirType, /* BtDir | 0 */
+ VtPointerType0, /* BtDir | 1 */
+ VtPointerType1, /* BtDir | 2 */
+ VtPointerType2, /* BtDir | 3 */
+ VtPointerType3, /* BtDir | 4 */
+ VtPointerType4, /* BtDir | 5 */
+ VtPointerType5, /* BtDir | 6 */
+ VtPointerType6, /* BtDir | 7 */
+};
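+
+/*
+ * For example, a level-2 pointer block in a data tree has local type
+ * BtData|2 and is stored on Venti as vtType[BtData|2] == VtPointerType1;
+ * the data and directory trees share the pointer types and differ only
+ * at the leaves (VtDataType vs. VtDirType).
+ */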
+
+/*
+ * Allocate the memory cache.
+ */
+Cache *
+cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
+{
+ int i;
+ Cache *c;
+ Block *b;
+ BList *bl;
+ u8int *p;
+ int nbl;
+
+ c = vtMemAllocZ(sizeof(Cache));
+
+ /* reasonable number of BList elements */
+ nbl = nblocks * 4;
+
+ c->lk = vtLockAlloc();
+ c->ref = 1;
+ c->disk = disk;
+ c->z = z;
+ c->size = diskBlockSize(disk);
+bwatchSetBlockSize(c->size);
+ /* round c->size up to be a nice multiple */
+ c->size = (c->size + 127) & ~127;
+ c->ndmap = (c->size/20 + 7) / 8;
+ c->nblocks = nblocks;
+ c->hashSize = nblocks;
+ c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
+ c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
+ c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
+ c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
+ c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
+ c->mode = mode;
+ c->vers++;
+ p = c->mem;
+ for(i = 0; i < nblocks; i++){
+ b = &c->blocks[i];
+ b->lk = vtLockAlloc();
+ b->c = c;
+ b->data = p;
+ b->heap = i;
+ b->ioready = vtRendezAlloc(b->lk);
+ c->heap[i] = b;
+ p += c->size;
+ }
+ c->nheap = nblocks;
+ for(i = 0; i < nbl; i++){
+ bl = (BList*)p;
+ bl->next = c->blfree;
+ c->blfree = bl;
+ p += sizeof(BList);
+ }
+ /* separate loop to keep blocks and blists reasonably aligned */
+ for(i = 0; i < nblocks; i++){
+ b = &c->blocks[i];
+ b->dmap = p;
+ p += c->ndmap;
+ }
+
+ c->blrend = vtRendezAlloc(c->lk);
+
+ c->maxdirty = nblocks*(DirtyPercentage*0.01);
+
+ c->fl = flAlloc(diskSize(disk, PartData));
+
+ c->unlink = vtRendezAlloc(c->lk);
+ c->flush = vtRendezAlloc(c->lk);
+ c->flushwait = vtRendezAlloc(c->lk);
+ c->heapwait = vtRendezAlloc(c->lk);
+ c->sync = periodicAlloc(cacheSync, c, 30*1000);
+
+ if(mode == OReadWrite){
+ c->ref += 2;
+ vtThread(unlinkThread, c);
+ vtThread(flushThread, c);
+ }
+ cacheCheck(c);
+
+ return c;
+}
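+
+/*
+ * A rough usage sketch (the block count here is illustrative, not a
+ * fossil default):
+ *
+ *	c = cacheAlloc(disk, z, 1000, OReadWrite);
+ *	...cacheLocal / cacheGlobal / cacheAllocBlock...
+ *	cacheFree(c);		// flushes every dirty block, then tears down
+ */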
+
+/*
+ * Free the whole memory cache, flushing all dirty blocks to the disk.
+ */
+void
+cacheFree(Cache *c)
+{
+ int i;
+
+ /* kill off daemon threads */
+ vtLock(c->lk);
+ c->die = vtRendezAlloc(c->lk);
+ periodicKill(c->sync);
+ vtWakeup(c->flush);
+ vtWakeup(c->unlink);
+ while(c->ref > 1)
+ vtSleep(c->die);
+
+ /* flush everything out */
+ do {
+ unlinkBody(c);
+ vtUnlock(c->lk);
+ while(cacheFlushBlock(c))
+ ;
+ diskFlush(c->disk);
+ vtLock(c->lk);
+ } while(c->uhead || c->ndirty);
+ vtUnlock(c->lk);
+
+ cacheCheck(c);
+
+ for(i = 0; i < c->nblocks; i++){
+ assert(c->blocks[i].ref == 0);
+ vtRendezFree(c->blocks[i].ioready);
+ vtLockFree(c->blocks[i].lk);
+ }
+ flFree(c->fl);
+ vtMemFree(c->baddr);
+ vtMemFree(c->heads);
+ vtMemFree(c->blocks);
+ vtMemFree(c->mem);
+ vtLockFree(c->lk);
+ diskFree(c->disk);
+ vtRendezFree(c->blrend);
+ /* don't close vtSession */
+ vtMemFree(c);
+}
+
+static void
+cacheDump(Cache *c)
+{
+ int i;
+ Block *b;
+
+ for(i = 0; i < c->nblocks; i++){
+ b = &c->blocks[i];
+ fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
+ i, b->part, b->addr, b->score, b->l.type, b->ref,
+ bsStr(b->l.state), bioStr(b->iostate), b->pc);
+ }
+}
+
+static void
+cacheCheck(Cache *c)
+{
+ u32int size, now;
+ int i, k, refed;
+ static uchar zero[VtScoreSize];
+ Block *b;
+
+ size = c->size;
+ now = c->now;
+
+ for(i = 0; i < c->nheap; i++){
+ if(c->heap[i]->heap != i)
+ vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
+ if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
+ vtFatal("bad heap ordering");
+ k = (i << 1) + 1;
+ if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
+ vtFatal("bad heap ordering");
+ k++;
+ if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
+ vtFatal("bad heap ordering");
+ }
+
+ refed = 0;
+ for(i = 0; i < c->nblocks; i++){
+ b = &c->blocks[i];
+ if(b->data != &c->mem[i * size])
+ vtFatal("mis-blocked at %d", i);
+ if(b->ref && b->heap == BadHeap){
+ refed++;
+ }
+ }
+if(c->nheap + refed != c->nblocks){
+fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
+cacheDump(c);
+}
+ assert(c->nheap + refed == c->nblocks);
+ refed = 0;
+ for(i = 0; i < c->nblocks; i++){
+ b = &c->blocks[i];
+ if(b->ref){
+if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
+ refed++;
+ }
+ }
+if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
+}
+
+
+/*
+ * locate the block with the oldest second to last use.
+ * remove it from the heap, and fix up the heap.
+ */
+/* called with c->lk held */
+static Block *
+cacheBumpBlock(Cache *c)
+{
+ int printed;
+ Block *b;
+
+ /*
+ * locate the block with the oldest second to last use.
+ * remove it from the heap, and fix up the heap.
+ */
+ printed = 0;
+ if(c->nheap == 0){
+ while(c->nheap == 0){
+ vtWakeup(c->flush);
+ vtSleep(c->heapwait);
+ if(c->nheap == 0){
+ printed = 1;
+ fprint(2, "%s: entire cache is busy, %d dirty "
+ "-- waking flush thread\n",
+ argv0, c->ndirty);
+ }
+ }
+ if(printed)
+ fprint(2, "%s: cache is okay again, %d dirty\n",
+ argv0, c->ndirty);
+ }
+
+ b = c->heap[0];
+ heapDel(b);
+
+ assert(b->heap == BadHeap);
+ assert(b->ref == 0);
+ assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
+ assert(b->prior == nil);
+ assert(b->uhead == nil);
+
+ /*
+ * unchain the block from hash chain
+ */
+ if(b->prev){
+ *(b->prev) = b->next;
+ if(b->next)
+ b->next->prev = b->prev;
+ b->prev = nil;
+ }
+
+
+if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
+ /* set block to a reasonable state */
+ b->ref = 1;
+ b->part = PartError;
+ memset(&b->l, 0, sizeof(b->l));
+ b->iostate = BioEmpty;
+
+ return b;
+}
+
+/*
+ * look for a particular version of the block in the memory cache.
+ */
+static Block *
+_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
+ int waitlock, int *lockfailure)
+{
+ Block *b;
+ ulong h;
+
+ h = addr % c->hashSize;
+
+ if(lockfailure)
+ *lockfailure = 0;
+
+ /*
+ * look for the block in the cache
+ */
+ vtLock(c->lk);
+ for(b = c->heads[h]; b != nil; b = b->next){
+ if(b->part == part && b->addr == addr)
+ break;
+ }
+ if(b == nil || b->vers != vers){
+ vtUnlock(c->lk);
+ return nil;
+ }
+ if(!waitlock && !vtCanLock(b->lk)){
+ *lockfailure = 1;
+ vtUnlock(c->lk);
+ return nil;
+ }
+ heapDel(b);
+ b->ref++;
+ vtUnlock(c->lk);
+
+ bwatchLock(b);
+ if(waitlock)
+ vtLock(b->lk);
+ b->nlock = 1;
+
+ for(;;){
+ switch(b->iostate){
+ default:
+ abort();
+ case BioEmpty:
+ case BioLabel:
+ case BioClean:
+ case BioDirty:
+ if(b->vers != vers){
+ blockPut(b);
+ return nil;
+ }
+ return b;
+ case BioReading:
+ case BioWriting:
+ vtSleep(b->ioready);
+ break;
+ case BioVentiError:
+ blockPut(b);
+ vtSetError("venti i/o error block 0x%.8ux", addr);
+ return nil;
+ case BioReadError:
+ blockPut(b);
+ vtSetError("error reading block 0x%.8ux", addr);
+ return nil;
+ }
+ }
+ /* NOT REACHED */
+}
+static Block*
+cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
+{
+ return _cacheLocalLookup(c, part, addr, vers, Waitlock, 0);
+}
+
+
+/*
+ * fetch a local (on-disk) block from the memory cache.
+ * if it's not there, load it, bumping some other block.
+ */
+Block *
+_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
+{
+ Block *b;
+ ulong h;
+
+ assert(part != PartVenti);
+
+ h = addr % c->hashSize;
+
+ /*
+ * look for the block in the cache
+ */
+ vtLock(c->lk);
+ for(b = c->heads[h]; b != nil; b = b->next){
+ if(b->part != part || b->addr != addr)
+ continue;
+ if(epoch && b->l.epoch != epoch){
+fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
+ vtUnlock(c->lk);
+ vtSetError(ELabelMismatch);
+ return nil;
+ }
+ heapDel(b);
+ b->ref++;
+ break;
+ }
+
+ if(b == nil){
+ b = cacheBumpBlock(c);
+
+ b->part = part;
+ b->addr = addr;
+ localToGlobal(addr, b->score);
+
+ /* chain onto correct hash */
+ b->next = c->heads[h];
+ c->heads[h] = b;
+ if(b->next != nil)
+ b->next->prev = &b->next;
+ b->prev = &c->heads[h];
+ }
+
+ vtUnlock(c->lk);
+
+ /*
+ * BUG: what if the epoch changes right here?
+ * In the worst case, we could end up in some weird
+ * lock loop, because the block we want no longer exists,
+ * and instead we're trying to lock a block we have no
+ * business grabbing.
+ *
+ * For now, I'm not going to worry about it.
+ */
+
+if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
+ bwatchLock(b);
+ vtLock(b->lk);
+ b->nlock = 1;
+
+ if(part == PartData && b->iostate == BioEmpty){
+ if(!readLabel(c, &b->l, addr)){
+ blockPut(b);
+ return nil;
+ }
+ blockSetIOState(b, BioLabel);
+ }
+ if(epoch && b->l.epoch != epoch){
+ blockPut(b);
+fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
+ vtSetError(ELabelMismatch);
+ return nil;
+ }
+
+ b->pc = getcallerpc(&c);
+ for(;;){
+ switch(b->iostate){
+ default:
+ abort();
+ case BioLabel:
+ if(mode == OOverWrite)
+ /*
+ * leave iostate as BioLabel because data
+ * hasn't been read.
+ */
+ return b;
+ /* fall through */
+ case BioEmpty:
+ diskRead(c->disk, b);
+ vtSleep(b->ioready);
+ break;
+ case BioClean:
+ case BioDirty:
+ return b;
+ case BioReading:
+ case BioWriting:
+ vtSleep(b->ioready);
+ break;
+ case BioReadError:
+ blockSetIOState(b, BioEmpty);
+ blockPut(b);
+ vtSetError("error reading block 0x%.8ux", addr);
+ return nil;
+ }
+ }
+ /* NOT REACHED */
+}
+
+Block *
+cacheLocal(Cache *c, int part, u32int addr, int mode)
+{
+ return _cacheLocal(c, part, addr, mode, 0);
+}
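+
+/*
+ * For illustration, on-disk blocks are named by (partition, address);
+ * the super block, for instance, is block 0 of PartSuper (cf. the
+ * PartSuper handling in blockRollback below):
+ *
+ *	b = cacheLocal(c, PartSuper, 0, OReadWrite);
+ *	if(b == nil)
+ *		return 0;
+ *	superUnpack(&super, b->data);
+ *	...
+ *	blockPut(b);
+ */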
+
+/*
+ * fetch a local (on-disk) block from the memory cache.
+ * if it's not there, load it, bumping some other block.
+ * check tag and type.
+ */
+Block *
+cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
+{
+ Block *b;
+
+ b = _cacheLocal(c, PartData, addr, mode, epoch);
+ if(b == nil)
+ return nil;
+ if(b->l.type != type || b->l.tag != tag){
+ fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
+ argv0, addr, b->l.type, type, b->l.tag, tag);
+ vtSetError(ELabelMismatch);
+ blockPut(b);
+ return nil;
+ }
+ b->pc = getcallerpc(&c);
+ return b;
+}
+
+/*
+ * fetch a global (Venti) block from the memory cache.
+ * if it's not there, load it, bumping some other block.
+ * check tag and type if it's really a local block in disguise.
+ */
+Block *
+cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
+{
+ int n;
+ Block *b;
+ ulong h;
+ u32int addr;
+
+ addr = globalToLocal(score);
+ if(addr != NilBlock){
+ b = cacheLocalData(c, addr, type, tag, mode, 0);
+ if(b)
+ b->pc = getcallerpc(&c);
+ return b;
+ }
+
+ h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
+
+ /*
+ * look for the block in the cache
+ */
+ vtLock(c->lk);
+ for(b = c->heads[h]; b != nil; b = b->next){
+ if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
+ continue;
+ heapDel(b);
+ b->ref++;
+ break;
+ }
+
+ if(b == nil){
+if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);
+
+ b = cacheBumpBlock(c);
+
+ b->part = PartVenti;
+ b->addr = NilBlock;
+ b->l.type = type;
+ memmove(b->score, score, VtScoreSize);
+
+ /* chain onto correct hash */
+ b->next = c->heads[h];
+ c->heads[h] = b;
+ if(b->next != nil)
+ b->next->prev = &b->next;
+ b->prev = &c->heads[h];
+ }
+ vtUnlock(c->lk);
+
+ bwatchLock(b);
+ vtLock(b->lk);
+ b->nlock = 1;
+ b->pc = getcallerpc(&c);
+
+ switch(b->iostate){
+ default:
+ abort();
+ case BioEmpty:
+ n = vtRead(c->z, score, vtType[type], b->data, c->size);
+ if(n < 0 || !vtSha1Check(score, b->data, n)){
+ blockSetIOState(b, BioVentiError);
+ blockPut(b);
+ vtSetError(
+ "venti error reading block %V or wrong score: %r",
+ score);
+ return nil;
+ }
+ vtZeroExtend(vtType[type], b->data, n, c->size);
+ blockSetIOState(b, BioClean);
+ return b;
+ case BioClean:
+ return b;
+ case BioVentiError:
+ blockPut(b);
+ vtSetError("venti i/o error or wrong score, block %V", score);
+ return nil;
+ case BioReadError:
+ blockPut(b);
+ vtSetError("error reading block %V", b->score);
+ return nil;
+ }
+ /* NOT REACHED */
+}
+
+/*
+ * allocate a new on-disk block and load it into the memory cache.
+ * BUG: if the disk is full, should we flush some of it to Venti?
+ */
+static u32int lastAlloc;
+
+Block *
+cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
+{
+ FreeList *fl;
+ u32int addr;
+ Block *b;
+ int n, nwrap;
+ Label lab;
+
+ n = c->size / LabelSize;
+ fl = c->fl;
+
+ vtLock(fl->lk);
+ addr = fl->last;
+ b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
+ if(b == nil){
+ fprint(2, "%s: cacheAllocBlock: xxx %R\n", argv0);
+ vtUnlock(fl->lk);
+ return nil;
+ }
+ nwrap = 0;
+ for(;;){
+ if(++addr >= fl->end){
+ addr = 0;
+ if(++nwrap >= 2){
+ blockPut(b);
+ vtSetError("disk is full");
+ /*
+ * try to avoid a continuous spew of console
+ * messages.
+ */
+ if (fl->last != 0)
+ fprint(2, "%s: cacheAllocBlock: xxx1 %R\n",
+ argv0);
+ fl->last = 0;
+ vtUnlock(fl->lk);
+ return nil;
+ }
+ }
+ if(addr%n == 0){
+ blockPut(b);
+ b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
+ if(b == nil){
+ fl->last = addr;
+ fprint(2, "%s: cacheAllocBlock: xxx2 %R\n", argv0);
+ vtUnlock(fl->lk);
+ return nil;
+ }
+ }
+ if(!labelUnpack(&lab, b->data, addr%n))
+ continue;
+ if(lab.state == BsFree)
+ goto Found;
+ if(lab.state&BsClosed)
+ if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
+ goto Found;
+ }
+Found:
+ blockPut(b);
+ b = cacheLocal(c, PartData, addr, OOverWrite);
+ if(b == nil){
+ fprint(2, "%s: cacheAllocBlock: xxx3 %R\n", argv0);
+ return nil;
+ }
+assert(b->iostate == BioLabel || b->iostate == BioClean);
+ fl->last = addr;
+ lab.type = type;
+ lab.tag = tag;
+ lab.state = BsAlloc;
+ lab.epoch = epoch;
+ lab.epochClose = ~(u32int)0;
+ if(!blockSetLabel(b, &lab, 1)){
+ fprint(2, "%s: cacheAllocBlock: xxx4 %R\n", argv0);
+ blockPut(b);
+ return nil;
+ }
+ vtZeroExtend(vtType[type], b->data, 0, c->size);
+if(0)diskWrite(c->disk, b);
+
+if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
+ lastAlloc = addr;
+ fl->nused++;
+ vtUnlock(fl->lk);
+ b->pc = getcallerpc(&c);
+ return b;
+}
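+
+/*
+ * A rough allocation sketch; ehi and elo stand for the file system's
+ * current high and low epochs and are placeholders here:
+ *
+ *	b = cacheAllocBlock(c, BtData, tag, ehi, elo);
+ *	if(b == nil)
+ *		return 0;
+ *	memmove(b->data, buf, n);	// the block comes back zero-filled
+ *	blockDirty(b);
+ *	blockPut(b);
+ */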
+
+int
+cacheDirty(Cache *c)
+{
+ return c->ndirty;
+}
+
+void
+cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
+{
+ int n;
+ u32int addr, nused;
+ Block *b;
+ Label lab;
+ FreeList *fl;
+
+ fl = c->fl;
+ n = c->size / LabelSize;
+ *bsize = c->size;
+ vtLock(fl->lk);
+ if(fl->epochLow == epochLow){
+ *used = fl->nused;
+ *total = fl->end;
+ vtUnlock(fl->lk);
+ return;
+ }
+ b = nil;
+ nused = 0;
+ for(addr=0; addr<fl->end; addr++){
+ if(addr%n == 0){
+ blockPut(b);
+ b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
+ if(b == nil){
+ fprint(2, "%s: flCountUsed: loading %ux: %R\n",
+ argv0, addr/n);
+ break;
+ }
+ }
+ if(!labelUnpack(&lab, b->data, addr%n))
+ continue;
+ if(lab.state == BsFree)
+ continue;
+ if(lab.state&BsClosed)
+ if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
+ continue;
+ nused++;
+ }
+ blockPut(b);
+ if(addr == fl->end){
+ fl->nused = nused;
+ fl->epochLow = epochLow;
+ }
+ *used = nused;
+ *total = fl->end;
+ vtUnlock(fl->lk);
+ return;
+}
+
+static FreeList *
+flAlloc(u32int end)
+{
+ FreeList *fl;
+
+ fl = vtMemAllocZ(sizeof(*fl));
+ fl->lk = vtLockAlloc();
+ fl->last = 0;
+ fl->end = end;
+ return fl;
+}
+
+static void
+flFree(FreeList *fl)
+{
+ vtLockFree(fl->lk);
+ vtMemFree(fl);
+}
+
+u32int
+cacheLocalSize(Cache *c, int part)
+{
+ return diskSize(c->disk, part);
+}
+
+/*
+ * The thread that has locked b may refer to it by
+ * multiple names. Nlock counts the number of
+ * references the locking thread holds. It will call
+ * blockPut once per reference.
+ */
+void
+blockDupLock(Block *b)
+{
+ assert(b->nlock > 0);
+ b->nlock++;
+}
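+
+/*
+ * For illustration (f is a placeholder for whatever holds the second
+ * name): a thread that will put the block once per name bumps nlock first:
+ *
+ *	blockDupLock(b);	// nlock: 1 -> 2
+ *	f->b = b;
+ *	blockPut(b);		// nlock: 2 -> 1; block stays locked
+ *	...
+ *	blockPut(f->b);		// nlock reaches 0; block is unlocked
+ */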
+
+/*
+ * we're done with the block.
+ * unlock it. can't use it after calling this.
+ */
+void
+blockPut(Block* b)
+{
+ Cache *c;
+
+ if(b == nil)
+ return;
+
+if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));
+
+ if(b->iostate == BioDirty)
+ bwatchDependency(b);
+
+ if(--b->nlock > 0)
+ return;
+
+ /*
+ * b->nlock should probably stay at zero while
+ * the block is unlocked, but diskThread and vtSleep
+ * conspire to assume that they can just vtLock(b->lk); blockPut(b),
+ * so we have to keep b->nlock set to 1 even
+ * when the block is unlocked.
+ */
+ assert(b->nlock == 0);
+ b->nlock = 1;
+// b->pc = 0;
+
+ bwatchUnlock(b);
+ vtUnlock(b->lk);
+ c = b->c;
+ vtLock(c->lk);
+
+ if(--b->ref > 0){
+ vtUnlock(c->lk);
+ return;
+ }
+
+ assert(b->ref == 0);
+ switch(b->iostate){
+ default:
+ b->used = c->now++;
+ heapIns(b);
+ break;
+ case BioEmpty:
+ case BioLabel:
+ if(c->nheap == 0)
+ b->used = c->now++;
+ else
+ b->used = c->heap[0]->used;
+ heapIns(b);
+ break;
+ case BioDirty:
+ break;
+ }
+ vtUnlock(c->lk);
+}
+
+/*
+ * set the label associated with a block.
+ */
+Block*
+_blockSetLabel(Block *b, Label *l)
+{
+ int lpb;
+ Block *bb;
+ u32int a;
+ Cache *c;
+
+ c = b->c;
+
+ assert(b->part == PartData);
+ assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
+ lpb = c->size / LabelSize;
+ a = b->addr / lpb;
+ bb = cacheLocal(c, PartLabel, a, OReadWrite);
+ if(bb == nil){
+ blockPut(b);
+ return nil;
+ }
+ b->l = *l;
+ labelPack(l, bb->data, b->addr%lpb);
+ blockDirty(bb);
+ return bb;
+}
+
+int
+blockSetLabel(Block *b, Label *l, int allocating)
+{
+ Block *lb;
+ Label oldl;
+
+ oldl = b->l;
+ lb = _blockSetLabel(b, l);
+ if(lb == nil)
+ return 0;
+
+ /*
+ * If we're allocating the block, make sure the label (bl)
+ * goes to disk before the data block (b) itself. This is to help
+ * the blocks that in turn depend on b.
+ *
+ * Suppose bx depends on (must be written out after) b.
+ * Once we write b we'll think it's safe to write bx.
+ * Bx can't get at b unless it has a valid label, though.
+ *
+ * Allocation is the only case in which having a current label
+ * is vital because:
+ *
+ * - l.type is set at allocation and never changes.
+ * - l.tag is set at allocation and never changes.
+ * - l.state is not checked when we load blocks.
+ * - the archiver cares deeply about l.state being
+ * BaActive vs. BaCopied, but that's handled
+ * by direct calls to _blockSetLabel.
+ */
+
+ if(allocating)
+ blockDependency(b, lb, -1, nil, nil);
+ blockPut(lb);
+ return 1;
+}
+
+/*
+ * Record that bb must be written out before b.
+ * If index is given, we're about to overwrite the score/e
+ * at that index in the block. Save the old value so we
+ * can write a safer ``old'' version of the block if pressed.
+ */
+void
+blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
+{
+ BList *p;
+
+ if(bb->iostate == BioClean)
+ return;
+
+ /*
+ * Dependencies for blocks containing Entry structures
+ * or scores must always be explained. The problem with
+ * only explaining some of them is this. Suppose we have two
+ * dependencies for the same field, the first explained
+ * and the second not. We try to write the block when the first
+ * dependency is not written but the second is. We will roll back
+ * the first change even though the second trumps it.
+ */
+ if(index == -1 && bb->part == PartData)
+ assert(b->l.type == BtData);
+
+ if(bb->iostate != BioDirty){
+ fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
+ argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
+ abort();
+ }
+
+ p = blistAlloc(bb);
+ if(p == nil)
+ return;
+
+ assert(bb->iostate == BioDirty);
+if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
+
+ p->part = bb->part;
+ p->addr = bb->addr;
+ p->type = bb->l.type;
+ p->vers = bb->vers;
+ p->index = index;
+ if(p->index >= 0){
+ /*
+ * This test would just be b->l.type==BtDir except
+ * we need to exclude the super block.
+ */
+ if(b->l.type == BtDir && b->part == PartData)
+ entryPack(e, p->old.entry, 0);
+ else
+ memmove(p->old.score, score, VtScoreSize);
+ }
+ p->next = b->prior;
+ b->prior = p;
+}
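+
+/*
+ * A rough sketch of a caller (e, enew and i are placeholders): before
+ * overwriting entry i of a BtDir block b with a pointer to a dirty child
+ * bb, record the dependency, which saves the old entry so blockRollback
+ * can emit a safe version of b while bb is still unwritten:
+ *
+ *	blockDependency(b, bb, i, nil, &e);	// e holds the entry now at slot i
+ *	entryPack(&enew, b->data, i);		// install the new entry
+ *	blockDirty(b);
+ */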
+
+/*
+ * Mark an in-memory block as dirty. If there are too many
+ * dirty blocks, start writing some out to disk.
+ *
+ * If there were way too many dirty blocks, we used to
+ * try to do some flushing ourselves, but it's just too dangerous --
+ * it implies that the callers cannot have any of our priors locked,
+ * but this is hard to avoid in some cases.
+ */
+int
+blockDirty(Block *b)
+{
+ Cache *c;
+
+ c = b->c;
+
+ assert(b->part != PartVenti);
+
+ if(b->iostate == BioDirty)
+ return 1;
+ assert(b->iostate == BioClean || b->iostate == BioLabel);
+
+ vtLock(c->lk);
+ b->iostate = BioDirty;
+ c->ndirty++;
+ if(c->ndirty > (c->maxdirty>>1))
+ vtWakeup(c->flush);
+ vtUnlock(c->lk);
+
+ return 1;
+}
+
+/*
+ * We've decided to write out b. Maybe b has some pointers to blocks
+ * that haven't yet been written to disk. If so, construct a slightly out-of-date
+ * copy of b that is safe to write out. (diskThread will make sure the block
+ * remains marked as dirty.)
+ */
+uchar *
+blockRollback(Block *b, uchar *buf)
+{
+ u32int addr;
+ BList *p;
+ Super super;
+
+ /* easy case */
+ if(b->prior == nil)
+ return b->data;
+
+ memmove(buf, b->data, b->c->size);
+ for(p=b->prior; p; p=p->next){
+ /*
+ * we know p->index >= 0 because blockWrite has vetted this block for us.
+ */
+ assert(p->index >= 0);
+ assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
+ if(b->part == PartSuper){
+ assert(p->index == 0);
+ superUnpack(&super, buf);
+ addr = globalToLocal(p->old.score);
+ if(addr == NilBlock){
+ fprint(2, "%s: rolling back super block: "
+ "bad replacement addr %V\n",
+ argv0, p->old.score);
+ abort();
+ }
+ super.active = addr;
+ superPack(&super, buf);
+ continue;
+ }
+ if(b->l.type == BtDir)
+ memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
+ else
+ memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
+ }
+ return buf;
+}
+
+/*
+ * Try to write block b.
+ * If b depends on other blocks:
+ *
+ * If the block has been written out, remove the dependency.
+ * If the dependency is replaced by a more recent dependency,
+ * throw it out.
+ * If we know how to write out an old version of b that doesn't
+ * depend on it, do that.
+ *
+ * Otherwise, bail.
+ */
+int
+blockWrite(Block *b, int waitlock)
+{
+ uchar *dmap;
+ Cache *c;
+ BList *p, **pp;
+ Block *bb;
+ int lockfail;
+
+ c = b->c;
+
+ if(b->iostate != BioDirty)
+ return 1;
+
+ dmap = b->dmap;
+ memset(dmap, 0, c->ndmap);
+ pp = &b->prior;
+ for(p=*pp; p; p=*pp){
+ if(p->index >= 0){
+ /* more recent dependency has succeeded; this one can go */
+ if(dmap[p->index/8] & (1<<(p->index%8)))
+ goto ignblock;
+ }
+
+ lockfail = 0;
+ bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
+ &lockfail);
+ if(bb == nil){
+ if(lockfail)
+ return 0;
+ /* block not in cache => was written already */
+			if(p->index >= 0)
+				dmap[p->index/8] |= 1<<(p->index%8);
+ goto ignblock;
+ }
+
+ /*
+ * same version of block is still in cache.
+ *
+ * the assertion is true because the block still has version p->vers,
+ * which means it hasn't been written out since we last saw it.
+ */
+ if(bb->iostate != BioDirty){
+ fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
+ argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
+ /* probably BioWriting if it happens? */
+ if(bb->iostate == BioClean)
+ goto ignblock;
+ }
+
+ blockPut(bb);
+
+ if(p->index < 0){
+ /*
+ * We don't know how to temporarily undo
+ * b's dependency on bb, so just don't write b yet.
+ */
+ if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
+ argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
+ return 0;
+ }
+ /* keep walking down the list */
+ pp = &p->next;
+ continue;
+
+ignblock:
+ *pp = p->next;
+ blistFree(c, p);
+ continue;
+ }
+
+ /*
+ * DiskWrite must never be called with a double-locked block.
+ * This call to diskWrite is okay because blockWrite is only called
+ * from the cache flush thread, which never double-locks a block.
+ */
+ diskWrite(c->disk, b);
+ return 1;
+}
+
+/*
+ * Change the I/O state of block b.
+ * Just an assignment except for magic in
+ * switch statement (read comments there).
+ */
+void
+blockSetIOState(Block *b, int iostate)
+{
+ int dowakeup;
+ Cache *c;
+ BList *p, *q;
+
+if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
+
+ c = b->c;
+
+ dowakeup = 0;
+ switch(iostate){
+ default:
+ abort();
+ case BioEmpty:
+ assert(!b->uhead);
+ break;
+ case BioLabel:
+ assert(!b->uhead);
+ break;
+ case BioClean:
+ bwatchDependency(b);
+ /*
+ * If b->prior is set, it means a write just finished.
+ * The prior list isn't needed anymore.
+ */
+ for(p=b->prior; p; p=q){
+ q = p->next;
+ blistFree(c, p);
+ }
+ b->prior = nil;
+ /*
+ * Freeing a block or just finished a write.
+ * Move the blocks from the per-block unlink
+ * queue to the cache unlink queue.
+ */
+ if(b->iostate == BioDirty || b->iostate == BioWriting){
+ vtLock(c->lk);
+ c->ndirty--;
+ b->iostate = iostate; /* change here to keep in sync with ndirty */
+ b->vers = c->vers++;
+ if(b->uhead){
+ /* add unlink blocks to unlink queue */
+ if(c->uhead == nil){
+ c->uhead = b->uhead;
+ vtWakeup(c->unlink);
+ }else
+ c->utail->next = b->uhead;
+ c->utail = b->utail;
+ b->uhead = nil;
+ }
+ vtUnlock(c->lk);
+ }
+ assert(!b->uhead);
+ dowakeup = 1;
+ break;
+ case BioDirty:
+ /*
+ * Wrote out an old version of the block (see blockRollback).
+ * Bump a version count, leave it dirty.
+ */
+ if(b->iostate == BioWriting){
+ vtLock(c->lk);
+ b->vers = c->vers++;
+ vtUnlock(c->lk);
+ dowakeup = 1;
+ }
+ break;
+ case BioReading:
+ case BioWriting:
+ /*
+ * Adding block to disk queue. Bump reference count.
+ * diskThread decs the count later by calling blockPut.
+ * This is here because we need to lock c->lk to
+ * manipulate the ref count.
+ */
+ vtLock(c->lk);
+ b->ref++;
+ vtUnlock(c->lk);
+ break;
+ case BioReadError:
+ case BioVentiError:
+ /*
+ * Oops.
+ */
+ dowakeup = 1;
+ break;
+ }
+ b->iostate = iostate;
+ /*
+ * Now that the state has changed, we can wake the waiters.
+ */
+ if(dowakeup)
+ vtWakeupAll(b->ioready);
+}
+
+/*
+ * The active file system is a tree of blocks.
+ * When we add snapshots to the mix, the entire file system
+ * becomes a dag and thus requires a bit more care.
+ *
+ * The life of the file system is divided into epochs. A snapshot
+ * ends one epoch and begins the next. Each file system block
+ * is marked with the epoch in which it was created (b.epoch).
+ * When the block is unlinked from the file system (closed), it is marked
+ * with the epoch in which it was removed (b.epochClose).
+ * Once we have discarded or archived all snapshots up to
+ * b.epochClose, we can reclaim the block.
+ *
+ * If a block was created in a past epoch but is not yet closed,
+ * it is treated as copy-on-write. Of course, in order to insert the
+ * new pointer into the tree, the parent must be made writable,
+ * and so on up the tree. The recursion stops because the root
+ * block is always writable.
+ *
+ * If blocks are never closed, they will never be reused, and
+ * we will run out of disk space. But marking a block as closed
+ * requires some care about dependencies and write orderings.
+ *
+ * (1) If a block p points at a copy-on-write block b and we
+ * copy b to create bb, then p must be written out after bb and
+ * lbb (bb's label block).
+ *
+ * (2) We have to mark b as closed, but only after we switch
+ * the pointer, so lb must be written out after p. In fact, we
+ * can't even update the in-memory copy, or the cache might
+ * mistakenly give out b for reuse before p gets written.
+ *
+ * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
+ * The caller is expected to record a "p after bb" dependency
+ * to finish (1), and also expected to call blockRemoveLink
+ * to arrange for (2) to happen once p is written.
+ *
+ * Until (2) happens, some pieces of the code (e.g., the archiver)
+ * still need to know whether a block has been copied, so we
+ * set the BsCopied bit in the label and force that to disk *before*
+ * the copy gets written out.
+ */
+Block*
+blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
+{
+ Block *bb, *lb;
+ Label l;
+
+ if((b->l.state&BsClosed) || b->l.epoch >= ehi)
+ fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
+ argv0, b->addr, &b->l, elo, ehi);
+
+ bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
+ if(bb == nil){
+ blockPut(b);
+ return nil;
+ }
+
+ /*
+ * Update label so we know the block has been copied.
+ * (It will be marked closed once it has been unlinked from
+ * the tree.) This must follow cacheAllocBlock since we
+ * can't be holding onto lb when we call cacheAllocBlock.
+ */
+ if((b->l.state&BsCopied)==0)
+ if(b->part == PartData){ /* not the superblock */
+ l = b->l;
+ l.state |= BsCopied;
+ lb = _blockSetLabel(b, &l);
+ if(lb == nil){
+ /* can't set label => can't copy block */
+ blockPut(b);
+ l.type = BtMax;
+ l.state = BsFree;
+ l.epoch = 0;
+ l.epochClose = 0;
+ l.tag = 0;
+ blockSetLabel(bb, &l, 0);
+ blockPut(bb);
+ return nil;
+ }
+ blockDependency(bb, lb, -1, nil, nil);
+ blockPut(lb);
+ }
+
+ memmove(bb->data, b->data, b->c->size);
+ blockDirty(bb);
+ blockPut(b);
+ return bb;
+}
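+
+/*
+ * A caller-side sketch of the protocol described above (p, i, oldscore,
+ * oldaddr, oldtype and oldtag are placeholders): swap the parent's pointer
+ * to the copy, record the "p after bb" dependency, and queue the old block
+ * to be closed once p reaches disk:
+ *
+ *	bb = blockCopy(b, tag, ehi, elo);	// puts b whether or not it succeeds
+ *	if(bb == nil)
+ *		return 0;
+ *	memmove(oldscore, p->data + i*VtScoreSize, VtScoreSize);
+ *	memmove(p->data + i*VtScoreSize, bb->score, VtScoreSize);
+ *	blockDependency(p, bb, i, oldscore, nil);
+ *	blockDirty(p);
+ *	blockRemoveLink(p, oldaddr, oldtype, oldtag, 0);
+ */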
+
+/*
+ * Block b once pointed at the block bb at addr/type/tag, but no longer does.
+ * If recurse is set, we are unlinking all of bb's children as well.
+ *
+ * We can't reclaim bb (or its kids) until the block b gets written to disk. We add
+ * the relevant information to b's list of unlinked blocks. Once b is written,
+ * the list will be queued for processing.
+ *
+ * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
+ */
+void
+blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
+{
+ BList *p, **pp, bl;
+
+ /* remove bb from prior list */
+ for(pp=&b->prior; (p=*pp)!=nil; ){
+ if(p->part == PartData && p->addr == addr){
+ *pp = p->next;
+ blistFree(b->c, p);
+ }else
+ pp = &p->next;
+ }
+
+ bl.part = PartData;
+ bl.addr = addr;
+ bl.type = type;
+ bl.tag = tag;
+ if(b->l.epoch == 0)
+ assert(b->part == PartSuper);
+ bl.epoch = b->l.epoch;
+ bl.next = nil;
+ bl.recurse = recurse;
+
+ if(b->part == PartSuper && b->iostate == BioClean)
+ p = nil;
+ else
+ p = blistAlloc(b);
+ if(p == nil){
+ /*
+ * b has already been written to disk.
+ */
+ doRemoveLink(b->c, &bl);
+ return;
+ }
+
+ /* Uhead is only processed when the block goes from Dirty -> Clean */
+ assert(b->iostate == BioDirty);
+
+ *p = bl;
+ if(b->uhead == nil)
+ b->uhead = p;
+ else
+ b->utail->next = p;
+ b->utail = p;
+}
+
+/*
+ * Process removal of a single block and perhaps its children.
+ */
+static void
+doRemoveLink(Cache *c, BList *p)
+{
+ int i, n, recurse;
+ u32int a;
+ Block *b;
+ Label l;
+ BList bl;
+
+ recurse = (p->recurse && p->type != BtData && p->type != BtDir);
+
+ /*
+ * We're not really going to overwrite b, but if we're not
+ * going to look at its contents, there is no point in reading
+ * them from the disk.
+ */
+ b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
+ if(b == nil)
+ return;
+
+ /*
+ * When we're unlinking from the superblock, close with the next epoch.
+ */
+ if(p->epoch == 0)
+ p->epoch = b->l.epoch+1;
+
+ /* sanity check */
+ if(b->l.epoch > p->epoch){
+ fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
+ argv0, b->l.epoch, p->epoch);
+ blockPut(b);
+ return;
+ }
+
+ if(recurse){
+ n = c->size / VtScoreSize;
+ for(i=0; i<n; i++){
+ a = globalToLocal(b->data + i*VtScoreSize);
+ if(a == NilBlock || !readLabel(c, &l, a))
+ continue;
+ if(l.state&BsClosed)
+ continue;
+ /*
+ * If stack space becomes an issue...
+ p->addr = a;
+ p->type = l.type;
+ p->tag = l.tag;
+ doRemoveLink(c, p);
+ */
+
+ bl.part = PartData;
+ bl.addr = a;
+ bl.type = l.type;
+ bl.tag = l.tag;
+ bl.epoch = p->epoch;
+ bl.next = nil;
+ bl.recurse = 1;
+ /* give up the block lock - share with others */
+ blockPut(b);
+ doRemoveLink(c, &bl);
+ b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
+ if(b == nil){
+ fprint(2, "%s: warning: lost block in doRemoveLink\n",
+ argv0);
+ return;
+ }
+ }
+ }
+
+ l = b->l;
+ l.state |= BsClosed;
+ l.epochClose = p->epoch;
+ if(l.epochClose == l.epoch){
+ vtLock(c->fl->lk);
+ if(l.epoch == c->fl->epochLow)
+ c->fl->nused--;
+ blockSetLabel(b, &l, 0);
+ vtUnlock(c->fl->lk);
+ }else
+ blockSetLabel(b, &l, 0);
+ blockPut(b);
+}
+
+/*
+ * Allocate a BList so that we can record a dependency
+ * or queue a removal related to block b.
+ * If we can't find a BList, we write out b and return nil.
+ */
+static BList *
+blistAlloc(Block *b)
+{
+ Cache *c;
+ BList *p;
+
+ if(b->iostate != BioDirty){
+ /*
+ * should not happen anymore -
+ * blockDirty used to flush but no longer does.
+ */
+ assert(b->iostate == BioClean);
+ fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
+ return nil;
+ }
+
+ c = b->c;
+ vtLock(c->lk);
+ if(c->blfree == nil){
+ /*
+ * No free BLists. What are our options?
+ */
+
+ /* Block has no priors? Just write it. */
+ if(b->prior == nil){
+ vtUnlock(c->lk);
+ diskWriteAndWait(c->disk, b);
+ return nil;
+ }
+
+ /*
+ * Wake the flush thread, which will hopefully free up
+ * some BLists for us. We used to flush a block from
+ * our own prior list and reclaim that BList, but this is
+ * a no-no: some of the blocks on our prior list may
+ * be locked by our caller. Or maybe their label blocks
+ * are locked by our caller. In any event, it's too hard
+ * to make sure we can do I/O for ourselves. Instead,
+ * we assume the flush thread will find something.
+ * (The flush thread never blocks waiting for a block,
+ * so it can't deadlock like we can.)
+ */
+ while(c->blfree == nil){
+ vtWakeup(c->flush);
+ vtSleep(c->blrend);
+ if(c->blfree == nil)
+ fprint(2, "%s: flushing for blists\n", argv0);
+ }
+ }
+
+ p = c->blfree;
+ c->blfree = p->next;
+ vtUnlock(c->lk);
+ return p;
+}
+
+static void
+blistFree(Cache *c, BList *bl)
+{
+ vtLock(c->lk);
+ bl->next = c->blfree;
+ c->blfree = bl;
+ vtWakeup(c->blrend);
+ vtUnlock(c->lk);
+}
+
+char*
+bsStr(int state)
+{
+ static char s[100];
+
+ if(state == BsFree)
+ return "Free";
+ if(state == BsBad)
+ return "Bad";
+
+ sprint(s, "%x", state);
+ if(!(state&BsAlloc))
+ strcat(s, ",Free"); /* should not happen */
+ if(state&BsCopied)
+ strcat(s, ",Copied");
+ if(state&BsVenti)
+ strcat(s, ",Venti");
+ if(state&BsClosed)
+ strcat(s, ",Closed");
+ return s;
+}
+
+char *
+bioStr(int iostate)
+{
+ switch(iostate){
+ default:
+ return "Unknown!!";
+ case BioEmpty:
+ return "Empty";
+ case BioLabel:
+ return "Label";
+ case BioClean:
+ return "Clean";
+ case BioDirty:
+ return "Dirty";
+ case BioReading:
+ return "Reading";
+ case BioWriting:
+ return "Writing";
+ case BioReadError:
+ return "ReadError";
+ case BioVentiError:
+ return "VentiError";
+ case BioMax:
+ return "Max";
+ }
+}
+
+static char *bttab[] = {
+ "BtData",
+ "BtData+1",
+ "BtData+2",
+ "BtData+3",
+ "BtData+4",
+ "BtData+5",
+ "BtData+6",
+ "BtData+7",
+ "BtDir",
+ "BtDir+1",
+ "BtDir+2",
+ "BtDir+3",
+ "BtDir+4",
+ "BtDir+5",
+ "BtDir+6",
+ "BtDir+7",
+};
+
+char*
+btStr(int type)
+{
+ if(type < nelem(bttab))
+ return bttab[type];
+ return "unknown";
+}
+
+int
+labelFmt(Fmt *f)
+{
+ Label *l;
+
+ l = va_arg(f->args, Label*);
+ return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
+ btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
+}
+
+int
+scoreFmt(Fmt *f)
+{
+ uchar *v;
+ int i;
+ u32int addr;
+
+ v = va_arg(f->args, uchar*);
+ if(v == nil){
+ fmtprint(f, "*");
+ }else if((addr = globalToLocal(v)) != NilBlock)
+ fmtprint(f, "0x%.8ux", addr);
+ else{
+ for(i = 0; i < VtScoreSize; i++)
+ fmtprint(f, "%2.2ux", v[i]);
+ }
+
+ return 0;
+}
+
+static int
+upHeap(int i, Block *b)
+{
+ Block *bb;
+ u32int now;
+ int p;
+ Cache *c;
+
+ c = b->c;
+ now = c->now;
+ for(; i != 0; i = p){
+ p = (i - 1) >> 1;
+ bb = c->heap[p];
+ if(b->used - now >= bb->used - now)
+ break;
+ c->heap[i] = bb;
+ bb->heap = i;
+ }
+ c->heap[i] = b;
+ b->heap = i;
+
+ return i;
+}
+
+static int
+downHeap(int i, Block *b)
+{
+ Block *bb;
+ u32int now;
+ int k;
+ Cache *c;
+
+ c = b->c;
+ now = c->now;
+ for(; ; i = k){
+ k = (i << 1) + 1;
+ if(k >= c->nheap)
+ break;
+ if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
+ k++;
+ bb = c->heap[k];
+ if(b->used - now <= bb->used - now)
+ break;
+ c->heap[i] = bb;
+ bb->heap = i;
+ }
+ c->heap[i] = b;
+ b->heap = i;
+ return i;
+}
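+
+/*
+ * A worked example of the ordering used above: blocks are keyed by the
+ * unsigned difference b->used - now, so an older block has the smaller
+ * key and rises toward heap[0] even after c->now wraps.  With now = 5
+ * just after a wrap, a block stamped used = 0xFFFFFFFD (age 8, key
+ * 0xFFFFFFF8) still sorts ahead of one stamped used = 3 (age 2, key
+ * 0xFFFFFFFE).
+ */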
+
+/*
+ * Delete a block from the heap.
+ * Called with c->lk held.
+ */
+static void
+heapDel(Block *b)
+{
+ int i, si;
+ Cache *c;
+
+ c = b->c;
+
+ si = b->heap;
+ if(si == BadHeap)
+ return;
+ b->heap = BadHeap;
+ c->nheap--;
+ if(si == c->nheap)
+ return;
+ b = c->heap[c->nheap];
+ i = upHeap(si, b);
+ if(i == si)
+ downHeap(i, b);
+}
+
+/*
+ * Insert a block into the heap.
+ * Called with c->lk held.
+ */
+static void
+heapIns(Block *b)
+{
+ assert(b->heap == BadHeap);
+ upHeap(b->c->nheap++, b);
+ vtWakeup(b->c->heapwait);
+}
+
+/*
+ * Get just the label for a block.
+ */
+int
+readLabel(Cache *c, Label *l, u32int addr)
+{
+ int lpb;
+ Block *b;
+ u32int a;
+
+ lpb = c->size / LabelSize;
+ a = addr / lpb;
+ b = cacheLocal(c, PartLabel, a, OReadOnly);
+ if(b == nil){
+ blockPut(b);
+ return 0;
+ }
+
+ if(!labelUnpack(l, b->data, addr%lpb)){
+ blockPut(b);
+ return 0;
+ }
+ blockPut(b);
+ return 1;
+}
+
+/*
+ * Process unlink queue.
+ * Called with c->lk held.
+ */
+static void
+unlinkBody(Cache *c)
+{
+ BList *p;
+
+ while(c->uhead != nil){
+ p = c->uhead;
+ c->uhead = p->next;
+ vtUnlock(c->lk);
+ doRemoveLink(c, p);
+ vtLock(c->lk);
+ p->next = c->blfree;
+ c->blfree = p;
+ }
+}
+
+/*
+ * Occasionally unlink the blocks on the cache unlink queue.
+ */
+static void
+unlinkThread(void *a)
+{
+ Cache *c = a;
+
+ vtThreadSetName("unlink");
+
+ vtLock(c->lk);
+ for(;;){
+ while(c->uhead == nil && c->die == nil)
+ vtSleep(c->unlink);
+ if(c->die != nil)
+ break;
+ unlinkBody(c);
+ }
+ c->ref--;
+ vtWakeup(c->die);
+ vtUnlock(c->lk);
+}
+
+static int
+baddrCmp(void *a0, void *a1)
+{
+ BAddr *b0, *b1;
+ b0 = a0;
+ b1 = a1;
+
+ if(b0->part < b1->part)
+ return -1;
+ if(b0->part > b1->part)
+ return 1;
+ if(b0->addr < b1->addr)
+ return -1;
+ if(b0->addr > b1->addr)
+ return 1;
+ return 0;
+}
+
+/*
+ * Scan the block list for dirty blocks; add them to the list c->baddr.
+ */
+static void
+flushFill(Cache *c)
+{
+ int i, ndirty;
+ BAddr *p;
+ Block *b;
+
+ vtLock(c->lk);
+ if(c->ndirty == 0){
+ vtUnlock(c->lk);
+ return;
+ }
+
+ p = c->baddr;
+ ndirty = 0;
+ for(i=0; i<c->nblocks; i++){
+ b = c->blocks + i;
+ if(b->part == PartError)
+ continue;
+ if(b->iostate == BioDirty || b->iostate == BioWriting)
+ ndirty++;
+ if(b->iostate != BioDirty)
+ continue;
+ p->part = b->part;
+ p->addr = b->addr;
+ p->vers = b->vers;
+ p++;
+ }
+ if(ndirty != c->ndirty){
+ fprint(2, "%s: ndirty mismatch expected %d found %d\n",
+ argv0, c->ndirty, ndirty);
+ c->ndirty = ndirty;
+ }
+ vtUnlock(c->lk);
+
+ c->bw = p - c->baddr;
+ qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
+}
+
+/*
+ * This is not thread safe, i.e. it can't be called from multiple threads.
+ *
+ * It's okay how we use it, because it only gets called from
+ * the flushThread, and from cacheFree, but only after
+ * cacheFree has killed off the flushThread.
+ */
+static int
+cacheFlushBlock(Cache *c)
+{
+ Block *b;
+ BAddr *p;
+ int lockfail, nfail;
+
+ nfail = 0;
+ for(;;){
+ if(c->br == c->be){
+ if(c->bw == 0 || c->bw == c->be)
+ flushFill(c);
+ c->br = 0;
+ c->be = c->bw;
+ c->bw = 0;
+ c->nflush = 0;
+ }
+
+ if(c->br == c->be)
+ return 0;
+ p = c->baddr + c->br;
+ c->br++;
+ b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
+ &lockfail);
+
+ if(b && blockWrite(b, Nowaitlock)){
+ c->nflush++;
+ blockPut(b);
+ return 1;
+ }
+ if(b)
+ blockPut(b);
+
+ /*
+ * Why didn't we write the block?
+ */
+
+ /* Block already written out */
+ if(b == nil && !lockfail)
+ continue;
+
+ /* Failed to acquire lock; sleep if happens a lot. */
+ if(lockfail && ++nfail > 100){
+ sleep(500);
+ nfail = 0;
+ }
+ /* Requeue block. */
+ if(c->bw < c->be)
+ c->baddr[c->bw++] = *p;
+ }
+}
+
+/*
+ * Occasionally flush dirty blocks from memory to the disk.
+ */
+static void
+flushThread(void *a)
+{
+ Cache *c = a;
+ int i;
+
+ vtThreadSetName("flush");
+ vtLock(c->lk);
+ while(c->die == nil){
+ vtSleep(c->flush);
+ vtUnlock(c->lk);
+ for(i=0; i<FlushSize; i++)
+ if(!cacheFlushBlock(c)){
+ /*
+ * If i==0, could be someone is waking us repeatedly
+ * to flush the cache but there's no work to do.
+ * Pause a little.
+ */
+ if(i==0){
+ // fprint(2, "%s: flushthread found "
+ // "nothing to flush - %d dirty\n",
+ // argv0, c->ndirty);
+ sleep(250);
+ }
+ break;
+ }
+ if(i==0 && c->ndirty){
+ /*
+ * All the blocks are being written right now -- there's nothing to do.
+ * We might be spinning with cacheFlush though -- he'll just keep
+ * kicking us until c->ndirty goes down. Probably we should sleep
+ * on something that the diskThread can kick, but for now we'll
+ * just pause for a little while waiting for disks to finish.
+ */
+ sleep(100);
+ }
+ vtLock(c->lk);
+ vtWakeupAll(c->flushwait);
+ }
+ c->ref--;
+ vtWakeup(c->die);
+ vtUnlock(c->lk);
+}
+
+/*
+ * Flush the cache.
+ */
+void
+cacheFlush(Cache *c, int wait)
+{
+ vtLock(c->lk);
+ if(wait){
+ while(c->ndirty){
+ // consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
+ // c->ndirty, c->uhead);
+ vtWakeup(c->flush);
+ vtSleep(c->flushwait);
+ }
+	// consPrint("cacheFlush: done, %d dirty (uhead %p)\n", c->ndirty, c->uhead);
+ }else if(c->ndirty)
+ vtWakeup(c->flush);
+ vtUnlock(c->lk);
+}
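+
+/*
+ * For illustration, a caller that needs everything on disk (a sync or an
+ * orderly shutdown, say) uses the blocking form, while the periodic
+ * cacheSync below merely kicks the flush thread:
+ *
+ *	cacheFlush(c, 1);	// returns only once c->ndirty reaches 0
+ */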
+
+/*
+ * Kick the flushThread every 30 seconds.
+ */
+static void
+cacheSync(void *v)
+{
+ Cache *c;
+
+ c = v;
+ cacheFlush(c, 0);
+}