/* * The locking here is getting a little out of hand. */ #include "stdinc.h" #include "dat.h" #include "fns.h" typedef struct DCache DCache; enum { HashLog = 9, HashSize = 1<<HashLog, HashMask = HashSize - 1, }; struct DCache { QLock lock; RWLock dirtylock; /* must be held to inspect or set b->dirty */ u32int flushround; Rendez anydirty; Rendez full; Rendez flush; Rendez flushdone; DBlock *free; /* list of available lumps */ u32int now; /* ticks for usage timestamps */ int size; /* max. size of any block; allocated to each block */ DBlock **heads; /* hash table for finding address */ int nheap; /* number of available victims */ DBlock **heap; /* heap for locating victims */ int nblocks; /* number of blocks allocated */ DBlock *blocks; /* array of block descriptors */ DBlock **write; /* array of block pointers to be written */ u8int *mem; /* memory for all block descriptors */ int ndirty; /* number of dirty blocks */ int maxdirty; /* max. number of dirty blocks */ }; static DCache dcache; static int downheap(int i, DBlock *b); static int upheap(int i, DBlock *b); static DBlock *bumpdblock(void); static void delheap(DBlock *db); static void fixheap(int i, DBlock *b); static void _flushdcache(void); static void flushproc(void*); static void flushtimerproc(void*); static void writeproc(void*); void initdcache(u32int mem) { DBlock *b, *last; u32int nblocks, blocksize; int i; if(mem < maxblocksize * 2) sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2); if(maxblocksize == 0) sysfatal("no max. block size given for disk cache"); blocksize = maxblocksize; nblocks = mem / blocksize; dcache.full.l = &dcache.lock; dcache.flush.l = &dcache.lock; dcache.anydirty.l = &dcache.lock; dcache.flushdone.l = &dcache.lock; dcache.nblocks = nblocks; dcache.maxdirty = (nblocks * 3) / 4; if(1) fprint(2, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n", nblocks, blocksize, dcache.maxdirty); dcache.size = blocksize; dcache.heads = MKNZ(DBlock*, HashSize); dcache.heap = MKNZ(DBlock*, nblocks); dcache.blocks = MKNZ(DBlock, nblocks); dcache.write = MKNZ(DBlock*, nblocks); dcache.mem = MKNZ(u8int, nblocks * blocksize); last = nil; for(i = 0; i < nblocks; i++){ b = &dcache.blocks[i]; b->data = &dcache.mem[i * blocksize]; b->heap = TWID32; chaninit(&b->writedonechan, sizeof(void*), 1); b->next = last; last = b; } dcache.free = last; dcache.nheap = 0; vtproc(flushproc, nil); vtproc(flushtimerproc, nil); } static u32int pbhash(u64int addr) { u32int h; #define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask) h = (addr >> 32) ^ addr; return hashit(h); } DBlock* getdblock(Part *part, u64int addr, int read) { DBlock *b; u32int h, size; size = part->blocksize; if(size > dcache.size){ seterr(EAdmin, "block size %d too big for cache", size); return nil; } h = pbhash(addr); /* * look for the block in the cache */ //checkdcache(); qlock(&dcache.lock); again: for(b = dcache.heads[h]; b != nil; b = b->next){ if(b->part == part && b->addr == addr){ qlock(&stats.lock); stats.pchit++; qunlock(&stats.lock); goto found; } } qlock(&stats.lock); stats.pcmiss++; qunlock(&stats.lock); /* * missed: locate the block with the oldest second to last use. * remove it from the heap, and fix up the heap. */ b = bumpdblock(); if(b == nil){ logerr(EAdmin, "all disk cache blocks in use"); rsleep(&dcache.full); goto again; } /* * the new block has no last use, so assume it happens sometime in the middle ZZZ this is not reasonable */ b->used = (b->used2 + dcache.now) / 2; /* * rechain the block on the correct hash chain */ b->next = dcache.heads[h]; dcache.heads[h] = b; if(b->next != nil) b->next->prev = b; b->prev = nil; b->addr = addr; b->part = part; b->size = 0; found: b->ref++; b->used2 = b->used; b->used = dcache.now++; if(b->heap != TWID32) fixheap(b->heap, b); qunlock(&dcache.lock); //checkdcache(); qlock(&b->lock); if(b->size != size){ if(b->size < size){ if(!read) memset(&b->data[b->size], 0, size - b->size); else{ if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){ putdblock(b); return nil; } qlock(&stats.lock); stats.pcreads++; stats.pcbreads += size - b->size; qunlock(&stats.lock); } } b->size = size; } return b; } void putdblock(DBlock *b) { if(b == nil) return; if(b->dirtying){ b->dirtying = 0; runlock(&dcache.dirtylock); } qunlock(&b->lock); //checkdcache(); qlock(&dcache.lock); if(b->dirty) delheap(b); else if(--b->ref == 0){ if(b->heap == TWID32) upheap(dcache.nheap++, b); rwakeupall(&dcache.full); } qunlock(&dcache.lock); //checkdcache(); } void dirtydblock(DBlock *b, int dirty) { int odirty; Part *p; assert(b->ref != 0); /* * Because we use an rlock to keep track of how * many blocks are being dirtied (and a wlock to * stop dirtying), we cannot try to dirty two blocks * at the same time in the same thread -- if the * wlock happens between them, we get a deadlock. * * The only place in the code where we need to * dirty multiple blocks at once is in splitiblock, when * we split an index block. The new block has a dirty * flag of DirtyIndexSplit already, because it has to get * written to disk before the old block (DirtyIndex). * So when we see the DirtyIndexSplit block, we don't * acquire the rlock (and we don't set dirtying, so putdblock * won't drop the rlock). This kludginess means that * the caller will have to be sure to putdblock on the * new block before putdblock on the old block. */ if(dirty == DirtyIndexSplit){ /* caller should have another dirty block */ assert(!canwlock(&dcache.dirtylock)); /* this block should be clean */ assert(b->dirtying == 0); assert(b->dirty == 0); }else{ rlock(&dcache.dirtylock); assert(b->dirtying == 0); b->dirtying = 1; } qlock(&stats.lock); if(b->dirty) stats.absorbedwrites++; stats.dirtydblocks++; qunlock(&stats.lock); /* * In general, shouldn't mark any block as more than one * type, except that split index blocks are a subcase of index * blocks. Only clean blocks ever get marked DirtyIndexSplit, * though, so we don't need the opposite conjunction here. */ odirty = b->dirty; if(b->dirty) assert(b->dirty == dirty || (b->dirty==DirtyIndexSplit && dirty==DirtyIndex)); else b->dirty = dirty; p = b->part; if(p->writechan == nil){ fprint(2, "allocate write proc for part %s\n", p->name); /* XXX hope this doesn't fail! */ p->writechan = chancreate(sizeof(DBlock*), dcache.nblocks); vtproc(writeproc, p); } qlock(&dcache.lock); if(!odirty){ dcache.ndirty++; rwakeupall(&dcache.anydirty); } qunlock(&dcache.lock); } /* * remove some block from use and update the free list and counters */ static DBlock* bumpdblock(void) { int flushed; DBlock *b; ulong h; b = dcache.free; if(b != nil){ dcache.free = b->next; return b; } if(dcache.ndirty >= dcache.maxdirty) _flushdcache(); /* * remove blocks until we find one that is unused * referenced blocks are left in the heap even though * they can't be scavenged; this is simple a speed optimization */ flushed = 0; for(;;){ if(dcache.nheap == 0){ _flushdcache(); return nil; } b = dcache.heap[0]; delheap(b); if(!b->ref) break; } fprint(2, "bump %s at %llud\n", b->part->name, b->addr); /* * unchain the block */ if(b->prev == nil){ h = pbhash(b->addr); if(dcache.heads[h] != b) sysfatal("bad hash chains in disk cache"); dcache.heads[h] = b->next; }else b->prev->next = b->next; if(b->next != nil) b->next->prev = b->prev; return b; } /* * delete an arbitrary block from the heap */ static void delheap(DBlock *db) { if(db->heap == TWID32) return; fixheap(db->heap, dcache.heap[--dcache.nheap]); db->heap = TWID32; } /* * push an element up or down to it's correct new location */ static void fixheap(int i, DBlock *b) { if(upheap(i, b) == i) downheap(i, b); } static int upheap(int i, DBlock *b) { DBlock *bb; u32int now; int p; now = dcache.now; for(; i != 0; i = p){ p = (i - 1) >> 1; bb = dcache.heap[p]; if(b->used2 - now >= bb->used2 - now) break; dcache.heap[i] = bb; bb->heap = i; } dcache.heap[i] = b; b->heap = i; return i; } static int downheap(int i, DBlock *b) { DBlock *bb; u32int now; int k; now = dcache.now; for(; ; i = k){ k = (i << 1) + 1; if(k >= dcache.nheap) break; if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now) k++; bb = dcache.heap[k]; if(b->used2 - now <= bb->used2 - now) break; dcache.heap[i] = bb; bb->heap = i; } dcache.heap[i] = b; b->heap = i; return i; } static void findblock(DBlock *bb) { DBlock *b, *last; int h; last = nil; h = pbhash(bb->addr); for(b = dcache.heads[h]; b != nil; b = b->next){ if(last != b->prev) sysfatal("bad prev link"); if(b == bb) return; last = b; } sysfatal("block missing from hash table"); } void checkdcache(void) { DBlock *b; u32int size, now; int i, k, refed, nfree; qlock(&dcache.lock); size = dcache.size; now = dcache.now; for(i = 0; i < dcache.nheap; i++){ if(dcache.heap[i]->heap != i) sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap); if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now) sysfatal("dc: bad heap ordering"); k = (i << 1) + 1; if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now) sysfatal("dc: bad heap ordering"); k++; if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now) sysfatal("dc: bad heap ordering"); } refed = 0; for(i = 0; i < dcache.nblocks; i++){ b = &dcache.blocks[i]; if(b->data != &dcache.mem[i * size]) sysfatal("dc: mis-blocked at %d", i); if(b->ref && b->heap == TWID32) refed++; if(b->addr) findblock(b); if(b->heap != TWID32 && dcache.heap[b->heap] != b) sysfatal("dc: spurious heap value"); } nfree = 0; for(b = dcache.free; b != nil; b = b->next){ if(b->addr != 0 || b->heap != TWID32) sysfatal("dc: bad free list"); nfree++; } if(dcache.nheap + nfree + refed != dcache.nblocks) sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks); qunlock(&dcache.lock); } void flushdcache(void) { u32int flushround; qlock(&dcache.lock); flushround = dcache.flushround; rwakeupall(&dcache.flush); while(flushround == dcache.flushround) rsleep(&dcache.flushdone); qunlock(&dcache.lock); } static void _flushdcache(void) { rwakeupall(&dcache.flush); } static int parallelwrites(DBlock **b, DBlock **eb, int dirty) { DBlock **p; for(p=b; p<eb && (*p)->dirty == dirty; p++) sendp((*p)->part->writechan, *p); for(p=b; p<eb && (*p)->dirty == dirty; p++) recvp(&(*p)->writedonechan); return p-b; } /* * Sort first by dirty flag, then by partition, then by address in partition. */ static int writeblockcmp(const void *va, const void *vb) { DBlock *a, *b; a = *(DBlock**)va; b = *(DBlock**)vb; if(a->dirty != b->dirty) return a->dirty - b->dirty; if(a->part != b->part){ if(a->part < b->part) return -1; if(a->part > b->part) return 1; } if(a->addr < b->addr) return -1; return 1; } static void flushtimerproc(void *v) { u32int round; for(;;){ qlock(&dcache.lock); while(dcache.ndirty == 0) rsleep(&dcache.anydirty); round = dcache.flushround; qunlock(&dcache.lock); sleep(60*1000); qlock(&dcache.lock); if(round == dcache.flushround){ rwakeupall(&dcache.flush); while(round == dcache.flushround) rsleep(&dcache.flushdone); } qunlock(&dcache.lock); } } static void flushproc(void *v) { int i, j, n; ulong t0; DBlock *b, **write; USED(v); for(;;){ qlock(&dcache.lock); dcache.flushround++; rwakeupall(&dcache.flushdone); rsleep(&dcache.flush); qunlock(&dcache.lock); fprint(2, "flushing dcache: t=0 ms\n"); t0 = nsec()/1000; /* * Because we don't record any dependencies at all, we must write out * all blocks currently dirty. Thus we must lock all the blocks that * are currently dirty. * * We grab dirtylock to stop the dirtying of new blocks. * Then we wait until all the current blocks finish being dirtied. * Now all the dirty blocks in the system are immutable (clean blocks * might still get recycled), so we can plan our disk writes. * * In a better scheme, dirtiers might lock the block for writing in getdblock, * so that flushproc could lock all the blocks here and then unlock them as it * finishes with them. */ fprint(2, "flushproc: wlock at t=%lud\n", (ulong)(nsec()/1000) - t0); wlock(&dcache.dirtylock); fprint(2, "flushproc: build list at t=%lud\n", (ulong)(nsec()/1000) - t0); write = dcache.write; n = 0; for(i=0; i<dcache.nblocks; i++){ b = &dcache.blocks[i]; if(b->dirty) write[n++] = b; } qsort(write, n, sizeof(write[0]), writeblockcmp); /* Write each stage of blocks out. */ fprint(2, "flushproc: write blocks at t=%lud\n", (ulong)(nsec()/1000) - t0); i = 0; for(j=1; j<DirtyMax; j++){ fprint(2, "flushproc: flush dirty %d at t=%lud\n", j, (ulong)(nsec()/1000) - t0); i += parallelwrites(write+i, write+n, j); } assert(i == n); fprint(2, "flushproc: update dirty bits at t=%lud\n", (ulong)(nsec()/1000) - t0); qlock(&dcache.lock); for(i=0; i<n; i++){ b = write[i]; b->dirty = 0; --dcache.ndirty; if(b->ref == 0 && b->heap == TWID32){ upheap(dcache.nheap++, b); rwakeupall(&dcache.full); } } qunlock(&dcache.lock); wunlock(&dcache.dirtylock); fprint(2, "flushproc: done at t=%lud\n", (ulong)(nsec()/1000) - t0); qlock(&stats.lock); stats.dcacheflushes++; stats.dcacheflushwrites += n; qunlock(&stats.lock); } } static void writeproc(void *v) { DBlock *b; Part *p; p = v; for(;;){ b = recvp(p->writechan); if(writepart(p, b->addr, b->data, b->size) < 0) fprint(2, "write error: %r\n"); /* XXX details! */ sendp(&b->writedonechan, b); } }