diff options
author | rsc <devnull@localhost> | 2005-07-12 15:23:36 +0000 |
---|---|---|
committer | rsc <devnull@localhost> | 2005-07-12 15:23:36 +0000 |
commit | a0d146edd7a7de6236a0d60baafeeb59f8452aae (patch) | |
tree | b55baa526d9f5adfc73246e6ee2fadf455e0b7a2 /src/cmd/venti/srv/dat.h | |
parent | 88bb285e3d87ec2508840af33f7e0af53ec3c13c (diff) | |
download | plan9port-a0d146edd7a7de6236a0d60baafeeb59f8452aae.tar.gz plan9port-a0d146edd7a7de6236a0d60baafeeb59f8452aae.tar.bz2 plan9port-a0d146edd7a7de6236a0d60baafeeb59f8452aae.zip |
return of venti
Diffstat (limited to 'src/cmd/venti/srv/dat.h')
-rw-r--r-- | src/cmd/venti/srv/dat.h | 718 |
1 files changed, 718 insertions, 0 deletions
diff --git a/src/cmd/venti/srv/dat.h b/src/cmd/venti/srv/dat.h
new file mode 100644
index 00000000..5f6d1a3f
--- /dev/null
+++ b/src/cmd/venti/srv/dat.h
@@ -0,0 +1,718 @@
+typedef struct Config Config;
+typedef struct AMap AMap;
+typedef struct AMapN AMapN;
+typedef struct Arena Arena;
+typedef struct AState AState;
+typedef struct ArenaHead ArenaHead;
+typedef struct ArenaPart ArenaPart;
+typedef struct ArenaTail ArenaTail;	/* NOTE(review): no struct ArenaTail body appears in this header */
+typedef struct ATailStats ATailStats;
+typedef struct CIBlock CIBlock;
+typedef struct Clump Clump;
+typedef struct ClumpInfo ClumpInfo;
+typedef struct Graph Graph;
+typedef struct IAddr IAddr;
+typedef struct IBucket IBucket;
+typedef struct IEStream IEStream;	/* NOTE(review): no struct IEStream body appears in this header */
+typedef struct IEntry IEntry;
+typedef struct IFile IFile;
+typedef struct ISect ISect;
+typedef struct Index Index;
+typedef struct Lump Lump;
+typedef struct DBlock DBlock;
+typedef struct Part Part;
+typedef struct Statbin Statbin;
+typedef struct Statdesc Statdesc;
+typedef struct Stats Stats;
+typedef struct ZBlock ZBlock;
+typedef struct Round Round;
+typedef struct Bloom Bloom;
+
+#define TWID32 ((u32int)~(u32int)0)	/* u32int with every bit set */
+#define TWID64 ((u64int)~(u64int)0)	/* u64int with every bit set */
+#define TWID8 ((u8int)~(u8int)0)	/* u8int with every bit set */
+
+enum
+{
+	ABlockLog = 9,	/* log2(512), the quantum for reading arenas */
+	ANameSize = 64,
+	MaxDiskBlock = 64*1024,	/* max. allowed size for a disk block */
+	MaxIoSize = 64*1024,	/* max. allowed size for a disk io operation */
+	PartBlank = 256*1024,	/* untouched section at beginning of partition */
+	HeadSize = 512,	/* size of a header after PartBlank */
+	MinArenaSize = 1*1024*1024,	/* smallest reasonable arena size */
+	IndexBase = 1024*1024,	/* initial address to use in an index */
+	MaxIo = 64*1024,	/* max size of a single read or write operation; NOTE(review): same value as MaxIoSize above -- confirm both are needed */
+	ICacheBits = 16,	/* default bits for indexing icache */
+	ICacheDepth = 4,	/* default depth of an icache hash chain */
+	MaxAMap = 2*1024,	/* max. allowed arenas in an address mapping; must be < 32*1024 */
+
+	/*
+	 * return codes from syncarena
+	 */
+	SyncDataErr = 1 << 0,	/* problem reading the clump data */
+	SyncCIErr = 1 << 1,	/* found erroneous clump directory entries */
+	SyncCIZero = 1 << 2,	/* found unwritten clump directory entries */
+	SyncFixErr = 1 << 3,	/* error writing fixed data */
+	SyncHeader = 1 << 4,	/* altered header fields */
+
+	/*
+	 * error severity
+	 */
+	EOk = 0,	/* error expected in normal operation */
+	EStrange,	/* strange error that should be logged */
+	ECorrupt,	/* corrupted data found in arenas */
+	EICorrupt,	/* corrupted data found in index */
+	EAdmin,	/* should be brought to administrators' attention */
+	ECrash,	/* really bad internal error */
+	EBug,	/* a limitation which should be fixed */
+	EInconsist,	/* inconsistencies between index and arena */
+	EMax,	/* number of severity levels listed above */
+
+	/*
+	 * internal disk formats for the venti archival storage system
+	 */
+	/*
+	 * magic numbers on disk
+	 */
+	_ClumpMagic = 0xd15cb10c,	/* clump header, deprecated */
+	ClumpFreeMagic = 0,	/* free clump; terminates active clump log */
+
+	ArenaPartMagic = 0xa9e4a5e7,	/* arena partition header */
+	ArenaMagic = 0xf2a14ead,	/* arena trailer */
+	ArenaHeadMagic = 0xd15c4ead,	/* arena header */
+
+	BloomMagic = 0xb1004ead,	/* bloom filter header */
+	BloomMaxHash = 32,	/* NOTE(review): same value as MaxBloomHash below -- confirm both are needed */
+
+	ISectMagic = 0xd15c5ec7,	/* index header */
+
+	ArenaPartVersion = 3,
+	ArenaVersion4 = 4,
+	ArenaVersion5 = 5,
+	BloomVersion = 1,
+	IndexVersion = 1,
+	ISectVersion1 = 1,
+	ISectVersion2 = 2,
+
+	/*
+	 * encodings of clumps on disk
+	 */
+	ClumpEErr = 0,	/* can't happen */
+	ClumpENone,	/* plain */
+	ClumpECompress,	/* compressed */
+	ClumpEMax,	/* one past the last encoding value */
+
+	/*
+	 * sizes in bytes on disk
+	 */
+	U8Size = 1,
+	U16Size = 2,
+	U32Size = 4,
+	U64Size = 8,
+
+	ArenaPartSize = 4 * U32Size,
+	ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
+	ArenaSize5 = ArenaSize4 + U32Size,
+	ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize,
+	ArenaHeadSize5 = ArenaHeadSize4 + U32Size,
+	BloomHeadSize = 4 * U32Size,
+	ISectSize1 = 7 * U32Size + 2 * ANameSize,
+	ISectSize2 = ISectSize1 + U32Size,
+	ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize,
+	ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size,
+	MaxBloomSize = 1<<(32-3),	/* 2^32 bits */
+	MaxBloomHash = 32,	/* bits per score */
+	/*
+	 * BUG - The various block copies that manipulate entry buckets
+	 * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40,
+	 * so that everything is word-aligned. Buildindex is actually cpu-bound
+	 * by the (byte at a time) copying in qsort.
+	 */
+	IBucketSize = U32Size + U16Size,
+	IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
+	IEntryTypeOff = VtScoreSize + U64Size + U32Size + 2 * U16Size,
+
+	MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
+
+	/*
+	 * dirty flags - order controls disk write order
+	 */
+	DirtyArena = 1,
+	DirtyArenaCib,
+	DirtyArenaTrailer,
+	DirtyMax,	/* one past the last dirty flag value */
+
+	VentiZZZZZZZZ
+};
+
+extern char TraceDisk[];
+extern char TraceLump[];
+extern char TraceBlock[];
+extern char TraceProc[];
+extern char TraceWork[];
+extern char TraceQuiet[];
+extern char TraceRpc[];
+
+/*
+ * results of parsing and initializing a config file
+ */
+struct Config
+{
+	char *index;	/* name of the index to initialize */
+	int naparts;	/* arena partitions initialized */
+	ArenaPart **aparts;
+	int nsects;	/* index sections initialized */
+	ISect **sects;
+	Bloom *bloom;	/* bloom filter */
+	u32int bcmem;
+	u32int mem;
+	u32int icmem;
+	int queuewrites;
+	char* haddr;
+	char* vaddr;
+	char* webroot;
+};
+
+/*
+ * a Part is the low level interface to files or disks.
+ * there are two main types of partitions
+ *	arena partitions, which hold some number of arenas, each in a sub-partition.
+ *	index partitions, which only have one subpartition.
+ */
+struct Part
+{
+	int fd;	/* rock for accessing the disk */
+	int mode;
+	u64int offset;
+	u64int size;	/* size of the partition */
+	u32int blocksize;	/* block size for reads and writes */
+	u32int fsblocksize;	/* minimum file system block size */
+	char *name;
+	char *filename;
+	Channel *writechan;	/* chan[dcache.nblock](DBlock*) */
+};
+
+/*
+ * a cached block from the partition
+ * yuck -- most of this is internal structure for the cache
+ * all other routines should only use data
+ */
+struct DBlock
+{
+	u8int *data;
+
+	Part *part;	/* partition in which cached */
+	u64int addr;	/* base address on the partition */
+	u32int size;	/* amount of data available, not amount allocated; should go away */
+	u32int mode;
+	u32int dirty;
+	u32int dirtying;
+	DBlock *next;	/* doubly linked hash chains */
+	DBlock *prev;
+	u32int heap;	/* index in heap table */
+	u32int used;	/* last reference times */
+	u32int used2;
+	u32int ref;	/* reference count */
+	RWLock lock;	/* for access to data only */
+	Channel *writedonechan;
+	void* chanbuf[1];	/* buffer for the chan! */
+};
+
+/*
+ * a cached block from the partition
+ * yuck -- most of this is internal structure for the cache
+ * all other routines should only use data
+ * double yuck -- this is mostly the same as a DBlock
+ */
+struct Lump
+{
+	Packet *data;
+
+	Part *part;	/* partition in which cached */
+	u8int score[VtScoreSize];	/* score of packet */
+	u8int type;	/* type of packet */
+	u32int size;	/* amount of data allocated to hold packet */
+	Lump *next;	/* doubly linked hash chains */
+	Lump *prev;
+	u32int heap;	/* index in heap table */
+	u32int used;	/* last reference times */
+	u32int used2;
+	u32int ref;	/* reference count */
+	QLock lock;	/* for access to data only */
+};
+
+/*
+ * mapping between names and address ranges
+ */
+struct AMap
+{
+	u64int start;
+	u64int stop;
+	char name[ANameSize];
+};
+
+/*
+ * an AMap along with a length
+ */
+struct AMapN
+{
+	int n;
+	AMap *map;
+};
+
+/*
+ * an ArenaPart is a partition made up of Arenas
+ * it exists because most os's don't support many partitions,
+ * and we want to have many different Arenas
+ */
+struct ArenaPart
+{
+	Part *part;
+	u64int size;	/* size of underlying partition, rounded down to blocks */
+	Arena **arenas;
+	u32int tabbase;	/* base address of arena table on disk */
+	u32int tabsize;	/* max. bytes in arena table */
+
+	/*
+	 * fields stored on disk
+	 */
+	u32int version;
+	u32int blocksize;	/* "optimal" block size for reads and writes */
+	u32int arenabase;	/* base address of first arena */
+
+	/*
+	 * stored in the arena mapping table on disk
+	 */
+	AMap *map;
+	int narenas;
+};
+
+/*
+ * info about one block in the clump info cache
+ */
+struct CIBlock
+{
+	u32int block;	/* blocks in the directory */
+	int offset;	/* offsets of one clump in the data */
+	DBlock *data;
+};
+
+/*
+ * Statistics kept in the tail.
+ */
+struct ATailStats
+{
+	u32int clumps;	/* number of clumps */
+	u32int cclumps;	/* number of compressed clumps */
+	u64int used;
+	u64int uncsize;
+	u8int sealed;
+};
+
+/*
+ * Arena state - represents a point in the data log
+ */
+struct AState
+{
+	Arena *arena;
+	u64int aa;	/* index address */
+	ATailStats stats;
+};
+
+/*
+ * an Arena is a log of Clumps, preceded by an ArenaHead,
+ * and followed by an Arena, each in one disk block.
+ * struct on disk is not always up to date, but should be self-consistent.
+ * to sync after reboot, follow clumps starting at used until ClumpFreeMagic is found.
+ * <struct name="Arena" type="Arena *">
+ *	<field name="name" val="s->name" type="AName"/>
+ *	<field name="version" val="s->version" type="U32int"/>
+ *	<field name="partition" val="s->part->name" type="AName"/>
+ *	<field name="blocksize" val="s->blocksize" type="U32int"/>
+ *	<field name="start" val="s->base" type="U64int"/>
+ *	<field name="stop" val="s->base+2*s->blocksize" type="U64int"/>
+ *	<field name="created" val="s->ctime" type="U32int"/>
+ *	<field name="modified" val="s->wtime" type="U32int"/>
+ *	<field name="sealed" val="s->sealed" type="Sealed"/>
+ *	<field name="score" val="s->score" type="Score"/>
+ *	<field name="clumps" val="s->clumps" type="U32int"/>
+ *	<field name="compressedclumps" val="s->cclumps" type="U32int"/>
+ *	<field name="data" val="s->uncsize" type="U64int"/>
+ *	<field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/>
+ *	<field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/>
+ * </struct>
+ */
+struct Arena
+{
+	QLock lock;	/* lock for arena fields, writing to disk */
+	Part *part;	/* partition in which arena lives */
+	int blocksize;	/* size of block to read or write */
+	u64int base;	/* base address on disk */
+	u64int size;	/* total space in the arena */
+	u64int limit;	/* storage limit for clumps */
+	u8int score[VtScoreSize];	/* score of the entire sealed & summed arena */
+
+	int clumpmax;	/* ClumpInfos per block */
+	AState mem;
+	int inqueue;
+	DigestState sha1;
+
+	/*
+	 * fields stored on disk
+	 */
+	u32int version;
+	char name[ANameSize];	/* text label */
+	ATailStats memstats;
+	ATailStats diskstats;
+	u32int ctime;	/* first time a block was written */
+	u32int wtime;	/* last time a block was written */
+	u32int clumpmagic;
+};
+
+/*
+ * redundant storage of some fields at the beginning of each arena
+ */
+struct ArenaHead
+{
+	u32int version;
+	char name[ANameSize];
+	u32int blocksize;
+	u64int size;
+	u32int clumpmagic;
+};
+
+/*
+ * most interesting meta information for a clump.
+ * stored in each clump's header and in the Arena's directory,
+ * stored in reverse order just prior to the arena trailer
+ */
+struct ClumpInfo
+{
+	u8int type;
+	u16int size;	/* size of disk data, not including header */
+	u16int uncsize;	/* size of uncompressed data */
+	u8int score[VtScoreSize];	/* score of the uncompressed data only */
+};
+
+/*
+ * header for an immutable clump of data
+ */
+struct Clump
+{
+	ClumpInfo info;
+	u8int encoding;
+	u32int creator;	/* initial client which wrote the block */
+	u32int time;	/* creation at gmt seconds since 1/1/1970 */
+};
+
+/*
+ * index of all clumps according to their score
+ * this is just a wrapper to tie together the index sections
+ * <struct name="Index" type="Index *">
+ *	<field name="name" val="s->name" type="AName"/>
+ *	<field name="version" val="s->version" type="U32int"/>
+ *	<field name="blocksize" val="s->blocksize" type="U32int"/>
+ *	<field name="tabsize" val="s->tabsize" type="U32int"/>
+ *	<field name="buckets" val="s->buckets" type="U32int"/>
+ *	<field name="buckdiv" val="s->div" type="U32int"/>
+ *	<field name="bitblocks" val="s->div" type="U32int"/>
+ *	<field name="maxdepth" val="s->div" type="U32int"/>
+ *	<field name="bitkeylog" val="s->div" type="U32int"/>
+ *	<field name="bitkeymask" val="s->div" type="U32int"/>
+ *	<array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/>
+ *	<array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/>
+ *	<array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/>
+ * </struct>
+ * <struct name="Amap" type="AMap *">
+ *	<field name="name" val="s->name" type="AName"/>
+ *	<field name="start" val="s->start" type="U64int"/>
+ *	<field name="stop" val="s->stop" type="U64int"/>
+ * </struct>
+ */
+struct Index
+{
+	u32int div;	/* divisor for mapping score to bucket */
+	u32int buckets;	/* last bucket used in disk hash table */
+	u32int blocksize;
+	u32int tabsize;	/* max. bytes in index config */
+	u32int bitblocks;	//XXX remove these fields
+	u32int maxdepth;
+	u32int bitkeylog;
+	u32int bitkeymask;
+
+	int mapalloc;	/* first arena to check when adding a lump */
+	Arena **arenas;	/* arenas in the mapping */
+	ISect **sects;	/* sections which hold the buckets */
+	Bloom *bloom;	/* bloom filter */
+
+	/*
+	 * fields stored in config file
+	 */
+	u32int version;
+	char name[ANameSize];	/* text label */
+	int nsects;
+	AMap *smap;	/* mapping of buckets to index sections */
+	int narenas;
+	AMap *amap;	/* mapping from index addresses to arenas */
+};
+
+/*
+ * one part of the bucket storage for an index.
+ * the index blocks are sequentially allocated
+ * across all of the sections.
+ */
+struct ISect
+{
+	Part *part;
+	int blocklog;	/* log2(blocksize) */
+	int buckmax;	/* max. entries in a index bucket */
+	u32int tabbase;	/* base address of index config table on disk */
+	u32int tabsize;	/* max. bytes in index config */
+	Channel *writechan;
+	Channel *writedonechan;
+
+	/*
+	 * fields stored on disk
+	 */
+	u32int version;
+	u32int bucketmagic;
+	char name[ANameSize];	/* text label */
+	char index[ANameSize];	/* index owning the section */
+	u32int blocksize;	/* size of hash buckets in index */
+	u32int blockbase;	/* address of start of on disk index table */
+	u32int blocks;	/* total blocks on disk; some may be unused */
+	u32int start;	/* first bucket in this section */
+	u32int stop;	/* limit of buckets in this section */
+};
+
+/*
+ * externally interesting part of an IEntry
+ */
+struct IAddr
+{
+	u64int addr;
+	u16int size;	/* uncompressed size */
+	u8int type;	/* type of block */
+	u8int blocks;	/* arena io quanta for Clump + data */
+};
+
+/*
+ * entries in the index
+ * kept in IBuckets in the disk index table,
+ * cached in the memory ICache.
+ */
+struct IEntry
+{
+	u8int score[VtScoreSize];
+	IEntry *next;	/* next in hash chain */
+	IEntry *nextdirty;	/* next in dirty chain */
+	u32int wtime;	/* last write time */
+	u16int train;	/* relative train containing the most recent ref; 0 if no ref, 1 if in same car */
+	u8int rac;	/* read ahead count */
+	u8int dirty;	/* is dirty */
+	IAddr ia;
+};
+
+/*
+ * buckets in the on disk index table
+ */
+struct IBucket
+{
+	u16int n;	/* number of active indices */
+	u32int buck;	/* used by buildindex/checkindex only */
+	u8int *data;
+};
+
+/*
+ * temporary buffers used by individual threads
+ */
+struct ZBlock
+{
+	u32int len;
+	u32int _size;
+	u8int *data;
+	u8int *free;
+};
+
+/*
+ * simple input buffer for a '\0' terminated text file
+ */
+struct IFile
+{
+	char *name;	/* name of the file */
+	ZBlock *b;	/* entire contents of file */
+	u32int pos;	/* current position in the file */
+};
+
+struct Statdesc
+{
+	char *name;
+	ulong max;
+};
+
+/* keep in sync with stats.c:/statdesc and httpd.c:/graphname */
+enum
+{
+	StatRpcTotal,
+	StatRpcRead,
+	StatRpcReadOk,
+	StatRpcReadFail,
+	StatRpcReadBytes,
+	StatRpcReadTime,
+	StatRpcReadCached,
+	StatRpcReadCachedTime,
+	StatRpcReadUncached,
+	StatRpcReadUncachedTime,
+	StatRpcWrite,
+	StatRpcWriteNew,
+	StatRpcWriteOld,
+	StatRpcWriteFail,
+	StatRpcWriteBytes,
+	StatRpcWriteTime,
+	StatRpcWriteNewTime,
+	StatRpcWriteOldTime,
+
+	StatLcacheHit,
+	StatLcacheMiss,
+	StatLcacheRead,
+	StatLcacheWrite,
+	StatLcacheSize,
+	StatLcacheStall,
+	StatLcacheReadTime,
+
+	StatDcacheHit,
+	StatDcacheMiss,
+	StatDcacheLookup,
+	StatDcacheRead,
+	StatDcacheWrite,
+	StatDcacheDirty,
+	StatDcacheSize,
+	StatDcacheFlush,
+	StatDcacheStall,
+	StatDcacheLookupTime,
+
+	StatDblockStall,
+	StatLumpStall,
+
+	StatIcacheHit,
+	StatIcacheMiss,
+	StatIcacheRead,
+	StatIcacheWrite,
+	StatIcacheFill,
+	StatIcachePrefetch,
+	StatIcacheDirty,
+	StatIcacheSize,
+	StatIcacheFlush,
+	StatIcacheStall,
+	StatIcacheReadTime,
+
+	StatBloomHit,
+	StatBloomMiss,
+	StatBloomFalseMiss,
+	StatBloomLookup,
+	StatBloomOnes,
+	StatBloomBits,
+	StatBloomLookupTime,
+
+	StatApartRead,
+	StatApartReadBytes,
+	StatApartWrite,
+	StatApartWriteBytes,
+
+	StatIsectRead,
+	StatIsectReadBytes,
+	StatIsectWrite,
+	StatIsectWriteBytes,
+
+	StatSumRead,
+	StatSumReadBytes,
+
+	NStat	/* number of counters above; dimensions statdesc[] and Stats.n[] */
+};
+
+extern Statdesc statdesc[NStat];
+
+/*
+ * statistics about the operation of the server
+ * mainly for performance monitoring and profiling.
+ */
+struct Stats
+{
+	ulong now;
+	ulong n[NStat];
+};
+
+struct Statbin
+{
+	uint nsamp;
+	uint min;
+	uint max;
+	uint avg;
+};
+
+struct Graph
+{
+	long (*fn)(Stats*, Stats*, void*);
+	void *arg;
+	long t0;
+	long t1;
+	long min;
+	long max;
+	long wid;
+	long ht;
+	int fill;
+};
+
+/*
+ * for kicking background processes that run one round after another after another
+ */
+struct Round
+{
+	QLock lock;
+	Rendez start;
+	Rendez finish;
+	Rendez delaywait;
+	int delaytime;
+	int delaykick;
+	char* name;
+	int last;
+	int current;
+	int next;
+	int doanother;
+};
+
+/*
+ * Bloom filter of stored block hashes
+ */
+struct Bloom
+{
+	RWLock lk;	/* protects nhash, nbits, tab, mb; NOTE(review): nbits, tab and mb are not fields of this struct -- presumably size, data and mask are meant, confirm */
+	QLock mod;	/* one marker at a time, protects nb; NOTE(review): there is no nb field in this struct -- confirm what is protected */
+	int nhash;
+	ulong size;	/* bytes in tab */
+	ulong mask;	/* to produce index */
+	u8int *data;
+	Part *part;
+	Channel *writechan;
+	Channel *writedonechan;
+};
+
+extern Index *mainindex;
+extern u32int maxblocksize;	/* max. block size used by any partition */
+extern int paranoid;	/* should verify hashes on disk read */
+extern int queuewrites;	/* put all lump writes on a queue and finish later */
+extern int readonly;	/* only allowed to read the disk data */
+extern Stats stats;
+extern u8int zeroscore[VtScoreSize];
+extern int compressblocks;
+extern int writestodevnull;	/* dangerous - for performance debugging */
+extern int collectstats;
+extern QLock memdrawlock;
+extern int icachesleeptime;
+extern int arenasumsleeptime;
+
+#ifndef PLAN9PORT
+#pragma varargck type "V" uchar*
+#define ODIRECT 0
+#endif |