diff options
Diffstat (limited to 'src/cmd/upas/bayes/hash.c')
-rw-r--r-- | src/cmd/upas/bayes/hash.c | 312 |
1 files changed, 312 insertions, 0 deletions
diff --git a/src/cmd/upas/bayes/hash.c b/src/cmd/upas/bayes/hash.c new file mode 100644 index 00000000..1d0d0ac0 --- /dev/null +++ b/src/cmd/upas/bayes/hash.c @@ -0,0 +1,312 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include "hash.h" + +/*** + * String hash tables. + */ + +Stringtab *tfree; + +Stringtab* +taballoc(void) +{ + static Stringtab *t; + static uint nt; + + if(tfree){ + Stringtab *tt = tfree; + tfree = tt->link; + return tt; + } + + if(nt == 0){ + t = malloc(64000*sizeof(Stringtab)); + if(t == 0) + sysfatal("out of memory"); + nt = 64000; + } + nt--; + return t++; +} + +void +tabfree(Stringtab *tt) +{ + tt->link = tfree; + tfree = tt; +} + +char* +xstrdup(char *s, int len) +{ + char *r; + static char *t; + static int nt; + + if(nt < len){ + t = malloc(512*1024+len); + if(t == 0) + sysfatal("out of memory"); + nt = 512*1024; + } + r = t; + t += len; + nt -= len; + memmove(r, s, len); + return r; +} + +static uint +hash(char *s, int n) +{ + uint h; + uchar *p, *ep; + h = 0; + for(p=(uchar*)s, ep=p+n; p<ep; p++) + h = h*37 + *p; + return h; +} + +static void +rehash(Hash *hh) +{ + int h; + Stringtab *s; + + if(hh->nstab == 0) + hh->nstab = 1024; + else + hh->nstab = hh->ntab*2; + + free(hh->stab); + hh->stab = mallocz(hh->nstab*sizeof(Stringtab*), 1); + if(hh->stab == nil) + sysfatal("out of memory"); + + for(s=hh->all; s; s=s->link){ + h = hash(s->str, s->n) % hh->nstab; + s->hash = hh->stab[h]; + hh->stab[h] = s; + } +} + +Stringtab* +findstab(Hash *hh, char *str, int n, int create) +{ + uint h; + Stringtab *tab, **l; + + if(hh->nstab == 0) + rehash(hh); + + h = hash(str, n) % hh->nstab; + for(tab=hh->stab[h], l=&hh->stab[h]; tab; l=&tab->hash, tab=tab->hash) + if(n==tab->n && memcmp(str, tab->str, n) == 0){ + *l = tab->hash; + tab->hash = hh->stab[h]; + hh->stab[h] = tab; + return tab; + } + + if(!create) + return nil; + + hh->sorted = 0; + tab = taballoc(); + tab->str = xstrdup(str, n); + tab->hash = hh->stab[h]; + tab->link = hh->all; + hh->all = tab; + tab->n = n; + tab->count = 0; + tab->date = 0; + hh->stab[h] = tab; + + hh->ntab++; + if(hh->ntab > 2*hh->nstab && !(hh->ntab&(hh->ntab-1))) + rehash(hh); + return tab; +} + +int +scmp(Stringtab *a, Stringtab *b) +{ + int n, x; + + if(a == 0) + return 1; + if(b == 0) + return -1; + n = a->n; + if(n > b->n) + n = b->n; + x = memcmp(a->str, b->str, n); + if(x != 0) + return x; + if(a->n < b->n) + return -1; + if(a->n > b->n) + return 1; + return 0; /* shouldn't happen */ +} + +Stringtab* +merge(Stringtab *a, Stringtab *b) +{ + Stringtab *s, **l; + + l = &s; + while(a || b){ + if(scmp(a, b) < 0){ + *l = a; + l = &a->link; + a = a->link; + }else{ + *l = b; + l = &b->link; + b = b->link; + } + } + *l = 0; + return s; +} + +Stringtab* +mergesort(Stringtab *s) +{ + Stringtab *a, *b; + int delay; + + if(s == nil) + return nil; + if(s->link == nil) + return s; + + a = b = s; + delay = 1; + while(a && b){ + if(delay) /* easy way to handle 2-element list */ + delay = 0; + else + a = a->link; + if(b = b->link) + b = b->link; + } + + b = a->link; + a->link = nil; + + a = mergesort(s); + b = mergesort(b); + + return merge(a, b); +} + +Stringtab* +sortstab(Hash *hh) +{ + if(!hh->sorted){ + hh->all = mergesort(hh->all); + hh->sorted = 1; + } + return hh->all; +} + +int +Bwritehash(Biobuf *b, Hash *hh) +{ + Stringtab *s; + int now; + + now = time(0); + s = sortstab(hh); + Bprint(b, "# hash table\n"); + for(; s; s=s->link){ + if(s->count <= 0) + continue; + /* + * Entries that haven't been updated in thirty days get tossed. + */ + if(s->date+30*86400 < now) + continue; + Bwrite(b, s->str, s->n); + Bprint(b, "\t%d %d\n", s->count, s->date); + } + if(Bflush(b) == Beof) + return -1; + return 0; +} + +void +Breadhash(Biobuf *b, Hash *hh, int scale) +{ + char *s; + char *t; + int n; + int date; + Stringtab *st; + + s = Brdstr(b, '\n', 1); + if(s == nil) + return; + if(strcmp(s, "# hash table") != 0) + sysfatal("bad hash table format"); + + while(s = Brdline(b, '\n')){ + s[Blinelen(b)-1] = 0; + t = strrchr(s, '\t'); + if(t == nil) + sysfatal("bad hash table format"); + *t++ = '\0'; + if(*t < '0' || *t > '9') + sysfatal("bad hash table format"); + n = strtol(t, &t, 10); + date = time(0); + if(*t != 0){ + if(*t == ' '){ + t++; + date = strtol(t, &t, 10); + } + if(*t != 0) + sysfatal("bad hash table format"); + } + st = findstab(hh, s, strlen(s), 1); + if(date > st->date) + st->date = date; + st->count += n*scale; + } +} + +void +freehash(Hash *h) +{ + Stringtab *s, *next; + + for(s=h->all; s; s=next){ + next = s->link; + tabfree(s); + } + free(h->stab); + free(h); +} + +Biobuf* +Bopenlock(char *file, int mode) +{ + int i; + Biobuf *b; + char err[ERRMAX]; + + b = nil; + for(i=0; i<120; i++){ + if((b = Bopen(file, mode)) != nil) + break; + rerrstr(err, sizeof err); + if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil) + break; + sleep(1000); + } + return b; +} |