diff options
author | rsc <devnull@localhost> | 2003-11-23 18:03:02 +0000 |
---|---|---|
committer | rsc <devnull@localhost> | 2003-11-23 18:03:02 +0000 |
commit | 5993a8f2756bc455101a8c9ce95347d5050e7883 (patch) | |
tree | df94fe66c46612af899e6622c808d42a52542b8c /src/cmd/diff/diffreg.c | |
parent | cfabc3ed1638efc186ebd26bdaa3dfb5663dff17 (diff) | |
download | plan9port-5993a8f2756bc455101a8c9ce95347d5050e7883.tar.gz plan9port-5993a8f2756bc455101a8c9ce95347d5050e7883.tar.bz2 plan9port-5993a8f2756bc455101a8c9ce95347d5050e7883.zip |
the one true diff.
at least it can handle "diff file1 file2 dir".
Diffstat (limited to 'src/cmd/diff/diffreg.c')
-rw-r--r-- | src/cmd/diff/diffreg.c | 419 |
1 files changed, 419 insertions, 0 deletions
diff --git a/src/cmd/diff/diffreg.c b/src/cmd/diff/diffreg.c new file mode 100644 index 00000000..87061898 --- /dev/null +++ b/src/cmd/diff/diffreg.c @@ -0,0 +1,419 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include "diff.h" + +/* diff - differential file comparison +* +* Uses an algorithm due to Harold Stone, which finds +* a pair of longest identical subsequences in the two +* files. +* +* The major goal is to generate the match vector J. +* J[i] is the index of the line in file1 corresponding +* to line i file0. J[i] = 0 if there is no +* such line in file1. +* +* Lines are hashed so as to work in core. All potential +* matches are located by sorting the lines of each file +* on the hash (called value). In particular, this +* collects the equivalence classes in file1 together. +* Subroutine equiv replaces the value of each line in +* file0 by the index of the first element of its +* matching equivalence in (the reordered) file1. +* To save space equiv squeezes file1 into a single +* array member in which the equivalence classes +* are simply concatenated, except that their first +* members are flagged by changing sign. +* +* Next the indices that point into member are unsorted into +* array class according to the original order of file0. +* +* The cleverness lies in routine stone. This marches +* through the lines of file0, developing a vector klist +* of "k-candidates". At step i a k-candidate is a matched +* pair of lines x,y (x in file0 y in file1) such that +* there is a common subsequence of lenght k +* between the first i lines of file0 and the first y +* lines of file1, but there is no such subsequence for +* any smaller y. x is the earliest possible mate to y +* that occurs in such a subsequence. +* +* Whenever any of the members of the equivalence class of +* lines in file1 matable to a line in file0 has serial number +* less than the y of some k-candidate, that k-candidate +* with the smallest such y is replaced. The new +* k-candidate is chained (via pred) to the current +* k-1 candidate so that the actual subsequence can +* be recovered. When a member has serial number greater +* that the y of all k-candidates, the klist is extended. +* At the end, the longest subsequence is pulled out +* and placed in the array J by unravel. +* +* With J in hand, the matches there recorded are +* check'ed against reality to assure that no spurious +* matches have crept in due to hashing. If they have, +* they are broken, and "jackpot " is recorded--a harmless +* matter except that a true match for a spuriously +* mated line may now be unnecessarily reported as a change. +* +* Much of the complexity of the program comes simply +* from trying to minimize core utilization and +* maximize the range of doable problems by dynamically +* allocating what is needed and reusing what is not. +* The core requirements for problems larger than somewhat +* are (in words) 2*length(file0) + length(file1) + +* 3*(number of k-candidates installed), typically about +* 6n words for files of length n. +*/ +/* TIDY THIS UP */ +struct cand { + int x; + int y; + int pred; +} cand; +struct line { + int serial; + int value; +} *file[2], line; +int len[2]; +int binary; +struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/ +int slen[2]; +int pref, suff; /*length of prefix and suffix*/ +int *class; /*will be overlaid on file[0]*/ +int *member; /*will be overlaid on file[1]*/ +int *klist; /*will be overlaid on file[0] after class*/ +struct cand *clist; /* merely a free storage pot for candidates */ +int clen; +int *J; /*will be overlaid on class*/ +long *ixold; /*will be overlaid on klist*/ +long *ixnew; /*will be overlaid on file[1]*/ +/* END OF SOME TIDYING */ + +static void +sort(struct line *a, int n) /*shellsort CACM #201*/ +{ + int m; + struct line *ai, *aim, *j, *k; + struct line w; + int i; + + m = 0; + for (i = 1; i <= n; i *= 2) + m = 2*i - 1; + for (m /= 2; m != 0; m /= 2) { + k = a+(n-m); + for (j = a+1; j <= k; j++) { + ai = j; + aim = ai+m; + do { + if (aim->value > ai->value || + aim->value == ai->value && + aim->serial > ai->serial) + break; + w = *ai; + *ai = *aim; + *aim = w; + + aim = ai; + ai -= m; + } while (ai > a && aim >= ai); + } + } +} + +static void +unsort(struct line *f, int l, int *b) +{ + int *a; + int i; + + a = MALLOC(int, (l+1)); + for(i=1;i<=l;i++) + a[f[i].serial] = f[i].value; + for(i=1;i<=l;i++) + b[i] = a[i]; + FREE(a); +} + +static void +prune(void) +{ + int i,j; + + for(pref=0;pref<len[0]&&pref<len[1]&& + file[0][pref+1].value==file[1][pref+1].value; + pref++ ) ; + for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& + file[0][len[0]-suff].value==file[1][len[1]-suff].value; + suff++) ; + for(j=0;j<2;j++) { + sfile[j] = file[j]+pref; + slen[j] = len[j]-pref-suff; + for(i=0;i<=slen[j];i++) + sfile[j][i].serial = i; + } +} + +static void +equiv(struct line *a, int n, struct line *b, int m, int *c) +{ + int i, j; + + i = j = 1; + while(i<=n && j<=m) { + if(a[i].value < b[j].value) + a[i++].value = 0; + else if(a[i].value == b[j].value) + a[i++].value = j; + else + j++; + } + while(i <= n) + a[i++].value = 0; + b[m+1].value = 0; + j = 0; + while(++j <= m) { + c[j] = -b[j].serial; + while(b[j+1].value == b[j].value) { + j++; + c[j] = b[j].serial; + } + } + c[j] = -1; +} + +static int +newcand(int x, int y, int pred) +{ + struct cand *q; + + clist = REALLOC(clist, struct cand, (clen+1)); + q = clist + clen; + q->x = x; + q->y = y; + q->pred = pred; + return clen++; +} + +static int +search(int *c, int k, int y) +{ + int i, j, l; + int t; + + if(clist[c[k]].y < y) /*quick look for typical case*/ + return k+1; + i = 0; + j = k+1; + while((l=(i+j)/2) > i) { + t = clist[c[l]].y; + if(t > y) + j = l; + else if(t < y) + i = l; + else + return l; + } + return l+1; +} + +static int +stone(int *a, int n, int *b, int *c) +{ + int i, k,y; + int j, l; + int oldc, tc; + int oldl; + + k = 0; + c[0] = newcand(0,0,0); + for(i=1; i<=n; i++) { + j = a[i]; + if(j==0) + continue; + y = -b[j]; + oldl = 0; + oldc = c[0]; + do { + if(y <= clist[oldc].y) + continue; + l = search(c, k, y); + if(l!=oldl+1) + oldc = c[l-1]; + if(l<=k) { + if(clist[c[l]].y <= y) + continue; + tc = c[l]; + c[l] = newcand(i,y,oldc); + oldc = tc; + oldl = l; + } else { + c[l] = newcand(i,y,oldc); + k++; + break; + } + } while((y=b[++j]) > 0); + } + return k; +} + +static void +unravel(int p) +{ + int i; + struct cand *q; + + for(i=0; i<=len[0]; i++) { + if (i <= pref) + J[i] = i; + else if (i > len[0]-suff) + J[i] = i+len[1]-len[0]; + else + J[i] = 0; + } + for(q=clist+p;q->y!=0;q=clist+q->pred) + J[q->x+pref] = q->y+pref; +} + +static void +output(void) +{ + int m, i0, i1, j0, j1; + + m = len[0]; + J[0] = 0; + J[m+1] = len[1]+1; + if (mode != 'e') { + for (i0 = 1; i0 <= m; i0 = i1+1) { + while (i0 <= m && J[i0] == J[i0-1]+1) + i0++; + j0 = J[i0-1]+1; + i1 = i0-1; + while (i1 < m && J[i1+1] == 0) + i1++; + j1 = J[i1+1]-1; + J[i1] = j1; + change(i0, i1, j0, j1); + } + } + else { + for (i0 = m; i0 >= 1; i0 = i1-1) { + while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0]) + i0--; + j0 = J[i0+1]-1; + i1 = i0+1; + while (i1 > 1 && J[i1-1] == 0) + i1--; + j1 = J[i1-1]+1; + J[i1] = j1; + change(i1 , i0, j1, j0); + } + } + if (m == 0) + change(1, 0, 1, len[1]); +} + +#define BUF 4096 +static int +cmp(Biobuf* b1, Biobuf* b2) +{ + int n; + uchar buf1[BUF], buf2[BUF]; + int f1, f2; + vlong nc = 1; + uchar *b1s, *b1e, *b2s, *b2e; + + f1 = Bfildes(b1); + f2 = Bfildes(b2); + seek(f1, 0, 0); + seek(f2, 0, 0); + b1s = b1e = buf1; + b2s = b2e = buf2; + for(;;){ + if(b1s >= b1e){ + if(b1s >= &buf1[BUF]) + b1s = buf1; + n = read(f1, b1s, &buf1[BUF] - b1s); + b1e = b1s + n; + } + if(b2s >= b2e){ + if(b2s >= &buf2[BUF]) + b2s = buf2; + n = read(f2, b2s, &buf2[BUF] - b2s); + b2e = b2s + n; + } + n = b2e - b2s; + if(n > b1e - b1s) + n = b1e - b1s; + if(n <= 0) + break; + if(memcmp((void *)b1s, (void *)b2s, n) != 0){ + return 1; + } + nc += n; + b1s += n; + b2s += n; + } + if(b1e - b1s == b2e - b2s) + return 0; + return 1; +} + +void +diffreg(char *f, char *t) +{ + Biobuf *b0, *b1; + int k; + + binary = 0; + b0 = prepare(0, f); + if (!b0) + return; + b1 = prepare(1, t); + if (!b1) { + FREE(file[0]); + Bterm(b0); + return; + } + if (binary){ + // could use b0 and b1 but this is simpler. + if (cmp(b0, b1)) + print("binary files %s %s differ\n", f, t); + Bterm(b0); + Bterm(b1); + return; + } + clen = 0; + prune(); + sort(sfile[0], slen[0]); + sort(sfile[1], slen[1]); + + member = (int *)file[1]; + equiv(sfile[0], slen[0], sfile[1], slen[1], member); + member = REALLOC(member, int, slen[1]+2); + + class = (int *)file[0]; + unsort(sfile[0], slen[0], class); + class = REALLOC(class, int, slen[0]+2); + + klist = MALLOC(int, slen[0]+2); + clist = MALLOC(struct cand, 1); + k = stone(class, slen[0], member, klist); + FREE(member); + FREE(class); + + J = MALLOC(int, len[0]+2); + unravel(klist[k]); + FREE(clist); + FREE(klist); + + ixold = MALLOC(long, len[0]+2); + ixnew = MALLOC(long, len[1]+2); + Bseek(b0, 0, 0); Bseek(b1, 0, 0); + check(b0, b1); + output(); + FREE(J); FREE(ixold); FREE(ixnew); + Bterm(b0); Bterm(b1); /* ++++ */ +} |