diff options
Diffstat (limited to 'src/cmd/dict/comfix.awk')
-rw-r--r-- | src/cmd/dict/comfix.awk | 56 |
1 files changed, 56 insertions, 0 deletions
# comfix.awk (src/cmd/dict/comfix.awk)
#
# Expands comma-abbreviated entries in a raw dictionary index.
#
# A raw index tends to contain many lines such as
#	1578324<TAB>problematico, a, ci, che
# where everything after the first comma is a list of suffixes for the
# headword.  Every suffix is turned into an index line of its own:
#   - a one-letter suffix replaces the last letter of the headword;
#   - a longer suffix is placed by scanning the headword backwards for the
#     suffix's first letter; if the ending found that way is roughly as
#     long as the suffix, that ending is replaced by the suffix.
# A suffix that cannot be placed is printed as "headword, suffix", so some
# commas will still be left to fix by hand.
#
# Usage: awk -F'\t' -f comfix.awk rawindex > newrawindex
# (fields are tab-separated, matching the \t written on output)

# Anything that is not exactly two fields is copied through unchanged.
NF != 2 {
	print
}

NF == 2 {
	# No comma (or an empty entry): nothing to expand.
	if (index($2, ",") == 0 || length($2) == 0) {
		print $0
		next
	}

	# part[1] is the headword, part[2..nparts] are the suffixes.
	nparts = split($2, part, /,[ ]*/)
	head = part[1]
	printf "%s\t%s\n", $1, head

	for (j = 2; j <= nparts; j++) {
		drop = matchsuflen(head, part[j])
		if (drop == 0)
			# Could not place the suffix: keep the comma form for hand fixing.
			form = head ", " part[j]
		else
			# Replace the last `drop` characters of the headword by the suffix.
			form = substr(head, 1, length(head) - drop) part[j]
		printf "%s\t%s\n", $1, form
	}
}

# matchsuflen(w, suf): number of trailing characters of w that suf replaces,
# or 0 when no plausible replacement point is found.
#   - a one-character suf always replaces exactly one character;
#   - otherwise w is scanned from its end for the first character of suf;
#     k is the length of the ending starting at that character.  The match
#     is rejected when that character does not occur in w, or when k and
#     length(suf) differ by more than 3.
# The names after the gap in the parameter list are locals (pat is unused).
function matchsuflen(w, suf,		wlen,suflen,c,pat,k,d)
{
	suflen = length(suf)
	if (suflen == 1)
		return 1

	wlen = length(w)
	c = substr(suf, 1, 1)

	# Walk backwards from the last character until c is seen.
	k = 1
	while (k <= wlen && substr(w, wlen - k + 1, 1) != c)
		k++
	if (k > wlen)
		return 0

	# Accept only if the old ending is about as long as the new suffix.
	d = (k > suflen) ? k - suflen : suflen - k
	return (d > 3) ? 0 : k
}