aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/dict/comfix.awk
diff options
context:
space:
mode:
authorrsc <devnull@localhost>2003-11-25 03:37:45 +0000
committerrsc <devnull@localhost>2003-11-25 03:37:45 +0000
commit08708877939323c1e1cb87210193ec25fc472ff7 (patch)
treebd34e2144a3e9532ab228619d7ae8d4a0078aeeb /src/cmd/dict/comfix.awk
parent091f74d0a0db5ba1e098a518922525cb032a97b4 (diff)
downloadplan9port-08708877939323c1e1cb87210193ec25fc472ff7.tar.gz
plan9port-08708877939323c1e1cb87210193ec25fc472ff7.tar.bz2
plan9port-08708877939323c1e1cb87210193ec25fc472ff7.zip
add dict
Diffstat (limited to 'src/cmd/dict/comfix.awk')
-rw-r--r--src/cmd/dict/comfix.awk56
1 files changed, 56 insertions, 0 deletions
diff --git a/src/cmd/dict/comfix.awk b/src/cmd/dict/comfix.awk
new file mode 100644
index 00000000..9a51b98e
--- /dev/null
+++ b/src/cmd/dict/comfix.awk
@@ -0,0 +1,56 @@
+# when raw index has a lot of entries like
+# 1578324 problematico, a, ci, che
+# apply this algorithm:
+# treat things after comma as suffixes
+# for each suffix:
+# if single letter, replace last letter
+# else search backwards for beginning of suffix
+# and if it leads to an old suffix of approximately
+# the same length, replace that suffix
+# This will still leave some commas to fix by hand
+# Usage: awk -F'\t' -f comfix.awk rawindex > newrawindex
+
+# For a record with exactly two fields, $2 is treated as "word, suf1, ...":
+# print $1 with the bare word, then one line per suffix with it applied.
+NF == 2 {
+	if($2 !~ /,/)
+		print
+	else {
+		nparts = split($2, part, /,[ ]*/)
+		base = part[1]
+		printf "%s\t%s\n", $1, base
+		for(j = 2; j <= nparts; j++) {
+			cut = matchsuflen(base, part[j])
+			# cut == 0: no match point; keep the comma form for hand-fixing
+			if(cut == 0)
+				printf "%s\t%s\n", $1, base ", " part[j]
+			else
+				printf "%s\t%s\n", $1, substr(base, 1, length(base)-cut) part[j]
+		}
+	}
+}
+# Any record that does not have exactly two fields is left alone:
+# it is copied to the output unchanged.
+NF != 2 { print }
+
+# matchsuflen(w, suf): how many trailing characters of w the suffix suf
+# should replace; 0 means no plausible replacement point was found.
+function matchsuflen(w, suf, wlen,suflen,c,pos,k,d)
+{
+	suflen = length(suf)
+	# a single letter always replaces just the last letter of w
+	if(suflen == 1)
+		return 1
+	wlen = length(w)
+	c = substr(suf, 1, 1)
+	# scan w from the right for the first character of suf
+	pos = wlen
+	while(pos >= 1 && substr(w, pos, 1) != c)
+		pos--
+	if(pos < 1)
+		return 0
+	k = wlen - pos + 1
+	# reject when the old ending and suf differ in length by more than 3
+	d = (k > suflen) ? k - suflen : suflen - k
+	return (d > 3) ? 0 : k
+}