|  | # when raw index has a lot of entries like | 
|  | # 1578324	problematico, a, ci, che | 
|  | # apply this algorithm: | 
|  | #  treat things after comma as suffixes | 
|  | #  for each suffix: | 
|  | #      if single letter, replace last letter | 
|  | #      else search backwards for beginning of suffix | 
|  | #      and if it leads to an old suffix of approximately | 
|  | #      the same length, replace that suffix | 
|  | # This will still leave some commas to fix by hand | 
|  | # Usage: awk -F'	' -f comfix.awk rawindex > newrawindex | 
|  |  | 
# Two-field record (key, entry).  When the entry looks like
# "word, suf1, suf2, ..." emit one "key<TAB>form" line for the base word and
# one per suffix; entries without a comma are passed through unchanged.
# Note: i, n, a, w, suf, m and nw are ordinary awk globals, so their names
# are kept exactly as they were.
NF == 2 {
    i = index($2, ",")
    if (i != 0 && length($2) != 0) {
        # The first piece is the base word; every later piece is a suffix.
        n = split($2, a, /,[ ]*/)
        w = a[1]
        printf "%s\t%s\n", $1, w

        i = 2
        while (i <= n) {
            suf = a[i]
            # m is how many trailing characters of w the suffix replaces;
            # 0 means no acceptable attachment point was found.
            m = matchsuflen(w, suf)
            if (!m) {
                # Leave the comma form in place so it can be fixed by hand.
                printf "%s\t%s\n", $1, w ", " suf
            } else {
                nw = substr(w, 1, length(w) - m) suf
                printf "%s\t%s\n", $1, nw
            }
            i++
        }
    } else
        print $0
}
# Any record that does not have exactly two fields is copied through untouched.
NF != 2 { print $0 }
|  |  | 
# matchsuflen(w, suf)
#   Returns the number of trailing characters of w that suf should replace,
#   or 0 when no acceptable attachment point exists.
#     * A one-character suffix always replaces exactly the last character.
#     * Otherwise the occurrence of suf's first character nearest the end of
#       w is located; its distance from the end (counting that character) is
#       the candidate length.  The candidate is accepted only when it differs
#       from length(suf) by at most 3.
#   Names after the wide gap in the parameter list are local variables.
function matchsuflen(w, suf,        nword, nsuf, first, pos, back, gap)
{
    nsuf = length(suf)
    if (nsuf == 1)
        return 1

    nword = length(w)
    first = substr(suf, 1, 1)

    # Walk from the last character of w toward the front.
    for (pos = nword; pos >= 1; pos--)
        if (substr(w, pos, 1) == first)
            break

    # Ran off the front: the suffix's first character never occurs in w.
    if (pos < 1)
        return 0

    back = nword - pos + 1      # characters from the match to the end of w
    gap = back - nsuf
    if (gap < 0)
        gap = -gap

    # Reject attachment points whose replaced tail is too unlike the suffix in length.
    return (gap > 3) ? 0 : back
}