| # when raw index has a lot of entries like |
| # 1578324 problematico, a, ci, che |
| # treat things after comma as suffixes |
| # if single letter, replace last letter |
| # else search backwards for beginning of suffix |
| # and if it leads to an old suffix of approximately |
| # the same length, replace that suffix |
| # This will still leave some commas to fix by hand |
| # Usage: awk -F'\t' -f comfix.awk rawindex > newrawindex |
| if(i == 0 || length($2) == 0) |
| n = split($2, a, /,[ ]*/) |
| for(i = 2; i <= n; i++) { |
| nw = substr(w, 1, length(w)-m) suf |
| printf "%s\t%s\n", $1, nw |
| printf "%s\t%s\n", $1, w ", " suf |
| function matchsuflen(w, suf, wlen,suflen,c,pat,k,d) |
| for (k = 1; k <= wlen ; k++) |
| if(substr(w, wlen-k+1, 1) == c) |