| # when raw index has a lot of entries like | 
 | # 1578324	problematico, a, ci, che | 
 | # apply this algorithm: | 
 | #  treat things after comma as suffixes | 
 | #  for each suffix: | 
 | #      if single letter, replace last letter | 
 | #      else search backwards for beginning of suffix | 
 | #      and if it leads to an old suffix of approximately | 
 | #      the same length, replace that suffix | 
 | # This will still leave some commas to fix by hand | 
 | # Usage: awk -F'	' -f comfix.awk rawindex > newrawindex | 
 |  | 
 | # Records with exactly two fields: when the second field contains commas, | 
 | # emit one record per comma-separated piece, each carrying the first | 
 | # field.  The first piece is the headword; every later piece is treated | 
 | # as an ending to splice onto the headword. | 
 | NF == 2	{ | 
 | 		if(index($2, ",") != 0 && length($2) != 0) { | 
 | 			nparts = split($2, part, /,[ ]*/) | 
 | 			head = part[1] | 
 | 			printf "%s\t%s\n", $1, head | 
 | 			j = 2 | 
 | 			while(j <= nparts) { | 
 | 				ending = part[j] | 
 | 				cut = matchsuflen(head, ending) | 
 | 				# cut == 0 means no plausible splice point was found: | 
 | 				# keep "head, ending" so the comma remains to fix by hand | 
 | 				if(cut) | 
 | 					form = substr(head, 1, length(head)-cut) ending | 
 | 				else | 
 | 					form = head ", " ending | 
 | 				printf "%s\t%s\n", $1, form | 
 | 				j++ | 
 | 			} | 
 | 		} else | 
 | 			print $0 | 
 | 	} | 
 | # Any record that does not have exactly two fields is copied through unchanged. | 
 | NF != 2 { | 
 | 	print | 
 | 	} | 
 |  | 
 | # matchsuflen(w, suf): number of trailing characters of w that suf should | 
 | # replace, or 0 when no plausible replacement point exists. | 
 | #   w    the headword | 
 | #   suf  the candidate replacement ending | 
 | # A one-letter suf always replaces just the last letter of w.  Otherwise | 
 | # the last occurrence in w of suf's first character marks the start of | 
 | # the old ending, which is accepted only if its length differs from | 
 | # length(suf) by at most 3.  An empty w or an empty suf never matches, | 
 | # so the caller keeps the comma for fixing by hand. | 
 | # Parameters after the wide gap are local scratch variables. | 
 | function matchsuflen(w, suf,		wlen,suflen,c,k,d) | 
 | { | 
 | 	wlen = length(w) | 
 | 	suflen = length(suf) | 
 | 	# nothing to replace in an empty word, nothing to splice from an empty suffix | 
 | 	if(wlen == 0 || suflen == 0) | 
 | 		return 0 | 
 | 	if(suflen == 1) | 
 | 		return 1 | 
 | 	c = substr(suf, 1, 1) | 
 | 	# scan backwards; k counts characters from the end of w | 
 | 	for(k = 1; k <= wlen; k++) | 
 | 		if(substr(w, wlen-k+1, 1) == c) | 
 | 			break | 
 | 	if(k > wlen) | 
 | 		return 0 | 
 | 	# reject when old and new endings differ in length by more than 3 | 
 | 	d = k-suflen | 
 | 	if(d < 0) | 
 | 		d = -d | 
 | 	if(d > 3) | 
 | 		return 0 | 
 | 	return k | 
 | } | 