# src/cmd/dict/comfix.awk - plan9 - Git at Google

 # when raw index has a lot of entries like
 # 1578324	problematico, a, ci, che
 # apply this algorithm:
 #  treat things after comma as suffixes
 #  for each suffix:
 #      if single letter, replace last letter
 #      else search backwards for beginning of suffix
 #      and if it leads to an old suffix of approximately
 #      the same length, replace that suffix
 # This will still leave some commas to fix by hand
 # Usage: awk -F'	' -f comfix.awk rawindex > newrawindex

 # Two-field records: expand "word, suf1, suf2, ..." in field 2 into one
 # output line per form, each carrying field 1 unchanged.
 NF == 2	{
 		i = index($2, ",")
 		if(i != 0 && length($2) != 0) {
 			n = split($2, a, /,[ ]*/)
 			w = a[1]
 			# The base word always gets a line of its own.
 			printf "%s\t%s\n", $1, w
 			i = 2
 			while(i <= n) {
 				suf = a[i]
 				m = matchsuflen(w, suf)
 				# m == 0: no believable place to graft the suffix, so keep
 				# the "word, suffix" form for fixing by hand.
 				if(m == 0)
 					printf "%s\t%s\n", $1, w ", " suf
 				else {
 					nw = substr(w, 1, length(w)-m) suf
 					printf "%s\t%s\n", $1, nw
 				}
 				i++
 			}
 		} else
 			print $0
 	}
 # Records that do not have exactly two fields are passed through untouched.
 NF != 2 {
 	print
 	}

 # matchsuflen(w, suf): number of trailing characters of w that suf should
 # replace, or 0 when no plausible replacement point exists.
 #   - a one-character suffix always replaces the last character (returns 1)
 #   - otherwise w is scanned from its end for the first character of suf;
 #     the distance k from the end is returned only when it differs from
 #     length(suf) by at most 3
 # Parameters after the wide gap are locals.
 function matchsuflen(w, suf,		wlen,suflen,c,k,d)
 {
 	wlen = length(w)
 	suflen = length(suf)
 	if(suflen == 1)
 		return 1
 	c = substr(suf, 1, 1)
 	# k counts from the end of w: k == 1 is the last character.
 	for(k = 1; k <= wlen; k++)
 		if(substr(w, wlen-k+1, 1) == c)
 			break
 	# The first character of suf does not occur in w at all.
 	if(k > wlen)
 		return 0
 	d = k - suflen
 	if(d < 0)
 		d = -d
 	# Old and new suffix lengths differ too much to be a believable match.
 	if(d > 3)
 		return 0
 	return k
 }
	# when raw index has a lot of entries like
	# 1578324	problematico, a, ci, che
	# apply this algorithm:
	#  treat things after comma as suffixes
	#  for each suffix:
	#      if single letter, replace last letter
	#      else search backwards for beginning of suffix
	#      and if it leads to an old suffix of approximately
	#      the same length, replace that suffix
	# This will still leave some commas to fix by hand
	# Usage: awk -F'	' -f comfix.awk rawindex > newrawindex

	# Two-field records: expand "word, suf1, suf2, ..." in field 2 into one
	# output line per form, each carrying field 1 unchanged.
	# NOTE(review): an equivalent NF == 2 rule appears earlier in this file;
	# with both present every such record is emitted twice -- confirm which
	# copy is meant to stay.
	NF == 2 {
		i = index($2, ",")
		if(i == 0 || length($2) == 0)
			print $0
		else {
			n = split($2, a, /,[ ]*/)
			w = a[1]
			# The base word always gets a line of its own.
			printf "%s\t%s\n", $1, w
			for(i = 2; i <= n; i++) {
				suf = a[i]
				m = matchsuflen(w, suf)
				if(m) {
					# Drop the last m characters of w and graft the suffix on.
					nw = substr(w, 1, length(w)-m) suf
					printf "%s\t%s\n", $1, nw
				} else
					printf "%s\t%s\n", $1, w ", " suf
			}
		}
	}
	# Records that do not have exactly two fields are passed through untouched.
	# NOTE(review): the same rule appears earlier in this file; with both
	# present such records are printed twice -- confirm which copy should stay.
	NF != 2 {
		print
	}

	# matchsuflen(w, suf): number of trailing characters of w that suf should
	# replace, or 0 when no plausible replacement point exists.
	#   - a one-character suffix always replaces the last character (returns 1)
	#   - otherwise w is scanned from its end for the first character of suf;
	#     the distance k from the end is returned only when it differs from
	#     length(suf) by at most 3
	# Parameters after the wide gap are locals.
	# NOTE(review): matchsuflen is also defined earlier in this file; awk
	# implementations such as gawk reject a second definition of the same
	# function, so one copy should be dropped.
	function matchsuflen(w, suf,		wlen,suflen,c,k,d)
	{
		wlen = length(w)
		suflen = length(suf)
		if(suflen == 1)
			return 1
		c = substr(suf, 1, 1)
		# k counts from the end of w: k == 1 is the last character.
		for(k = 1; k <= wlen; k++)
			if(substr(w, wlen-k+1, 1) == c)
				break
		# The first character of suf does not occur in w at all.
		if(k > wlen)
			return 0
		d = k - suflen
		if(d < 0)
			d = -d
		# Old and new suffix lengths differ too much to be a believable match.
		if(d > 3)
			return 0
		return k
	}