1*3ee4fc2aSCody Peter Mello# From Gawk Manual modified by bug fix and removal of punctuation
2*3ee4fc2aSCody Peter Mello
3*3ee4fc2aSCody Peter Mello# Invoker can customize sort command if necessary.
BEGIN {
	# Keep a caller-supplied SORT (e.g. awk -v SORT=...) when it is non-empty
	# and non-zero; otherwise fall back to a sort run with LC_ALL=C.
	SORT = SORT ? SORT : "LC_ALL=C sort"
}
7*3ee4fc2aSCody Peter Mello
8*3ee4fc2aSCody Peter Mello# Record every word which is used at least once
{
	# Walk every field of the current record.  For each one, lower-case it
	# and remember the first run of lowercase letters and hyphens it
	# contains; surrounding punctuation and anything after that first run
	# is dropped.  Fields with no such run are skipped.
	field = 1
	while (field <= NF) {
		word = tolower($field)
		start = match(word, /([[:lower:]]|-)+/)
		# match() returns 0 when nothing matched; otherwise RLENGTH holds
		# the length of the matched run beginning at 'start'.
		if (start != 0)
			used[substr(word, start, RLENGTH)] = 1
		field++
	}
}
16*3ee4fc2aSCody Peter Mello
# Find the number of distinct words longer than 10 characters
END {
	# Emit every distinct recorded word longer than 10 characters through
	# the SORT command, counting them as we go.
	num_long_words = 0
	for (word in used) {
		if (length(word) <= 10)
			continue
		num_long_words++
		print word | SORT
	}
	# The summary line is written to the same pipe, so it is sorted
	# together with the words rather than printed separately.
	print(num_long_words, "long words") | SORT
	# Close the pipe so the sort command sees end-of-input and finishes.
	close(SORT)
}
28