1da2e3ebdSchin /***********************************************************************
2da2e3ebdSchin *                                                                      *
3da2e3ebdSchin *               This software is part of the ast package               *
4*b30d1939SAndy Fiddaman *          Copyright (c) 1992-2012 AT&T Intellectual Property          *
5da2e3ebdSchin *                      and is licensed under the                       *
6*b30d1939SAndy Fiddaman *                 Eclipse Public License, Version 1.0                  *
77c2fbfb3SApril Chin *                    by AT&T Intellectual Property                     *
8da2e3ebdSchin *                                                                      *
9da2e3ebdSchin *                A copy of the License is available at                 *
10*b30d1939SAndy Fiddaman *          http://www.eclipse.org/org/documents/epl-v10.html           *
11*b30d1939SAndy Fiddaman *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12da2e3ebdSchin *                                                                      *
13da2e3ebdSchin *              Information and Software Systems Research               *
14da2e3ebdSchin *                            AT&T Research                             *
15da2e3ebdSchin *                           Florham Park NJ                            *
16da2e3ebdSchin *                                                                      *
17da2e3ebdSchin *                 Glenn Fowler <gsf@research.att.com>                  *
18da2e3ebdSchin *                  David Korn <dgk@research.att.com>                   *
19da2e3ebdSchin *                                                                      *
20da2e3ebdSchin ***********************************************************************/
21da2e3ebdSchin #pragma prototyped
22da2e3ebdSchin /*
23da2e3ebdSchin  * uniq
24da2e3ebdSchin  *
25da2e3ebdSchin  * Written by David Korn
26da2e3ebdSchin  */
27da2e3ebdSchin 
28da2e3ebdSchin static const char usage[] =
293e14f97fSRoger A. Faulkner "[-n?\n@(#)$Id: uniq (AT&T Research) 2009-11-28 $\n]"
30da2e3ebdSchin USAGE_LICENSE
31da2e3ebdSchin "[+NAME?uniq - Report or filter out repeated lines in a file]"
3234f9b3eeSRoland Mainz "[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and "
3334f9b3eeSRoland Mainz 	"writes one copy of each input line on the output.  The second "
34da2e3ebdSchin 	"and succeeding copies of the repeated adjacent lines are not "
35da2e3ebdSchin 	"written.]"
36da2e3ebdSchin "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
37da2e3ebdSchin 	"to standard output.  If no \ainfile\a is given, or if the \ainfile\a "
3834f9b3eeSRoland Mainz 	"is \b-\b, \buniq\b reads from standard input with the start of "
3934f9b3eeSRoland Mainz 	"the file defined as the current offset.]"
40da2e3ebdSchin "[c:count?Output the number of times each line occurred  along with "
41da2e3ebdSchin 	"the line.]"
42da2e3ebdSchin "[d:repeated|duplicates?Output the first of each duplicate line.]"
43da2e3ebdSchin "[D:all-repeated?Output all duplicate lines as a group with an empty "
44da2e3ebdSchin     "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
45da2e3ebdSchin     "{"
46da2e3ebdSchin         "[n:none?Do not delimit duplicate groups.]"
47da2e3ebdSchin         "[p:prepend?Prepend an empty line before each group.]"
48da2e3ebdSchin         "[s:separate?Separate each group with an empty line.]"
49da2e3ebdSchin     "}"
50da2e3ebdSchin "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
51da2e3ebdSchin     "before checking for uniqueness. A field is the minimal string matching "
5234f9b3eeSRoland Mainz     "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to "
5334f9b3eeSRoland Mainz     "\b--skip-fields\b=\anumber\a.]"
54da2e3ebdSchin "[i:ignore-case?Ignore case in comparisons.]"
55da2e3ebdSchin "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
56da2e3ebdSchin 	"before checking for uniqueness.  If specified along with \b-f\b, "
57da2e3ebdSchin 	"the first \achars\a after the first \afields\a are ignored.  If "
58da2e3ebdSchin 	"the \achars\a specifies more characters than are on the line, "
5934f9b3eeSRoland Mainz 	"an empty string will be used for comparison. +\anumber\a is "
6034f9b3eeSRoland Mainz 	"equivalent to \b--skip-chars\b=\anumber\a.]"
61da2e3ebdSchin "[u:unique?Output unique lines.]"
62da2e3ebdSchin "[w:check-chars]#[chars?\achars\a is the number of characters to compare "
63da2e3ebdSchin 	"after skipping any specified fields and characters.]"
64da2e3ebdSchin "\n"
65da2e3ebdSchin "\n[infile [outfile]]\n"
66da2e3ebdSchin "\n"
67da2e3ebdSchin "[+EXIT STATUS?]{"
68da2e3ebdSchin 	"[+0?The input file was successfully processed.]"
69da2e3ebdSchin 	"[+>0?An error occurred.]"
70da2e3ebdSchin "}"
71da2e3ebdSchin "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
72da2e3ebdSchin ;
73da2e3ebdSchin 
74da2e3ebdSchin #include <cmd.h>
75da2e3ebdSchin 
/* Bits OR-ed together to form the `mode' argument of uniq(). */
#define C_FLAG	(1<<0)	/* -c: prefix each output line with its count */
#define D_FLAG	(1<<1)	/* -d / -D: only lines that were repeated */
#define U_FLAG	(1<<2)	/* -u: only lines that were not repeated */

/* Geometry of the count column emitted for -c. */
#define CWIDTH	4	/* digits reserved in front of each record */
#define MAXCNT	9999	/* repeat counts below this are formatted in place */

/*
 * Record comparison callback: returns 0 when the first `len' bytes of
 * `lhs' and `rhs' are to be treated as equal.
 */
typedef int (*Compare_f)(const char *lhs, const char *rhs, size_t len);
84da2e3ebdSchin 
/*
 * Copy newline-terminated records from fdin to fdout, treating runs of
 * adjacent records with equal comparison keys as duplicates.
 *
 *	fields	number of leading blank-separated fields excluded from the key
 *	chars	number of further characters excluded from the key; values
 *		<= 0 skip nothing
 *	width	maximum number of characters in the key, or < 0 for no limit
 *	mode	bitwise OR of C_FLAG, D_FLAG and U_FLAG
 *	all	0 unless every member of a duplicate group is to be written;
 *		otherwise points to the group delimiter state: < 0 never
 *		delimit, 0 delimit once a group has been written, > 0 delimit
 *		from the start. The pointed-to value is updated here.
 *	compare	returns 0 when two keys of the given length are equal
 *
 * Returns 0 on success and 1 when a buffer could not be obtained or a
 * write came up short.
 *
 * The most recent record is held back (in the reserved output buffer when
 * it fits, otherwise in a side buffer) until the next record shows whether
 * it was repeated; only then is it written or dropped.
 */
static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
{
	register int n, f, outsize=0, mb = mbwide();
	register char *cp, *ep, *mp, *bufp, *outp=0;
	char *orecp=0, *sbufp=0, *outbuff;
	int reclen,oreclen= -1,count=0,cwidth=0,sep,next;
	if(mode&C_FLAG)
		cwidth = CWIDTH+1;	/* count digits plus one blank */
	while(1)
	{
		/* fetch the next record; n is its length including the newline */
		if(bufp = sfgetr(fdin,'\n',0))
			n = sfvalue(fdin);
		else if(bufp = sfgetr(fdin,'\n',SF_LASTR))
		{
			/* final record lacks a newline: copy it and append one */
			n = sfvalue(fdin);
			if(!(mp = fmtbuf(n + 1)))
				return(1);
			memcpy(mp, bufp, n);
			bufp = mp;
			bufp[n++] = '\n';
		}
		else
			n = 0;	/* end of input */
		if (n)
		{
			cp = bufp;
			ep = cp + n;
			/* skip over fields: blanks followed by non-blanks */
			for (f = fields; f > 0 && cp < ep; f--)
			{
				/* the bounds test must guard both blank tests */
				while (cp < ep && (*cp == ' ' || *cp == '\t'))
					cp++;
				while (cp < ep && *cp != ' ' && *cp != '\t')
					cp++;
			}
			if (chars > 0)
			{
				if (mb)
				{
					/* never step past the end of the record */
					for (f = chars; f > 0 && cp < ep; f--)
						mbchar(cp);
				}
				else if (chars < ep - cp)
					cp += chars;
				else
					cp = ep;
			}
			if ((reclen = n - (cp - bufp)) <= 0)
			{
				/* everything was skipped: the key is just the newline */
				reclen = 1;
				cp = bufp + n - 1;
			}
			else if (width >= 0 && width < reclen)
			{
				if (mb)
				{
					/* convert a width in characters to a length in bytes */
					reclen = 0;
					mp = cp;
					while (reclen < width && mp < ep)
					{
						reclen++;
						mbchar(mp);
					}
					reclen = mp - cp;
				}
				else
					reclen = width;
			}
		}
		else
			reclen = -2;	/* never equals oreclen, so the held record is flushed */
		if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen)))
		{
			/* same key as the held record */
			count++;
			if (!all)
				continue;
			next = count;
		}
		else
		{
			next = 0;
			if(outsize>0)
			{
				/* decide the fate of the held record */
				if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count))
				{
					/*
					 * -d with no repeat, or -u with a repeat: drop it;
					 * a zero length write releases the reserved buffer
					 */
					if(outp!=sbufp)
						sfwrite(fdout,outp,0);
				}
				else
				{
					if(cwidth)
					{
						/* count is the number of repeats; occurrences = count+1 */
						if(count<9)
						{
							/* single digit: blanks, digit, blank */
							f = 0;
							while(f < CWIDTH-1)
								outp[f++] = ' ';
							outp[f++] = '0' + count + 1;
							outp[f] = ' ';
						}
						else if(count<MAXCNT)
						{
							/* fits in CWIDTH digits: fill right to left */
							count++;
							f = CWIDTH;
							outp[f--] = ' ';
							do
							{
								outp[f--] = '0' + (count % 10);
							} while (count /= 10);
							while (f >= 0)
								outp[f--] = ' ';
						}
						else
						{
							/*
							 * too wide for the column: print the count
							 * separately and write the record without
							 * its reserved count prefix
							 */
							outsize -= (CWIDTH+1);
							if(outp!=sbufp)
							{
								/* move the record out of the reserved buffer first */
								if(!(sbufp=fmtbuf(outsize)))
									return(1);
								memcpy(sbufp,outp+CWIDTH+1,outsize);
								sfwrite(fdout,outp,0);
								outp = sbufp;
							}
							else
								outp += CWIDTH+1;
							sfprintf(fdout,"%4d ",count+1);
						}
					}
					if(sfwrite(fdout,outp,outsize) != outsize)
						return(1);
				}
			}
		}
		if(n==0)
			break;
		if(count = next)
		{
			/* all-repeated mode: write the held member of the group now */
			if(sfwrite(fdout,outp,outsize) != outsize)
				return(1);
			if(*all >= 0)
				*all = 1;
			sep = 0;
		}
		else
			sep = all && *all > 0;	/* new group: maybe lead with an empty line */
		/* save current record */
		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
			return(1);
		outp = outbuff;
		if(outsize < n+cwidth+sep)
		{
			/* no room in outp, clear lock and use side buffer */
			sfwrite(fdout,outp,0);
			if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep)))
				return(1);
		}
		else
			outsize = n+cwidth+sep;
		/* layout: [count column][separator newline][record] */
		memcpy(outp+cwidth+sep,bufp,n);
		if(sep)
			outp[cwidth] = '\n';
		oreclen = reclen;
		/* the key of the saved copy sits at the same offset as in bufp */
		orecp = outp+cwidth+sep + (cp-bufp);
	}
	return(0);
}
244da2e3ebdSchin 
245da2e3ebdSchin int
b_uniq(int argc,char ** argv,Shbltin_t * context)246*b30d1939SAndy Fiddaman b_uniq(int argc, char** argv, Shbltin_t* context)
247da2e3ebdSchin {
248*b30d1939SAndy Fiddaman 	register int mode=0;
249da2e3ebdSchin 	register char *cp;
250da2e3ebdSchin 	int fields=0, chars=0, width=-1;
251da2e3ebdSchin 	Sfio_t *fpin, *fpout;
252da2e3ebdSchin 	int* all = 0;
253da2e3ebdSchin 	int sep;
254da2e3ebdSchin 	Compare_f compare = (Compare_f)memcmp;
255da2e3ebdSchin 
256da2e3ebdSchin 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
257*b30d1939SAndy Fiddaman 	for (;;)
258da2e3ebdSchin 	{
259*b30d1939SAndy Fiddaman 		switch (optget(argv, usage))
260da2e3ebdSchin 		{
261*b30d1939SAndy Fiddaman 		case 'c':
262*b30d1939SAndy Fiddaman 			mode |= C_FLAG;
263*b30d1939SAndy Fiddaman 			continue;
264*b30d1939SAndy Fiddaman 		case 'd':
265*b30d1939SAndy Fiddaman 			mode |= D_FLAG;
266*b30d1939SAndy Fiddaman 			continue;
267*b30d1939SAndy Fiddaman 		case 'D':
268*b30d1939SAndy Fiddaman 			mode |= D_FLAG;
269*b30d1939SAndy Fiddaman 			switch ((int)opt_info.num)
270*b30d1939SAndy Fiddaman 			{
271*b30d1939SAndy Fiddaman 			case 'p':
272*b30d1939SAndy Fiddaman 				sep = 1;
273*b30d1939SAndy Fiddaman 				break;
274*b30d1939SAndy Fiddaman 			case 's':
275*b30d1939SAndy Fiddaman 				sep = 0;
276*b30d1939SAndy Fiddaman 				break;
277*b30d1939SAndy Fiddaman 			default:
278*b30d1939SAndy Fiddaman 				sep = -1;
279*b30d1939SAndy Fiddaman 				break;
280*b30d1939SAndy Fiddaman 			}
281*b30d1939SAndy Fiddaman 			all = &sep;
282*b30d1939SAndy Fiddaman 			continue;
283*b30d1939SAndy Fiddaman 		case 'i':
284*b30d1939SAndy Fiddaman 			compare = (Compare_f)strncasecmp;
285*b30d1939SAndy Fiddaman 			continue;
286*b30d1939SAndy Fiddaman 		case 'u':
287*b30d1939SAndy Fiddaman 			mode |= U_FLAG;
288*b30d1939SAndy Fiddaman 			continue;
289*b30d1939SAndy Fiddaman 		case 'f':
290*b30d1939SAndy Fiddaman 			if(*opt_info.option=='-')
291*b30d1939SAndy Fiddaman 				fields = opt_info.num;
292*b30d1939SAndy Fiddaman 			else
293*b30d1939SAndy Fiddaman 				chars = opt_info.num;
294*b30d1939SAndy Fiddaman 			continue;
295da2e3ebdSchin 		case 's':
296*b30d1939SAndy Fiddaman 			chars = opt_info.num;
297*b30d1939SAndy Fiddaman 			continue;
298*b30d1939SAndy Fiddaman 		case 'w':
299*b30d1939SAndy Fiddaman 			width = opt_info.num;
300*b30d1939SAndy Fiddaman 			continue;
301*b30d1939SAndy Fiddaman 		case ':':
302*b30d1939SAndy Fiddaman 			error(2, "%s", opt_info.arg);
303da2e3ebdSchin 			break;
304*b30d1939SAndy Fiddaman 		case '?':
305*b30d1939SAndy Fiddaman 			error(ERROR_usage(2), "%s", opt_info.arg);
306da2e3ebdSchin 			break;
307da2e3ebdSchin 		}
308da2e3ebdSchin 		break;
309da2e3ebdSchin 	}
310da2e3ebdSchin 	argv += opt_info.index;
311da2e3ebdSchin 	if(all && (mode&C_FLAG))
312da2e3ebdSchin 		error(2, "-c and -D are mutually exclusive");
313da2e3ebdSchin 	if(error_info.errors)
314da2e3ebdSchin 		error(ERROR_usage(2), "%s", optusage(NiL));
315da2e3ebdSchin 	if((cp = *argv) && (argv++,!streq(cp,"-")))
316da2e3ebdSchin 	{
317da2e3ebdSchin 		if(!(fpin = sfopen(NiL,cp,"r")))
318da2e3ebdSchin 			error(ERROR_system(1),"%s: cannot open",cp);
319da2e3ebdSchin 	}
320da2e3ebdSchin 	else
321da2e3ebdSchin 		fpin = sfstdin;
322da2e3ebdSchin 	if(cp = *argv)
323da2e3ebdSchin 	{
324da2e3ebdSchin 		argv++;
325da2e3ebdSchin 		if(!(fpout = sfopen(NiL,cp,"w")))
326da2e3ebdSchin 			error(ERROR_system(1),"%s: cannot create",cp);
327da2e3ebdSchin 	}
328da2e3ebdSchin 	else
329da2e3ebdSchin 		fpout = sfstdout;
330da2e3ebdSchin 	if(*argv)
331da2e3ebdSchin 	{
332da2e3ebdSchin 		error(2, "too many arguments");
333da2e3ebdSchin 		error(ERROR_usage(2), "%s", optusage(NiL));
334da2e3ebdSchin 	}
335da2e3ebdSchin 	error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
336da2e3ebdSchin 	if(fpin!=sfstdin)
337da2e3ebdSchin 		sfclose(fpin);
338da2e3ebdSchin 	if(fpout!=sfstdout)
339da2e3ebdSchin 		sfclose(fpout);
340da2e3ebdSchin 	return(error_info.errors);
341da2e3ebdSchin }
342da2e3ebdSchin 
343