1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1992-2012 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                 Eclipse Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *          http://www.eclipse.org/org/documents/epl-v10.html           *
11 *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                                                                      *
20 ***********************************************************************/
21 #pragma prototyped
22 /*
23  * uniq
24  *
25  * Written by David Korn
26  */
27 
/*
 * Option and help description handed to optget() by b_uniq().
 * Each bracketed entry declares one option (short letter, long name
 * and, where present, its argument) followed by the help text for it.
 * The "[infile [outfile]]" line is the operand synopsis.
 * This text is read at run time, so it must not be reworded casually.
 */
28 static const char usage[] =
29 "[-n?\n@(#)$Id: uniq (AT&T Research) 2009-11-28 $\n]"
30 USAGE_LICENSE
31 "[+NAME?uniq - Report or filter out repeated lines in a file]"
32 "[+DESCRIPTION?\buniq\b reads the input, compares adjacent lines, and "
33 	"writes one copy of each input line on the output.  The second "
34 	"and succeeding copies of the repeated adjacent lines are not "
35 	"written.]"
36 "[+?If the output file, \aoutfile\a, is not specified, \buniq\b writes "
37 	"to standard output.  If no \ainfile\a is given, or if the \ainfile\a "
38 	"is \b-\b, \buniq\b reads from standard input with the start of "
39 	"the file defined as the current offset.]"
40 "[c:count?Output the number of times each line occurred  along with "
41 	"the line.]"
42 "[d:repeated|duplicates?Output the first of each duplicate line.]"
43 "[D:all-repeated?Output all duplicate lines as a group with an empty "
44     "line delimiter specified by \adelimit\a:]:?[delimit:=none]"
45     "{"
46         "[n:none?Do not delimit duplicate groups.]"
47         "[p:prepend?Prepend an empty line before each group.]"
48         "[s:separate?Separate each group with an empty line.]"
49     "}"
50 "[f:skip-fields]#[fields?\afields\a is the number of fields to skip over "
51     "before checking for uniqueness. A field is the minimal string matching "
52     "the BRE \b[[:blank:]]]]*[^[:blank:]]]]*\b. -\anumber\a is equivalent to "
53     "\b--skip-fields\b=\anumber\a.]"
54 "[i:ignore-case?Ignore case in comparisons.]"
55 "[s:skip-chars]#[chars?\achars\a is the number of characters to skip over "
56 	"before checking for uniqueness.  If specified along with \b-f\b, "
57 	"the first \achars\a after the first \afields\a are ignored.  If "
58 	"the \achars\a specifies more characters than are on the line, "
59 	"an empty string will be used for comparison. +\anumber\a is "
60 	"equivalent to \b--skip-chars\b=\anumber\a.]"
61 "[u:unique?Output unique lines.]"
62 "[w:check-chars]#[chars?\achars\a is the number of characters to compare "
63 	"after skipping any specified fields and characters.]"
64 "\n"
65 "\n[infile [outfile]]\n"
66 "\n"
67 "[+EXIT STATUS?]{"
68 	"[+0?The input file was successfully processed.]"
69 	"[+>0?An error occurred.]"
70 "}"
71 "[+SEE ALSO?\bsort\b(1), \bgrep\b(1)]"
72 ;
73 
74 #include <cmd.h>
75 
/* output-selection bits combined in the mode argument of uniq() */
#define C_FLAG	(1<<0)	/* -c: prefix each output line with its count */
#define D_FLAG	(1<<1)	/* -d, -D: drop lines that were not repeated */
#define U_FLAG	(1<<2)	/* -u: drop lines that were repeated */

/* layout of the count column written when C_FLAG is set */
#define CWIDTH	4	/* columns reserved for the count digits */
#define MAXCNT	9999	/* largest count formatted in place by uniq() */

/* key comparison callback: a zero result means the first n bytes match */
typedef int (*Compare_f)(const char *a, const char *b, size_t n);
84 
/*
 * Copy fdin to fdout, collapsing each run of adjacent lines whose
 * comparison keys are equal.
 *
 * fields   number of leading blank/tab separated fields left out of the key
 * chars    number of characters left out of the key after the fields;
 *          values <= 0 skip nothing
 * width    maximum number of key characters compared, <0 for no limit
 * mode     bit mask of C_FLAG, D_FLAG and U_FLAG
 * all      0 unless every line of a duplicate run is to be output.
 *          Otherwise *all > 0 puts an empty line ahead of the next record
 *          that starts a new run, and *all is set to 1 once a duplicate
 *          has been output unless it is negative.
 * compare  returns 0 when two keys of the given byte length are equal
 *
 * Returns 0 on success, 1 if an output write or a scratch buffer
 * request fails.
 */
static int uniq(Sfio_t *fdin, Sfio_t *fdout, int fields, int chars, int width, int mode, int* all, Compare_f compare)
{
	int n, f, outsize=0, mb = mbwide();
	char *cp=0, *ep, *mp, *bufp, *outp=0;
	char *orecp=0, *sbufp=0, *outbuff;
	int reclen,oreclen= -1,count=0,cwidth=0,sep,next;

	if(mode&C_FLAG)
		cwidth = CWIDTH+1;	/* count digits plus one separating space */
	while(1)
	{
		/* fetch the next record; n is its length with the newline, 0 at end of input */
		if((bufp = sfgetr(fdin,'\n',0)))
			n = sfvalue(fdin);
		else if((bufp = sfgetr(fdin,'\n',SF_LASTR)))
		{
			/* trailing data without a newline: copy it and append one */
			n = sfvalue(fdin);
			if(!(mp = fmtbuf(n + 1)))
				return(1);
			bufp = memcpy(mp, bufp, n);
			bufp[n++] = '\n';
		}
		else
			n = 0;
		if (n)
		{
			/* locate the comparison key [cp, cp+reclen) inside the record */
			cp = bufp;
			ep = cp + n;
			f = fields;
			while (f-->0 && cp<ep) /* skip over fields */
			{
				/* the bounds test must guard both blank tests */
				while (cp<ep && (*cp==' ' || *cp=='\t'))
					cp++;
				while (cp<ep && *cp!=' ' && *cp!='\t')
					cp++;
			}
			if (chars > 0)
			{
				if (mb)
				{
					/* step over whole characters, stopping at the end of the record */
					for (f = chars; f > 0 && cp < ep; f--)
						mbchar(cp);
				}
				else if (chars < ep - cp)
					cp += chars;
				else
					cp = ep;
			}
			if ((reclen = n - (cp - bufp)) <= 0)
			{
				/* nothing left after skipping: the key is just the newline */
				reclen = 1;
				cp = bufp + n - 1;
			}
			else if (width >= 0 && width < reclen)
			{
				if (mb)
				{
					/* turn a width in characters into a length in bytes */
					reclen = 0;
					mp = cp;
					while (reclen < width && mp < ep)
					{
						reclen++;
						mbchar(mp);
					}
					reclen = mp - cp;
				}
				else
					reclen = width;
			}
		}
		else
			reclen = -2;	/* end of input: equals no saved key length */
		if(reclen==oreclen && (!reclen || !(*compare)(cp,orecp,reclen)))
		{
			/* same key as the saved record */
			count++;
			if (!all)
				continue;
			next = count;
		}
		else
		{
			/* key changed or input ended: dispose of the saved record */
			next = 0;
			if(outsize>0)
			{
				if(((mode&D_FLAG)&&count==0) || ((mode&U_FLAG)&&count))
				{
					/* suppressed: give back the reserved stream buffer unused */
					if(outp!=sbufp)
						sfwrite(fdout,outp,0);
				}
				else
				{
					if(cwidth)
					{
						/* count holds the repeats after the first line, so print count+1 */
						if(count<9)
						{
							/* single digit: blanks, the digit, one space */
							f = 0;
							while(f < CWIDTH-1)
								outp[f++] = ' ';
							outp[f++] = '0' + count + 1;
							outp[f] = ' ';
						}
						else if(count<MAXCNT)
						{
							/* fill the column right to left, then pad with blanks */
							count++;
							f = CWIDTH;
							outp[f--] = ' ';
							do
							{
								outp[f--] = '0' + (count % 10);
							} while (count /= 10);
							while (f >= 0)
								outp[f--] = ' ';
						}
						else
						{
							/* too wide for the column: print the count separately */
							outsize -= (CWIDTH+1);
							if(outp!=sbufp)
							{
								/* move the line out of the stream buffer before printing into the stream */
								if(!(sbufp=fmtbuf(outsize)))
									return(1);
								memcpy(sbufp,outp+CWIDTH+1,outsize);
								sfwrite(fdout,outp,0);
								outp = sbufp;
							}
							else
								outp += CWIDTH+1;
							sfprintf(fdout,"%4d ",count+1);
						}
					}
					if(sfwrite(fdout,outp,outsize) != outsize)
						return(1);
				}
			}
		}
		if(n==0)
			break;
		if((count = next))
		{
			/* all-duplicates mode: the saved record is part of a run, output it now */
			if(sfwrite(fdout,outp,outsize) != outsize)
				return(1);
			if(*all >= 0)
				*all = 1;
			sep = 0;
		}
		else
			sep = all && *all > 0;
		/* save current record */
		if (!(outbuff = sfreserve(fdout, 0, 0)) || (outsize = sfvalue(fdout)) < 0)
			return(1);
		outp = outbuff;
		if(outsize < n+cwidth+sep)
		{
			/* no room in outp, clear lock and use side buffer */
			sfwrite(fdout,outp,0);
			if(!(sbufp = outp=fmtbuf(outsize=n+cwidth+sep)))
				return(1);
		}
		else
			outsize = n+cwidth+sep;
		memcpy(outp+cwidth+sep,bufp,n);
		if(sep)
			outp[cwidth] = '\n';
		/* remember where the key sits inside the saved copy */
		oreclen = reclen;
		orecp = outp+cwidth+sep + (cp-bufp);
	}
	return(0);
}
244 
245 int
b_uniq(int argc,char ** argv,Shbltin_t * context)246 b_uniq(int argc, char** argv, Shbltin_t* context)
247 {
248 	register int mode=0;
249 	register char *cp;
250 	int fields=0, chars=0, width=-1;
251 	Sfio_t *fpin, *fpout;
252 	int* all = 0;
253 	int sep;
254 	Compare_f compare = (Compare_f)memcmp;
255 
256 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
257 	for (;;)
258 	{
259 		switch (optget(argv, usage))
260 		{
261 		case 'c':
262 			mode |= C_FLAG;
263 			continue;
264 		case 'd':
265 			mode |= D_FLAG;
266 			continue;
267 		case 'D':
268 			mode |= D_FLAG;
269 			switch ((int)opt_info.num)
270 			{
271 			case 'p':
272 				sep = 1;
273 				break;
274 			case 's':
275 				sep = 0;
276 				break;
277 			default:
278 				sep = -1;
279 				break;
280 			}
281 			all = &sep;
282 			continue;
283 		case 'i':
284 			compare = (Compare_f)strncasecmp;
285 			continue;
286 		case 'u':
287 			mode |= U_FLAG;
288 			continue;
289 		case 'f':
290 			if(*opt_info.option=='-')
291 				fields = opt_info.num;
292 			else
293 				chars = opt_info.num;
294 			continue;
295 		case 's':
296 			chars = opt_info.num;
297 			continue;
298 		case 'w':
299 			width = opt_info.num;
300 			continue;
301 		case ':':
302 			error(2, "%s", opt_info.arg);
303 			break;
304 		case '?':
305 			error(ERROR_usage(2), "%s", opt_info.arg);
306 			break;
307 		}
308 		break;
309 	}
310 	argv += opt_info.index;
311 	if(all && (mode&C_FLAG))
312 		error(2, "-c and -D are mutually exclusive");
313 	if(error_info.errors)
314 		error(ERROR_usage(2), "%s", optusage(NiL));
315 	if((cp = *argv) && (argv++,!streq(cp,"-")))
316 	{
317 		if(!(fpin = sfopen(NiL,cp,"r")))
318 			error(ERROR_system(1),"%s: cannot open",cp);
319 	}
320 	else
321 		fpin = sfstdin;
322 	if(cp = *argv)
323 	{
324 		argv++;
325 		if(!(fpout = sfopen(NiL,cp,"w")))
326 			error(ERROR_system(1),"%s: cannot create",cp);
327 	}
328 	else
329 		fpout = sfstdout;
330 	if(*argv)
331 	{
332 		error(2, "too many arguments");
333 		error(ERROR_usage(2), "%s", optusage(NiL));
334 	}
335 	error_info.errors = uniq(fpin,fpout,fields,chars,width,mode,all,compare);
336 	if(fpin!=sfstdin)
337 		sfclose(fpin);
338 	if(fpout!=sfstdout)
339 		sfclose(fpout);
340 	return(error_info.errors);
341 }
342 
343