xref: /illumos-gate/usr/src/contrib/ast/src/lib/libcmd/cut.c (revision b30d1939)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1992-2012 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                 Eclipse Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *          http://www.eclipse.org/org/documents/epl-v10.html           *
11 *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                                                                      *
20 ***********************************************************************/
21 #pragma prototyped
22 /*
23  * David Korn
24  * AT&T Bell Laboratories
25  *
26  * cut fields or columns from each line of a file
27  */
28 
29 static const char usage[] =
30 "[-?\n@(#)$Id: cut (AT&T Research) 2010-08-11 $\n]"
31 USAGE_LICENSE
32 "[+NAME?cut - cut out selected columns or fields of each line of a file]"
33 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34 	"from one or more files, contatenating them on standard output.]"
35 "[+?The option argument \alist\a is a comma-separated or blank-separated "
36 	"list of positive numbers and ranges.  Ranges can be of three "
37 	"forms.  The first is two positive integers separated by a hyphen "
38 	"(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39 	"\ahigh\a.  The second is a positive number preceded by a hyphen "
40 	"(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41 	"\ahigh\a.  The last is a positive number followed by a hyphen "
42 	"(\alow\a\b-\b), which represents all fields from \alow\a to the "
43 	"last field, inclusive.  Elements in the \alist\a can be repeated, "
44 	"can overlap, and can appear in any order.  The order of the "
45 	"output is that of the input.]"
46 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48         "cuts from standard input.   The start of the file is defined "
49         "as the current offset.]"
50 "[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51 "[c:characters]:[list?\bcut\b based on a list of character counts.]"
52 "[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53 	"to \adelim\a.  The default is the \btab\b character.]"
54 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55 	"character specified with the \b-d\b optiion.]"
56 "[n!:split?Split multibyte characters selected by the \b-b\b option.]"
57 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58 	"records of length \areclen\a when used with the \b-b\b or \b-c\b "
59 	"option.]"
60 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61 	"when used with the \b-f\b option.  By default, lines with no "
62 	"delimiters will be passsed in untouched.]"
63 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64 	"the \b-f\b option is set to \aldelim\a.  The default is the "
65 	"\bnewline\b character.]"
66 "[N!:newline?Output new-lines at end of each record when used "
67 	"with the \b-b\b or \b-c\b option.]"
68 "\n"
69 "\n[file ...]\n"
70 "\n"
71 "[+EXIT STATUS?]{"
72 	"[+0?All files processed successfully.]"
73 	"[+>0?One or more files failed to open or could not be read.]"
74 "}"
75 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
76 ;
77 
78 #include <cmd.h>
79 #include <ctype.h>
80 
/*
 * a delimiter as taken from the -d / -D option argument
 */
typedef struct Delim_s
{
	char	*str;	/* option argument text; written out when len > 1 */
	int	len;	/* delimiter size in bytes, 1 for a single byte */
	int	chr;	/* delimiter character value */
} Delim_t;

/*
 * everything cutcols() and cutfields() need to process a stream;
 * built once per invocation by cutinit()
 */
typedef struct Cut_s
{
	int	mb;		/* mbwide() result at init time */
	int	eob;		/* single byte line delimiter, else 0 */
	int	cflag;		/* C_CHARS requested and mb is set */
	int	nosplit;	/* C_BYTES and C_NOSPLIT requested and mb is set */
	int	sflag;		/* C_SUPRESS requested */
	int	nlflag;		/* C_NONEWLINE requested */
	int	reclen;		/* fixed record length, 0 for line records */
	Delim_t	wdelim;		/* field delimiter */
	Delim_t	ldelim;		/* line delimiter */
	unsigned char	space[UCHAR_MAX + 1];	/* byte => 0 or SP_* class */

	/*
	 * alternating skip and copy counts terminated by HUGE.
	 * cutinit() allocates extra ints directly behind the struct,
	 * so this member has to stay the last one
	 */
	int	list[2];
} Cut_t;
102 
/* list terminator; also the upper bound for positions and counts */
#define HUGE		INT_MAX
/* size handed to sftmp(); parenthesized so it is safe inside expressions */
#define BLOCK		(8*1024)

/* mode bits collected by b_cut() and decoded by cutinit() */
#define C_BYTES		1
#define C_CHARS		2
#define C_FIELDS	4
#define C_SUPRESS	8
#define C_NOSPLIT	16
#define C_NONEWLINE	32

/* byte classes stored in Cut_t.space[] */
#define SP_LINE		1
#define SP_WORD		2
#define SP_WIDE		3
115 
/*
 * qsort() comparison function: order two elements by the int each one
 * starts with; any ints that follow it are not looked at
 * returns -1, 0 or 1
 */

static int
mycomp(register const void* a, register const void* b)
{
	const int	left = *(const int*)a;
	const int	right = *(const int*)b;

	return (left > right) - (left < right);
}
129 
130 static Cut_t*
cutinit(int mode,char * str,Delim_t * wdelim,Delim_t * ldelim,size_t reclen)131 cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
132 {
133 	register int*	lp;
134 	register int	c;
135 	register int	n = 0;
136 	register int	range = 0;
137 	register char*	cp = str;
138 	Cut_t*		cut;
139 
140 	if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
141 		error(ERROR_exit(1), "out of space");
142 	if (cut->mb = mbwide())
143 	{
144 		memset(cut->space, 0, sizeof(cut->space) / 2);
145 		memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
146 	}
147 	else
148 		memset(cut->space, 0, sizeof(cut->space));
149 	cut->wdelim = *wdelim;
150 	if (wdelim->len == 1)
151 		cut->space[wdelim->chr] = SP_WORD;
152 	cut->ldelim = *ldelim;
153 	cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
154 	cut->space[cut->eob] = SP_LINE;
155 	cut->cflag = (mode&C_CHARS) && cut->mb;
156 	cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
157 	cut->sflag = (mode&C_SUPRESS) != 0;
158 	cut->nlflag = (mode&C_NONEWLINE) != 0;
159 	cut->reclen = reclen;
160 	lp = cut->list;
161 	for (;;)
162 		switch(c = *cp++)
163 		{
164 		case ' ':
165 		case '\t':
166 			while(*cp==' ' || *cp=='\t')
167 				cp++;
168 			/*FALLTHROUGH*/
169 		case 0:
170 		case ',':
171 			if(range)
172 			{
173 				--range;
174 				if((n = (n ? (n-range) : (HUGE-1))) < 0)
175 					error(ERROR_exit(1),"invalid range for c/f option");
176 				*lp++ = range;
177 				*lp++ = n;
178 			}
179 			else
180 			{
181 				*lp++ = --n;
182 				*lp++ = 1;
183 			}
184 			if(c==0)
185 			{
186 				register int *dp;
187 				*lp = HUGE;
188 				n = 1 + (lp-cut->list)/2;
189 				qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
190 				/* eliminate overlapping regions */
191 				for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
192 				{
193 					if(lp[0] <= range)
194 					{
195 						if(lp[1]==HUGE)
196 						{
197 							dp[-1] = HUGE;
198 							break;
199 						}
200 						if((c = lp[0]+lp[1]-range)>0)
201 						{
202 							range += c;
203 							dp[-1] += c;
204 						}
205 					}
206 					else
207 					{
208 						range = *dp++ = lp[0];
209 						if(lp[1]==HUGE)
210 						{
211 							*dp++ = HUGE;
212 							break;
213 						}
214 						range += (*dp++ = lp[1]);
215 					}
216 				}
217 				*dp = HUGE;
218 				lp = cut->list;
219 				/* convert ranges into gaps */
220 				for(n=0; *lp!=HUGE; lp+=2)
221 				{
222 					c = *lp;
223 					*lp -= n;
224 					n = c+lp[1];
225 				}
226 				return cut;
227 			}
228 			n = range = 0;
229 			break;
230 
231 		case '-':
232 			if(range)
233 				error(ERROR_exit(1),"bad list for c/f option");
234 			range = n?n:1;
235 			n = 0;
236 			break;
237 
238 		default:
239 			if(!isdigit(c))
240 				error(ERROR_exit(1),"bad list for c/f option");
241 			n = 10*n + (c-'0');
242 			break;
243 		}
244 	/* NOTREACHED */
245 }
246 
247 /*
248  * cut each line of file <fdin> and put results to <fdout> using list <list>
249  */
250 
251 static void
cutcols(Cut_t * cut,Sfio_t * fdin,Sfio_t * fdout)252 cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
253 {
254 	register int		c;
255 	register int		len;
256 	register int		ncol = 0;
257 	register const int*	lp = cut->list;
258 	register char*		bp;
259 	register int		skip; /* non-zero for don't copy */
260 	int			must;
261 	const char*		xx;
262 
263 	for (;;)
264 	{
265 		if (len = cut->reclen)
266 			bp = sfreserve(fdin, len, -1);
267 		else
268 			bp = sfgetr(fdin, '\n', 0);
269 		if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR)))
270 			break;
271 		len = sfvalue(fdin);
272 		xx = 0;
273 		if (!(ncol = skip  = *(lp = cut->list)))
274 			ncol = *++lp;
275 		must = 1;
276 		do
277 		{
278 			if (cut->nosplit)
279 			{
280 				register const char*	s = bp;
281 				register int		w = len < ncol ? len : ncol;
282 				register int		z;
283 
284 				while (w > 0)
285 				{
286 					if (!(*s & 0x80))
287 						z = 1;
288 					else if ((z = mbnsize(s, w)) <= 0)
289 					{
290 						if (s == bp && xx)
291 						{
292 							w += s - xx;
293 							bp = (char*)(s = xx);
294 							xx = 0;
295 							continue;
296 						}
297 						xx = s;
298 						if (skip)
299 							s += w;
300 						w = 0;
301 						break;
302 					}
303 					s += z;
304 					w -= z;
305 				}
306 				c = s - bp;
307 				ncol = !w && ncol >= len;
308 			}
309 			else if (cut->cflag)
310 			{
311 				register const char*	s = bp;
312 				register int		w = len;
313 				register int		z;
314 
315 				while (w > 0 && ncol > 0)
316 				{
317 					ncol--;
318 					if (!(*s & 0x80) || (z = mbnsize(s, w)) <= 0)
319 						z = 1;
320 					s += z;
321 					w -= z;
322 
323 				}
324 				c = s - bp;
325 				ncol = !w && (ncol || !skip);
326 			}
327 			else
328 			{
329 				if ((c = ncol) > len)
330 					c = len;
331 				else if (c == len && !skip)
332 					ncol++;
333 				ncol -= c;
334 			}
335 			if (!skip && c)
336 			{
337 				if (sfwrite(fdout, (char*)bp, c) < 0)
338 					return;
339 				must = 0;
340 			}
341 			bp += c;
342 			if (ncol)
343 				break;
344 			len -= c;
345 			ncol = *++lp;
346 			skip = !skip;
347 		} while (ncol != HUGE);
348 		if (!cut->nlflag && (skip || must || cut->reclen))
349 		{
350 			if (cut->ldelim.len > 1)
351 				sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
352 			else
353 				sfputc(fdout, cut->ldelim.chr);
354 		}
355 	}
356 }
357 
358 /*
359  * cut each line of file <fdin> and put results to <fdout> using list <list>
360  * stream <fdin> must be line buffered
361  */
362 
363 static void
cutfields(Cut_t * cut,Sfio_t * fdin,Sfio_t * fdout)364 cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
365 {
366 	register unsigned char *sp = cut->space;
367 	register unsigned char *cp;
368 	register unsigned char *wp;
369 	register int c, nfields;
370 	register const int *lp = cut->list;
371 	register unsigned char *copy;
372 	register int nodelim, empty, inword=0;
373 	register unsigned char *ep;
374 	unsigned char *bp, *first;
375 	int lastchar;
376 	wchar_t w;
377 	Sfio_t *fdtmp = 0;
378 	long offset = 0;
379 	unsigned char mb[8];
380 	/* process each buffer */
381 	while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
382 	{
383 		cp = bp;
384 		ep = cp + --c;
385 		if((lastchar = cp[c]) != cut->eob)
386 			*ep = cut->eob;
387 		/* process each line in the buffer */
388 		while (cp <= ep)
389 		{
390 			first = cp;
391 			if (!inword)
392 			{
393 				nodelim = empty = 1;
394 				copy = cp;
395 				if (nfields = *(lp = cut->list))
396 					copy = 0;
397 				else
398 					nfields = *++lp;
399 			}
400 			else if (copy)
401 				copy = cp;
402 			inword = 0;
403 			do
404 			{
405 				/* skip over non-delimiter characters */
406 				if (cut->mb)
407 					for (;;)
408 					{
409 						switch (c = sp[*(unsigned char*)cp++])
410 						{
411 						case 0:
412 							continue;
413 						case SP_WIDE:
414 							wp = --cp;
415 							while ((c = mb2wc(w, cp, ep - cp)) <= 0)
416 							{
417 								/* mb char possibly spanning buffer boundary -- fun stuff */
418 								if ((ep - cp) < mbmax())
419 								{
420 									int	i;
421 									int	j;
422 									int	k;
423 
424 									if (lastchar != cut->eob)
425 									{
426 										*ep = lastchar;
427 										if ((c = mb2wc(w, cp, ep - cp)) > 0)
428 											break;
429 									}
430 									if (copy)
431 									{
432 										empty = 0;
433 										if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
434 											goto failed;
435 									}
436 									for (i = 0; i <= (ep - cp); i++)
437 										mb[i] = cp[i];
438 									if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
439 										goto failed;
440 									cp = bp;
441 									ep = cp + --c;
442 									if ((lastchar = cp[c]) != cut->eob)
443 										*ep = cut->eob;
444 									j = i;
445 									k = 0;
446 									while (j < mbmax())
447 										mb[j++] = cp[k++];
448 									if ((c = mb2wc(w, (char*)mb, j)) <= 0)
449 									{
450 										c = i;
451 										w = 0;
452 									}
453 									first = bp = cp += c - i;
454 									if (copy)
455 									{
456 										copy = bp;
457 										if (w == cut->ldelim.chr)
458 											lastchar = cut->ldelim.chr;
459 										else if (w != cut->wdelim.chr)
460 										{
461 											empty = 0;
462 											if (sfwrite(fdout, (char*)mb, c) < 0)
463 												goto failed;
464 										}
465 									}
466 									c = 0;
467 								}
468 								else
469 								{
470 									w = *cp;
471 									c = 1;
472 								}
473 								break;
474 							}
475 							cp += c;
476 							c = w;
477 							if (c == cut->wdelim.chr)
478 							{
479 								c = SP_WORD;
480 								break;
481 							}
482 							if (c == cut->ldelim.chr)
483 							{
484 								c = SP_LINE;
485 								break;
486 							}
487 							continue;
488 						default:
489 							wp = cp - 1;
490 							break;
491 						}
492 						break;
493 					}
494 				else
495 				{
496 					while (!(c = sp[*cp++]));
497 					wp = cp - 1;
498 				}
499 				/* check for end-of-line */
500 				if (c == SP_LINE)
501 				{
502 					if (cp <= ep)
503 						break;
504 					if (lastchar == cut->ldelim.chr)
505 						break;
506 					/* restore cut->last character */
507 					if (lastchar != cut->eob)
508 						*ep = lastchar;
509 					inword++;
510 					if (!sp[lastchar])
511 						break;
512 				}
513 				nodelim = 0;
514 				if (--nfields > 0)
515 					continue;
516 				nfields = *++lp;
517 				if (copy)
518 				{
519 					empty = 0;
520 					if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
521 						goto failed;
522 					copy = 0;
523 				}
524 				else
525 					/* set to delimiter unless the first field */
526 					copy = empty ? cp : wp;
527 			} while (!inword);
528 			if (!inword)
529 			{
530 				if (!copy)
531 				{
532 					if (nodelim)
533 					{
534 						if (!cut->sflag)
535 						{
536 							if (offset)
537 							{
538 								sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
539 								sfmove(fdtmp,fdout,offset,-1);
540 							}
541 							copy = first;
542 						}
543 					}
544 					else
545 						sfputc(fdout,'\n');
546 				}
547 				if (offset)
548 					sfseek(fdtmp,offset=0,SEEK_SET);
549 			}
550 			if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
551 				goto failed;
552 		}
553 		/* see whether to save in tmp file */
554 		if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
555 		{
556 			/* copy line to tmpfile in case no fields */
557 			if(!fdtmp)
558 				fdtmp = sftmp(BLOCK);
559 			sfwrite(fdtmp,(char*)first,c);
560 			offset +=c;
561 		}
562 	}
563  failed:
564 	if(fdtmp)
565 		sfclose(fdtmp);
566 }
567 
568 int
b_cut(int argc,char ** argv,Shbltin_t * context)569 b_cut(int argc, char** argv, Shbltin_t* context)
570 {
571 	register char*		cp = 0;
572 	register Sfio_t*	fp;
573 	char*			s;
574 	int			n;
575 	Cut_t*			cut;
576 	int			mode = 0;
577 	Delim_t			wdelim;
578 	Delim_t			ldelim;
579 	size_t			reclen = 0;
580 
581 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
582 	wdelim.chr = '\t';
583 	ldelim.chr = '\n';
584 	wdelim.len = ldelim.len = 1;
585 	for (;;)
586 	{
587 		switch (optget(argv, usage))
588 		{
589 		case 0:
590 			break;
591 		case 'b':
592 		case 'c':
593 			if(mode&C_FIELDS)
594 			{
595 				error(2, "f option already specified");
596 				continue;
597 			}
598 			cp = opt_info.arg;
599 			if(opt_info.option[1]=='b')
600 				mode |= C_BYTES;
601 			else
602 				mode |= C_CHARS;
603 			continue;
604 		case 'D':
605 			ldelim.str = opt_info.arg;
606 			if (mbwide())
607 			{
608 				s = opt_info.arg;
609 				ldelim.chr = mbchar(s);
610 				if ((n = s - opt_info.arg) > 1)
611 				{
612 					ldelim.len = n;
613 					continue;
614 				}
615 			}
616 			ldelim.chr = *(unsigned char*)opt_info.arg;
617 			ldelim.len = 1;
618 			continue;
619 		case 'd':
620 			wdelim.str = opt_info.arg;
621 			if (mbwide())
622 			{
623 				s = opt_info.arg;
624 				wdelim.chr = mbchar(s);
625 				if ((n = s - opt_info.arg) > 1)
626 				{
627 					wdelim.len = n;
628 					continue;
629 				}
630 			}
631 			wdelim.chr = *(unsigned char*)opt_info.arg;
632 			wdelim.len = 1;
633 			continue;
634 		case 'f':
635 			if(mode&(C_CHARS|C_BYTES))
636 			{
637 				error(2, "c option already specified");
638 				continue;
639 			}
640 			cp = opt_info.arg;
641 			mode |= C_FIELDS;
642 			continue;
643 		case 'n':
644 			mode |= C_NOSPLIT;
645 			continue;
646 		case 'N':
647 			mode |= C_NONEWLINE;
648 			continue;
649 		case 'R':
650 		case 'r':
651 			if(opt_info.num>0)
652 				reclen = opt_info.num;
653 			continue;
654 		case 's':
655 			mode |= C_SUPRESS;
656 			continue;
657 		case ':':
658 			error(2, "%s", opt_info.arg);
659 			break;
660 		case '?':
661 			error(ERROR_usage(2), "%s", opt_info.arg);
662 			break;
663 		}
664 		break;
665 	}
666 	argv += opt_info.index;
667 	if (error_info.errors)
668 		error(ERROR_usage(2), "%s",optusage(NiL));
669 	if(!cp)
670 	{
671 		error(2, "b, c or f option must be specified");
672 		error(ERROR_usage(2), "%s", optusage(NiL));
673 	}
674 	if(!*cp)
675 		error(3, "non-empty b, c or f option must be specified");
676 	if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
677 		error(3, "s option requires f option");
678 	cut = cutinit(mode, cp, &wdelim, &ldelim, reclen);
679 	if(cp = *argv)
680 		argv++;
681 	do
682 	{
683 		if(!cp || streq(cp,"-"))
684 			fp = sfstdin;
685 		else if(!(fp = sfopen(NiL,cp,"r")))
686 		{
687 			error(ERROR_system(0),"%s: cannot open",cp);
688 			continue;
689 		}
690 		if(mode&C_FIELDS)
691 			cutfields(cut,fp,sfstdout);
692 		else
693 			cutcols(cut,fp,sfstdout);
694 		if(fp!=sfstdin)
695 			sfclose(fp);
696 	} while(cp = *argv++);
697 	if (sfsync(sfstdout))
698 		error(ERROR_system(0), "write error");
699 	return error_info.errors != 0;
700 }
701