xref: /illumos-gate/usr/src/contrib/ast/src/lib/libcmd/cut.c (revision 3e14f97f)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1992-2010 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                  Common Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *            http://www.opensource.org/licenses/cpl1.0.txt             *
11 *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                                                                      *
20 ***********************************************************************/
21 #pragma prototyped
/*
 * David Korn
 * AT&T Bell Laboratories
 *
 * cut selected bytes, characters, or fields from each line of a file
 */
28 
29 static const char usage[] =
30 "[-?\n@(#)$Id: cut (AT&T Research) 2009-12-04 $\n]"
31 USAGE_LICENSE
32 "[+NAME?cut - cut out selected columns or fields of each line of a file]"
33 "[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
34 	"from one or more files, contatenating them on standard output.]"
35 "[+?The option argument \alist\a is a comma-separated or blank-separated "
36 	"list of positive numbers and ranges.  Ranges can be of three "
37 	"forms.  The first is two positive integers separated by a hyphen "
38 	"(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
39 	"\ahigh\a.  The second is a positive number preceded by a hyphen "
40 	"(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
41 	"\ahigh\a.  The last is a positive number followed by a hyphen "
42 	"(\alow\a\b-\b), which represents all fields from \alow\a to the "
43 	"last field, inclusive.  Elements in the \alist\a can be repeated, "
44 	"can overlap, and can appear in any order.  The order of the "
45 	"output is that of the input.]"
46 "[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
47 "[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
48         "cuts from standard input.   The start of the file is defined "
49         "as the current offset.]"
50 "[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
51 "[c:characters]:[list?\bcut\b based on a list of character counts.]"
52 "[d:delimiter]:[delim?The field character for the \b-f\b option is set "
53 	"to \adelim\a.  The default is the \btab\b character.]"
54 "[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
55 	"character specified with the \b-d\b optiion.]"
56 "[n!:split?Split multibyte characters selected by the \b-b\b option.]"
57 "[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
58 	"records of length \areclen\a when used with the \b-b\b or \b-c\b "
59 	"option.]"
60 "[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
61 	"when used with the \b-f\b option.  By default, lines with no "
62 	"delimiters will be passsed in untouched.]"
63 "[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
64 	"the \b-f\b option is set to \aldelim\a.  The default is the "
65 	"\bnewline\b character.]"
66 "[N!:newline?Output new-lines at end of each record when used "
67 	"with the \b-b\b or \b-c\b option.]"
68 "\n"
69 "\n[file ...]\n"
70 "\n"
71 "[+EXIT STATUS?]{"
72 	"[+0?All files processed successfully.]"
73 	"[+>0?One or more files failed to open or could not be read.]"
74 "}"
75 "[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
76 ;
77 
78 #include <cmd.h>
79 #include <ctype.h>
80 
/*
 * One delimiter (field or line) as selected on the command line.
 */
typedef struct Delim_s
{
	char	*str;	/* option argument text; only set when -d / -D is given */
	int	len;	/* number of bytes in the delimiter, 1 by default */
	int	chr;	/* the delimiter as a character code */
} Delim_t;

/*
 * State shared by cutinit(), cutcols() and cutfields().
 */
typedef struct Cut_s
{
	int	mb;		/* value returned by mbwide() */
	int	eob;		/* single byte line delimiter, or 0 if it is multibyte */
	int	cflag;		/* C_CHARS requested and mb is set */
	int	nosplit;	/* C_BYTES and C_NOSPLIT requested and mb is set */
	int	sflag;		/* C_SUPRESS requested */
	int	nlflag;		/* C_NONEWLINE requested */
	int	reclen;		/* fixed record length, 0 for line oriented input */
	Delim_t	wdelim;		/* field delimiter */
	Delim_t	ldelim;		/* line delimiter */
	unsigned char	space[UCHAR_MAX + 1];	/* per byte class: 0 or one of SP_* */
	/*
	 * Alternating counts built by cutinit() and ended by HUGE.
	 * cutinit() allocates extra room directly behind the structure,
	 * which is why this array has to remain the final member.
	 */
	int	list[2];	/* NOTE: must be last member */
} Cut_t;
102 
#define HUGE		INT_MAX		/* terminator value of Cut_t.list */
#define BLOCK		(8*1024)	/* size argument given to sftmp() */

/* mode bits collected by b_cut() and decoded by cutinit() */
#define C_BYTES		1
#define C_CHARS		2
#define C_FIELDS	4
#define C_SUPRESS	8
#define C_NOSPLIT	16
#define C_NONEWLINE	32

/* byte classes stored in Cut_t.space */
#define SP_LINE		1
#define SP_WORD		2
#define SP_WIDE		3

/* convert the multibyte character at p (at most n bytes) into wchar_t w */
#define mb2wc(w,p,n)	(*ast.mb_towc)(&(w),(char*)(p),(n))
117 
/*
 * qsort() callback: order two entries by the int each one starts with
 * returns -1, 0 or 1
 */

static int
mycomp(const void* a, const void* b)
{
	const int	left = *(const int*)a;
	const int	right = *(const int*)b;

	return (left > right) - (left < right);
}
131 
132 static Cut_t*
133 cutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
134 {
135 	register int*	lp;
136 	register int	c;
137 	register int	n = 0;
138 	register int	range = 0;
139 	register char*	cp = str;
140 	Cut_t*		cut;
141 
142 	if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
143 		error(ERROR_exit(1), "out of space");
144 	if (cut->mb = mbwide())
145 	{
146 		memset(cut->space, 0, sizeof(cut->space) / 2);
147 		memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
148 	}
149 	else
150 		memset(cut->space, 0, sizeof(cut->space));
151 	cut->wdelim = *wdelim;
152 	if (wdelim->len == 1)
153 		cut->space[wdelim->chr] = SP_WORD;
154 	cut->ldelim = *ldelim;
155 	cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
156 	cut->space[cut->eob] = SP_LINE;
157 	cut->cflag = (mode&C_CHARS) && cut->mb;
158 	cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
159 	cut->sflag = (mode&C_SUPRESS) != 0;
160 	cut->nlflag = (mode&C_NONEWLINE) != 0;
161 	cut->reclen = reclen;
162 	lp = cut->list;
163 	for (;;)
164 		switch(c = *cp++)
165 		{
166 		case ' ':
167 		case '\t':
168 			while(*cp==' ' || *cp=='\t')
169 				cp++;
170 			/*FALLTHROUGH*/
171 		case 0:
172 		case ',':
173 			if(range)
174 			{
175 				--range;
176 				if((n = (n ? (n-range) : (HUGE-1))) < 0)
177 					error(ERROR_exit(1),"invalid range for c/f option");
178 				*lp++ = range;
179 				*lp++ = n;
180 			}
181 			else
182 			{
183 				*lp++ = --n;
184 				*lp++ = 1;
185 			}
186 			if(c==0)
187 			{
188 				register int *dp;
189 				*lp = HUGE;
190 				n = 1 + (lp-cut->list)/2;
191 				qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
192 				/* eliminate overlapping regions */
193 				for(n=0,range= -2,dp=lp; *lp!=HUGE; lp+=2)
194 				{
195 					if(lp[0] <= range)
196 					{
197 						if(lp[1]==HUGE)
198 						{
199 							dp[-1] = HUGE;
200 							break;
201 						}
202 						if((c = lp[0]+lp[1]-range)>0)
203 						{
204 							range += c;
205 							dp[-1] += c;
206 						}
207 					}
208 					else
209 					{
210 						range = *dp++ = lp[0];
211 						if(lp[1]==HUGE)
212 						{
213 							*dp++ = HUGE;
214 							break;
215 						}
216 						range += (*dp++ = lp[1]);
217 					}
218 				}
219 				*dp = HUGE;
220 				lp = cut->list;
221 				/* convert ranges into gaps */
222 				for(n=0; *lp!=HUGE; lp+=2)
223 				{
224 					c = *lp;
225 					*lp -= n;
226 					n = c+lp[1];
227 				}
228 				return cut;
229 			}
230 			n = range = 0;
231 			break;
232 
233 		case '-':
234 			if(range)
235 				error(ERROR_exit(1),"bad list for c/f option");
236 			range = n?n:1;
237 			n = 0;
238 			break;
239 
240 		default:
241 			if(!isdigit(c))
242 				error(ERROR_exit(1),"bad list for c/f option");
243 			n = 10*n + (c-'0');
244 			break;
245 		}
246 	/* NOTREACHED */
247 }
248 
249 /*
250  * cut each line of file <fdin> and put results to <fdout> using list <list>
251  */
252 
253 static void
254 cutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
255 {
256 	register int		c;
257 	register int		len;
258 	register int		ncol = 0;
259 	register const int*	lp = cut->list;
260 	register char*		bp;
261 	register int		skip; /* non-zero for don't copy */
262 	int			must;
263 	char*			ep;
264 	const char*		xx;
265 
266 	for (;;)
267 	{
268 		if (len = cut->reclen)
269 			bp = sfreserve(fdin, len, -1);
270 		else
271 			bp = sfgetr(fdin, '\n', 0);
272 		if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR)))
273 			break;
274 		len = sfvalue(fdin);
275 		ep = bp + len;
276 		xx = 0;
277 		if (!(ncol = skip  = *(lp = cut->list)))
278 			ncol = *++lp;
279 		must = 1;
280 		do
281 		{
282 			if (cut->nosplit)
283 			{
284 				register const char*	s = bp;
285 				register int		w = len < ncol ? len : ncol;
286 				register int		z;
287 
288 				while (w > 0)
289 				{
290 					if (!(*s & 0x80))
291 						z = 1;
292 					else if ((z = mblen(s, w)) <= 0)
293 					{
294 						if (s == bp && xx)
295 						{
296 							w += s - xx;
297 							bp = (char*)(s = xx);
298 							xx = 0;
299 							continue;
300 						}
301 						xx = s;
302 						if (skip)
303 							s += w;
304 						w = 0;
305 						break;
306 					}
307 					s += z;
308 					w -= z;
309 				}
310 				c = s - bp;
311 				ncol = !w && ncol >= len;
312 			}
313 			else if (cut->cflag)
314 			{
315 				register const char*	s = bp;
316 				register int		w = len;
317 				register int		z;
318 
319 				while (w > 0 && ncol > 0)
320 				{
321 					ncol--;
322 					if (!(*s & 0x80) || (z = mblen(s, w)) <= 0)
323 						z = 1;
324 					s += z;
325 					w -= z;
326 
327 				}
328 				c = s - bp;
329 				ncol = !w && (ncol || !skip);
330 			}
331 			else
332 			{
333 				if ((c = ncol) > len)
334 					c = len;
335 				else if (c == len && !skip)
336 					ncol++;
337 				ncol -= c;
338 			}
339 			if (!skip && c)
340 			{
341 				if (sfwrite(fdout, (char*)bp, c) < 0)
342 					return;
343 				must = 0;
344 			}
345 			bp += c;
346 			if (ncol)
347 				break;
348 			len -= c;
349 			ncol = *++lp;
350 			skip = !skip;
351 		} while (ncol != HUGE);
352 		if (!cut->nlflag && (skip || must || cut->reclen))
353 		{
354 			if (cut->ldelim.len > 1)
355 				sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
356 			else
357 				sfputc(fdout, cut->ldelim.chr);
358 		}
359 	}
360 }
361 
362 /*
363  * cut each line of file <fdin> and put results to <fdout> using list <list>
364  * stream <fdin> must be line buffered
365  */
366 
367 static void
368 cutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
369 {
370 	register unsigned char *sp = cut->space;
371 	register unsigned char *cp;
372 	register unsigned char *wp;
373 	register int c, nfields;
374 	register const int *lp = cut->list;
375 	register unsigned char *copy;
376 	register int nodelim, empty, inword=0;
377 	register unsigned char *ep;
378 	unsigned char *bp, *first;
379 	int lastchar;
380 	wchar_t w;
381 	Sfio_t *fdtmp = 0;
382 	long offset = 0;
383 	unsigned char mb[8];
384 	/* process each buffer */
385 	while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
386 	{
387 		cp = bp;
388 		ep = cp + --c;
389 		if((lastchar = cp[c]) != cut->eob)
390 			*ep = cut->eob;
391 		/* process each line in the buffer */
392 		while (cp <= ep)
393 		{
394 			first = cp;
395 			if (!inword)
396 			{
397 				nodelim = empty = 1;
398 				copy = cp;
399 				if (nfields = *(lp = cut->list))
400 					copy = 0;
401 				else
402 					nfields = *++lp;
403 			}
404 			else if (copy)
405 				copy = cp;
406 			inword = 0;
407 			do
408 			{
409 				/* skip over non-delimiter characters */
410 				if (cut->mb)
411 					for (;;)
412 					{
413 						switch (c = sp[*(unsigned char*)cp++])
414 						{
415 						case 0:
416 							continue;
417 						case SP_WIDE:
418 							wp = --cp;
419 							while ((c = mb2wc(w, cp, ep - cp)) <= 0)
420 							{
421 								/* mb char possibly spanning buffer boundary -- fun stuff */
422 								if ((ep - cp) < mbmax())
423 								{
424 									int	i;
425 									int	j;
426 									int	k;
427 
428 									if (lastchar != cut->eob)
429 									{
430 										*ep = lastchar;
431 										if ((c = mb2wc(w, cp, ep - cp)) > 0)
432 											break;
433 									}
434 									if (copy)
435 									{
436 										empty = 0;
437 										if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
438 											goto failed;
439 									}
440 									for (i = 0; i <= (ep - cp); i++)
441 										mb[i] = cp[i];
442 									if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
443 										goto failed;
444 									cp = bp;
445 									ep = cp + --c;
446 									if ((lastchar = cp[c]) != cut->eob)
447 										*ep = cut->eob;
448 									j = i;
449 									k = 0;
450 									while (j < mbmax())
451 										mb[j++] = cp[k++];
452 									if ((c = mb2wc(w, (char*)mb, j)) <= 0)
453 									{
454 										c = i;
455 										w = 0;
456 									}
457 									first = bp = cp += c - i;
458 									if (copy)
459 									{
460 										copy = bp;
461 										if (w == cut->ldelim.chr)
462 											lastchar = cut->ldelim.chr;
463 										else if (w != cut->wdelim.chr)
464 										{
465 											empty = 0;
466 											if (sfwrite(fdout, (char*)mb, c) < 0)
467 												goto failed;
468 										}
469 									}
470 									c = 0;
471 								}
472 								else
473 								{
474 									w = *cp;
475 									c = 1;
476 								}
477 								break;
478 							}
479 							cp += c;
480 							c = w;
481 							if (c == cut->wdelim.chr)
482 							{
483 								c = SP_WORD;
484 								break;
485 							}
486 							if (c == cut->ldelim.chr)
487 							{
488 								c = SP_LINE;
489 								break;
490 							}
491 							continue;
492 						default:
493 							wp = cp - 1;
494 							break;
495 						}
496 						break;
497 					}
498 				else
499 				{
500 					while (!(c = sp[*cp++]));
501 					wp = cp - 1;
502 				}
503 				/* check for end-of-line */
504 				if (c == SP_LINE)
505 				{
506 					if (cp <= ep)
507 						break;
508 					if (lastchar == cut->ldelim.chr)
509 						break;
510 					/* restore cut->last character */
511 					if (lastchar != cut->eob)
512 						*ep = lastchar;
513 					inword++;
514 					if (!sp[lastchar])
515 						break;
516 				}
517 				nodelim = 0;
518 				if (--nfields > 0)
519 					continue;
520 				nfields = *++lp;
521 				if (copy)
522 				{
523 					empty = 0;
524 					if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
525 						goto failed;
526 					copy = 0;
527 				}
528 				else
529 					/* set to delimiter unless the first field */
530 					copy = empty ? cp : wp;
531 			} while (!inword);
532 			if (!inword)
533 			{
534 				if (!copy)
535 				{
536 					if (nodelim)
537 					{
538 						if (!cut->sflag)
539 						{
540 							if (offset)
541 							{
542 								sfseek(fdtmp,(Sfoff_t)0,SEEK_SET);
543 								sfmove(fdtmp,fdout,offset,-1);
544 							}
545 							copy = first;
546 						}
547 					}
548 					else
549 						sfputc(fdout,'\n');
550 				}
551 				if (offset)
552 					sfseek(fdtmp,offset=0,SEEK_SET);
553 			}
554 			if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
555 				goto failed;
556 		}
557 		/* see whether to save in tmp file */
558 		if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
559 		{
560 			/* copy line to tmpfile in case no fields */
561 			if(!fdtmp)
562 				fdtmp = sftmp(BLOCK);
563 			sfwrite(fdtmp,(char*)first,c);
564 			offset +=c;
565 		}
566 	}
567  failed:
568 	if(fdtmp)
569 		sfclose(fdtmp);
570 }
571 
572 int
573 b_cut(int argc, char** argv, void* context)
574 {
575 	register char*		cp = 0;
576 	register Sfio_t*	fp;
577 	char*			s;
578 	int			n;
579 	Cut_t*			cut;
580 	int			mode = 0;
581 	Delim_t			wdelim;
582 	Delim_t			ldelim;
583 	size_t			reclen = 0;
584 
585 	cmdinit(argc, argv, context, ERROR_CATALOG, 0);
586 	wdelim.chr = '\t';
587 	ldelim.chr = '\n';
588 	wdelim.len = ldelim.len = 1;
589 	for (;;)
590 	{
591 		switch (n = optget(argv, usage))
592 		{
593 		case 0:
594 			break;
595 		case 'b':
596 		case 'c':
597 			if(mode&C_FIELDS)
598 			{
599 				error(2, "f option already specified");
600 				continue;
601 			}
602 			cp = opt_info.arg;
603 			if(n=='b')
604 				mode |= C_BYTES;
605 			else
606 				mode |= C_CHARS;
607 			continue;
608 		case 'D':
609 			ldelim.str = opt_info.arg;
610 			if (mbwide())
611 			{
612 				s = opt_info.arg;
613 				ldelim.chr = mbchar(s);
614 				if ((n = s - opt_info.arg) > 1)
615 				{
616 					ldelim.len = n;
617 					continue;
618 				}
619 			}
620 			ldelim.chr = *(unsigned char*)opt_info.arg;
621 			ldelim.len = 1;
622 			continue;
623 		case 'd':
624 			wdelim.str = opt_info.arg;
625 			if (mbwide())
626 			{
627 				s = opt_info.arg;
628 				wdelim.chr = mbchar(s);
629 				if ((n = s - opt_info.arg) > 1)
630 				{
631 					wdelim.len = n;
632 					continue;
633 				}
634 			}
635 			wdelim.chr = *(unsigned char*)opt_info.arg;
636 			wdelim.len = 1;
637 			continue;
638 		case 'f':
639 			if(mode&(C_CHARS|C_BYTES))
640 			{
641 				error(2, "c option already specified");
642 				continue;
643 			}
644 			cp = opt_info.arg;
645 			mode |= C_FIELDS;
646 			continue;
647 		case 'n':
648 			mode |= C_NOSPLIT;
649 			continue;
650 		case 'N':
651 			mode |= C_NONEWLINE;
652 			continue;
653 		case 'R':
654 		case 'r':
655 			if(opt_info.num>0)
656 				reclen = opt_info.num;
657 			continue;
658 		case 's':
659 			mode |= C_SUPRESS;
660 			continue;
661 		case ':':
662 			error(2, "%s", opt_info.arg);
663 			break;
664 		case '?':
665 			error(ERROR_usage(2), "%s", opt_info.arg);
666 			break;
667 		}
668 		break;
669 	}
670 	argv += opt_info.index;
671 	if (error_info.errors)
672 		error(ERROR_usage(2), "%s",optusage(NiL));
673 	if(!cp)
674 	{
675 		error(2, "b, c or f option must be specified");
676 		error(ERROR_usage(2), "%s", optusage(NiL));
677 	}
678 	if(!*cp)
679 		error(3, "non-empty b, c or f option must be specified");
680 	if((mode & (C_FIELDS|C_SUPRESS)) == C_SUPRESS)
681 		error(3, "s option requires f option");
682 	cut = cutinit(mode, cp, &wdelim, &ldelim, reclen);
683 	if(cp = *argv)
684 		argv++;
685 	do
686 	{
687 		if(!cp || streq(cp,"-"))
688 			fp = sfstdin;
689 		else if(!(fp = sfopen(NiL,cp,"r")))
690 		{
691 			error(ERROR_system(0),"%s: cannot open",cp);
692 			continue;
693 		}
694 		if(mode&C_FIELDS)
695 			cutfields(cut,fp,sfstdout);
696 		else
697 			cutcols(cut,fp,sfstdout);
698 		if(fp!=sfstdin)
699 			sfclose(fp);
700 	} while(cp = *argv++);
701 	if (sfsync(sfstdout))
702 		error(ERROR_system(0), "write error");
703 	return error_info.errors != 0;
704 }
705