xref: /illumos-gate/usr/src/cmd/sort/options.c (revision 101e15b5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include "options.h"
28 
/*
 * options
 *
 * Overview
 *   sort(1) supports two methods for specifying the sort key:  the original,
 *   now-obsolete, +n -m form and the POSIX -k n,m form.  We refer to the former
 *   as "old specifiers" and the latter as "new specifiers".  The options()
 *   function parses the command line arguments given to sort, placing the sort
 *   key specifiers in the internal representation used in fields.c.
 *
 * Equivalence of specifiers
 *   One of sort(1)'s standard peculiarities is the transformation of the
 *   character offsets and field numbering between the new and old style field
 *   specifications.  We simply quote from the Single UNIX Specification:
 *
 *	+w.xT -y.zU
 *
 *   is equivalent to
 *
 *	undefined		when z == 0, U contains b, and -t is set
 *	-k w+1.x+1T,y.0U	when z == 0 otherwise
 *	-k w+1.x+1T,y+1.zU	when z > 0
 *
 *   Undoubtedly, this seemed logical at the time.  (Using only the field head
 *   as the coordinate, as done in the obsolete version, seems much simpler.)
 *   The reverse map is where the key specifier
 *
 *	-k w.xT,y.zU
 *
 *   is equivalent to
 *
 *	undefined		when z == 0, U contains b, and -t is set
 *	+w-1.x-1T -y.0U		when z == 0 otherwise
 *	+w-1.x-1T -y-1.zU	when z > 0
 *
 *   in the obsolete syntax.  Because the original key specifiers lead to a
 *   simpler implementation, the internal representation of a field in this
 *   implementation of sort is mostly that given by the obsolete syntax.
 */
68 
/*
 * While a key specifier in the obsolete +m ... -n form is being defined (that
 * is, before the closing -n is seen), a narrower set of options is permitted.
 * We specify this smaller set of options in OLD_SPEC_OPTIONS_STRING; it is
 * OPTIONS_STRING without the digit options, since a leading -<digit> argument
 * is handled by hand as the closing half of the old specifier.
 */
#define	OPTIONS_STRING		"cmuo:T:z:dfiMnrbt:k:S:0123456789"
#define	OLD_SPEC_OPTIONS_STRING	"bdfiMnrcmuo:T:z:t:k:S:"

/*
 * Flags passed to parse_field_spec() describing which half of which kind of
 * specifier is being parsed.
 */
#define	OPTIONS_OLDSPEC		0x1	/* else new-style spec */
#define	OPTIONS_STARTSPEC	0x2	/* else end spec */
79 
/*
 * is_number() returns 1 if every byte of the NUL-terminated string C is a
 * decimal digit, and 0 as soon as a non-digit byte is found.  The empty string
 * contains no non-digit bytes and so also yields 1.
 */
static int
is_number(char *C)
{
	const char *p;

	/*
	 * Walk to the terminator directly; there is no need to recompute the
	 * string length on every iteration.
	 */
	for (p = C; *p != '\0'; p++) {
		if (!isdigit((unsigned char)*p))
			return (0);
	}

	return (1);
}
91 
/*
 * If a field specified by the -k option or by the +n syntax contains any
 * modifiers, then the current global field modifiers are not inherited.
 *
 * field_spec_has_modifiers() returns 0 when the leading run of C consisting
 * only of digits, periods and commas is exactly length bytes long, and 1
 * otherwise.  Callers pass the length of the specifier text, so a return of 1
 * means that something other than positional syntax is present.
 */
static int
field_spec_has_modifiers(char *C, int length)
{
	return ((int)strspn(C, ",.1234567890") != length);
}
106 
107 static void
field_apply_all(field_t * fc,flag_t flags)108 field_apply_all(field_t *fc, flag_t flags)
109 {
110 	field_t *f;
111 
112 	for (f = fc; f; f = f->f_next)
113 		if ((f->f_options & FIELD_MODIFIERS_DEFINED) == 0)
114 			f->f_options |= flags;
115 }
116 
117 static int
parse_field_spec(field_t * F,char * C,int flags,int length)118 parse_field_spec(field_t *F, char *C, int flags, int length)
119 {
120 	int p_period = MIN(length, strcspn(C, "."));
121 	int p_modifiers = MIN(length, strspn(C, ".1234567890"));
122 	int p_boundary = MIN(p_period, p_modifiers);
123 	int field = 0;
124 	int offset = 0;
125 	int offset_seen = 0;
126 	int i;
127 	int blanks_flag = 0;
128 
129 	for (i = 0; i < p_boundary; i++) {
130 		if (isdigit((uchar_t)C[i]))
131 			field = (10 * field) + (C[i] - '0');
132 		else
133 			return (1);
134 	}
135 
136 	if (p_period < p_modifiers) {
137 		for (i = p_period + 1; i < p_modifiers; i++) {
138 			if (isdigit((uchar_t)C[i])) {
139 				offset_seen++;
140 				offset = (10 * offset) + (C[i] - '0');
141 			} else {
142 				return (1);
143 			}
144 		}
145 	}
146 
147 	if (p_modifiers < length) {
148 		for (i = p_modifiers; i < length; i++) {
149 			switch (C[i]) {
150 				case 'b':
151 					blanks_flag = 1;
152 					break;
153 				case 'd':
154 					F->f_options |= FIELD_DICTIONARY_ORDER;
155 					break;
156 				case 'f':
157 					F->f_options |= FIELD_FOLD_UPPERCASE;
158 					break;
159 				case 'i':
160 					F->f_options |=
161 					    FIELD_IGNORE_NONPRINTABLES;
162 					break;
163 				case 'M':
164 					F->f_species = MONTH;
165 					break;
166 				case 'n':
167 					F->f_species = NUMERIC;
168 					break;
169 				case 'r':
170 					F->f_options |=
171 					    FIELD_REVERSE_COMPARISONS;
172 					break;
173 				default:
174 					usage();
175 					break;
176 			}
177 		}
178 	}
179 
180 	if (flags & OPTIONS_STARTSPEC) {
181 		F->f_start_field = field;
182 		F->f_start_offset = offset;
183 		if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC) {
184 			F->f_start_field--;
185 			if (offset_seen)
186 				F->f_start_offset--;
187 		}
188 		F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_START : 0;
189 	} else {
190 		F->f_end_field = field;
191 		F->f_end_offset = offset;
192 		if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC &&
193 		    offset_seen && offset != 0)
194 			F->f_end_field--;
195 		F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_END : 0;
196 	}
197 
198 	return (0);
199 }
200 
201 static void
parse_new_field_spec(sort_t * S,char * arg)202 parse_new_field_spec(sort_t *S, char *arg)
203 {
204 	int length = strlen(arg);
205 	int p_comma = MIN(length, strcspn(arg, ","));
206 	field_t *nF;
207 	int p;
208 
209 	/*
210 	 * New field specifiers do not inherit from the general specifier if
211 	 * they have any modifiers set.  (This is specifically tested in the VSC
212 	 * test suite, assertion 32 for POSIX.cmd/sort.)
213 	 */
214 	if (field_spec_has_modifiers(arg, length)) {
215 		nF = field_new(NULL);
216 		nF->f_options = FIELD_MODIFIERS_DEFINED;
217 	} else {
218 		nF = field_new(S);
219 	}
220 	p = parse_field_spec(nF, arg, OPTIONS_STARTSPEC, p_comma);
221 
222 	if (p != 0)
223 		usage();
224 
225 	if (p_comma < length) {
226 		p = parse_field_spec(nF, &(arg[p_comma + 1]), 0,
227 		    strlen(&(arg[p_comma + 1])));
228 		if (p != 0)
229 			usage();
230 	}
231 
232 	if (nF->f_start_field < 0 || nF->f_start_offset < 0) {
233 		if (S->m_verbose)
234 			warn("-k %s is not a supported field specifier\n", arg);
235 	}
236 	nF->f_start_field = MAX(nF->f_start_field, 0);
237 	nF->f_start_offset = MAX(nF->f_start_offset, 0);
238 
239 	/*
240 	 * If the starting field exceeds a defined ending field, convention
241 	 * dictates that the field is ignored.
242 	 */
243 	if (nF->f_end_field == -1 || nF->f_start_field < nF->f_end_field ||
244 	    (nF->f_start_field == nF->f_end_field &&
245 	    nF->f_start_offset < nF->f_end_offset)) {
246 		field_add_to_chain(&(S->m_fields_head), nF);
247 	} else if (S->m_verbose) {
248 		warn("illegal field -k %s omitted", arg);
249 	}
250 }
251 
252 /*
253  * parse_old_field_spec() is getopt()-aware; it may modify the values of optind,
254  * optarg, and so forth, to correctly determine the characteristics being
255  * assigned to the current field.
256  */
257 static int
parse_old_field_spec(sort_t * S,int argc,char * argv[])258 parse_old_field_spec(sort_t *S, int argc, char *argv[])
259 {
260 	field_t *nF;
261 	int c, p;
262 	char *arg = argv[optind];
263 
264 	if (field_spec_has_modifiers(arg + 1, strlen(arg + 1))) {
265 		nF = field_new(NULL);
266 		nF->f_options = FIELD_MODIFIERS_DEFINED;
267 	} else {
268 		nF = field_new(S);
269 	}
270 
271 	p = parse_field_spec(nF, arg + 1, OPTIONS_OLDSPEC | OPTIONS_STARTSPEC,
272 	    strlen(arg + 1));
273 
274 	if (p != 0) {
275 		field_delete(nF);
276 		return (0);
277 	}
278 
279 	/*
280 	 * In the case that getopt() returns '?' (unrecognized option) or EOF
281 	 * (non-option argument), the field is considered closed.
282 	 */
283 	for (arg = argv[++optind]; optind < argc; arg = argv[optind]) {
284 		if (strlen(arg) >= 2 && *arg == '-' &&
285 		    isdigit(*(uchar_t *)(arg + 1))) {
286 			(void) parse_field_spec(nF, arg + 1,
287 			    OPTIONS_OLDSPEC, strlen(arg) - 1);
288 			field_add_to_chain(&(S->m_fields_head), nF);
289 			optind++;
290 			return (1);
291 		}
292 
293 		if ((c = getopt(argc, argv, OLD_SPEC_OPTIONS_STRING)) != EOF) {
294 			switch (c) {
295 			case 'b':
296 				nF->f_options |= FIELD_IGNORE_BLANKS_START;
297 				break;
298 			case 'd':
299 				nF->f_options |= FIELD_DICTIONARY_ORDER;
300 				break;
301 			case 'f':
302 				nF->f_options |= FIELD_FOLD_UPPERCASE;
303 				break;
304 			case 'i':
305 				nF->f_options |= FIELD_IGNORE_NONPRINTABLES;
306 				break;
307 			case 'M':
308 				nF->f_species = MONTH;
309 				break;
310 			case 'n':
311 				nF->f_species = NUMERIC;
312 				break;
313 			case 'r':
314 				nF->f_options |= FIELD_REVERSE_COMPARISONS;
315 				break;
316 			case '?':
317 			case 'c':
318 			case 'm':
319 			case 'u':
320 				/*
321 				 * Options without arguments.
322 				 */
323 				optind -= 1;
324 				field_add_to_chain(&(S->m_fields_head), nF);
325 				return (1);
326 				/*NOTREACHED*/
327 			case 'o':
328 			case 'T':
329 			case 'z':
330 			case 't':
331 			case 'k':
332 			case 'S':
333 				/*
334 				 * Options with arguments.
335 				 */
336 				if (optarg == argv[optind - 1] + 2) {
337 					optind -= 1;
338 				} else {
339 					optind -= 2;
340 				}
341 				field_add_to_chain(&(S->m_fields_head), nF);
342 				return (1);
343 				/*NOTREACHED*/
344 			default:
345 				die(EMSG_UNKN_OPTION);
346 				/*NOTREACHED*/
347 			}
348 		} else {
349 			break;
350 		}
351 	}
352 
353 	field_add_to_chain(&(S->m_fields_head), nF);
354 	return (1);
355 }
356 
357 int
options(sort_t * S,int argc,char * argv[])358 options(sort_t *S, int argc, char *argv[])
359 {
360 	int c;
361 
362 	optind = 1;
363 	while (optind < argc) {
364 		if (strncmp("-y", argv[optind], strlen("-y")) == 0) {
365 			/*
366 			 * The -y [kmem] option violates the standard syntax
367 			 * outlined in intro(1).  we have to be a little fancy
368 			 * to determine if the next argument is a valid integer.
369 			 * (note, of course, that the previous sort(1) had no
370 			 * mechanism to resolve a final
371 			 *	-y 99999
372 			 * into
373 			 *	-y, file 99999
374 			 * or
375 			 *	-y 99999, file stdin
376 			 *
377 			 * Now one can unambiguously use
378 			 *	-y -- 99999
379 			 * and
380 			 *	-y 99999 -
381 			 * to distinguish these cases.
382 			 *
383 			 * That said, we do not use the information passed using
384 			 * -y option in sort(1); we provide the argument to
385 			 * preserve compatibility for existing scripts.
386 			 */
387 			if (strlen(argv[optind]) == strlen("-y") &&
388 			    optind + 1 < argc &&
389 			    is_number(argv[optind + 1]))
390 				optind += 2;
391 			else
392 				optind += 1;
393 		}
394 
395 		if ((c = getopt(argc, argv, OPTIONS_STRING)) != EOF) {
396 			switch (c) {
397 			case 'c':
398 				S->m_check_if_sorted_only = 1;
399 				break;
400 
401 			case 'm':
402 				S->m_merge_only = 1;
403 				break;
404 
405 			case 'u':
406 				S->m_unique_lines = 1;
407 				break;
408 
409 			case 'o':
410 				S->m_output_filename = optarg;
411 				break;
412 
413 			case 'T':
414 				S->m_tmpdir_template = optarg;
415 				break;
416 
417 			case 'z':
418 				/*
419 				 * ignore optarg -- obsolete
420 				 */
421 				break;
422 
423 			case 'd':
424 				S->m_field_options |= FIELD_DICTIONARY_ORDER;
425 				field_apply_all(S->m_fields_head,
426 				    FIELD_DICTIONARY_ORDER);
427 				break;
428 
429 			case 'f':
430 				S->m_field_options |= FIELD_FOLD_UPPERCASE;
431 				field_apply_all(S->m_fields_head,
432 				    FIELD_FOLD_UPPERCASE);
433 				break;
434 
435 			case 'i':
436 				S->m_field_options |=
437 				    FIELD_IGNORE_NONPRINTABLES;
438 				field_apply_all(S->m_fields_head,
439 				    FIELD_IGNORE_NONPRINTABLES);
440 				break;
441 
442 			case 'M':
443 				S->m_default_species = MONTH;
444 				S->m_field_options &=
445 				    ~FIELD_IGNORE_BLANKS_START;
446 				break;
447 
448 			case 'n':
449 				S->m_default_species = NUMERIC;
450 				{
451 					field_t *f;
452 
453 					for (f = S->m_fields_head; f;
454 					    f = f->f_next)
455 						if ((f->f_options &
456 						    FIELD_MODIFIERS_DEFINED) ==
457 						    0)
458 							f->f_species = NUMERIC;
459 				}
460 				break;
461 
462 			case 'b':
463 				S->m_field_options |=
464 				    FIELD_IGNORE_BLANKS_START |
465 				    FIELD_IGNORE_BLANKS_END;
466 				break;
467 
468 			case 'r':
469 				S->m_field_options |=
470 				    FIELD_REVERSE_COMPARISONS;
471 				field_apply_all(S->m_fields_head,
472 				    FIELD_REVERSE_COMPARISONS);
473 				break;
474 
475 			case 't':
476 				/*
477 				 * delimiter
478 				 */
479 				if (S->m_single_byte_locale) {
480 					/*
481 					 * Most debuggers can't take tabs as
482 					 * input arguments, so we provide an
483 					 * escape sequence to allow testing of
484 					 * this special case for the DEBUG
485 					 * version.
486 					 */
487 					S->m_field_separator.sc =
488 #ifdef DEBUG
489 					    xstreql(optarg, "\\t") ? '\t' :
490 #endif
491 					    optarg[0];
492 				} else
493 					(void) mbtowc(&S->m_field_separator.wc,
494 					    optarg, MB_CUR_MAX);
495 				break;
496 
497 			case 'k':
498 				/*
499 				 * key
500 				 */
501 				(void) parse_new_field_spec(S, optarg);
502 				break;
503 
504 			case 'S':
505 				S->m_memory_limit = strtomem(optarg);
506 #ifdef DEBUG
507 				(void) fprintf(stderr, CMDNAME
508 				    ": limiting size to %d bytes\n",
509 				    S->m_memory_limit);
510 #endif /* DEBUG */
511 				break;
512 
513 			/*
514 			 * We never take a naked -999; these should always be
515 			 * associated with a preceding +000.
516 			 */
517 			case '0':
518 			case '1':
519 			case '2':
520 			case '3':
521 			case '4':
522 			case '5':
523 			case '6':
524 			case '7':
525 			case '8':
526 			case '9':
527 				usage();
528 				break;
529 			case '?':
530 				/* error case */
531 				usage();
532 				break;
533 			}
534 
535 			/*
536 			 * Go back for next argument.
537 			 */
538 			continue;
539 		}
540 
541 		/*
542 		 * There are three (interpretable) possibilities for getopt() to
543 		 * return EOF with arguments on the command line: we have seen
544 		 * the "end-of-options" token, --, we have encountered the
545 		 * old-style field definition, +NNN, or we have found a
546 		 * filename.
547 		 *
548 		 * In the second case, we must also search for the optional -NNN
549 		 * field terminal definition.  (since "+joe", for instance, is
550 		 * a valid filename, we must handle this pattern as well.)  This
551 		 * is performed by parse_old_field_spec().
552 		 */
553 		if (xstreql(argv[optind - 1], "--")) {
554 			/*
555 			 * Process all arguments following end-of-options token
556 			 * as filenames.
557 			 */
558 			while (optind < argc) {
559 				if (xstreql(argv[optind], "-"))
560 					S->m_input_from_stdin = 1;
561 				else
562 					stream_add_file_to_chain(
563 					    &(S->m_input_streams),
564 					    argv[optind]);
565 				optind++;
566 			}
567 
568 			break;
569 		}
570 
571 		if (optind < argc) {
572 			if (xstreql(argv[optind], "-")) {
573 				S->m_input_from_stdin = 1;
574 				optind++;
575 			} else if (*(argv[optind]) != '+' ||
576 			    !parse_old_field_spec(S, argc, argv)) {
577 				/*
578 				 * It's a filename, because it either doesn't
579 				 * start with '+', or if it did, it wasn't an
580 				 * actual field specifier.
581 				 */
582 				stream_add_file_to_chain(&(S->m_input_streams),
583 				    argv[optind]);
584 				optind++;
585 			}
586 		}
587 	}
588 
589 	if (S->m_input_streams == NULL)
590 		S->m_input_from_stdin = 1;
591 
592 	if (S->m_output_filename == NULL)
593 		S->m_output_to_stdout = 1;
594 
595 	/*
596 	 * If no fields, then one great field.  However, if the -b option was
597 	 * set globally, be sure to ignore it, as per UNIX98.
598 	 */
599 	if (S->m_fields_head == NULL) {
600 		S->m_field_options &= ~FIELD_IGNORE_BLANKS_START;
601 
602 		(void) parse_new_field_spec(S, "1");
603 		/*
604 		 * "Entire line" fast path is only valid if no delimiter has
605 		 * been set and no modifiers have been applied.
606 		 */
607 		if (S->m_field_separator.wc == 0 &&
608 		    S->m_default_species == ALPHA &&
609 		    S->m_field_options == 0)
610 			S->m_entire_line = 1;
611 	}
612 
613 	return (0);
614 }
615