1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include "options.h"
30
31/*
32 * options
33 *
34 * Overview
35 *   sort(1) supports two methods for specifying the sort key:  the original,
36 *   now-obsolete, +n -m form and the POSIX -k n,m form.  We refer to the former
37 *   as "old specifiers" and the latter as "new specifiers".  The options()
38 *   function parses the command line arguments given to sort, placing the sort
39 *   key specifiers in the internal representation used in fields.c.
40 *
41 * Equivalence of specifiers
42 *   One of sort(1)'s standard peculiarities is the transformation of the
43 *   character offsets and field numbering between the new and old style field
44 *   specifications.  We simply quote from the Single Unix standard:
45 *
46 *	+w.xT -y.zU
47 *
48 *   is equivalent to
49 *
50 * 	undefined		when z == 0, U contains b, and -t is set
51 * 	-k w+1.x+1T,y.0U	when z == 0 otherwise
52 * 	-k w+1.x+1T,y+1.zU	when z > 0
53 *
54 *   Undoubtedly, this seemed logical at the time.  (Using only the field head
55 *   as the coordinate, as done in the obsolete version, seems much simpler.)
56 *   The reverse map is where the key specifier
57 *
58 *	-k w.xT,y.zU
59 *
60 *   is equivalent to
61 *
62 * 	undefined		when z == 0, U contains b, and -t is set
 *	+w-1.x-1T -y.0U		when z == 0 otherwise
 *	+w-1.x-1T -y-1.zU	when z > 0
65 *
66 *   in the obsolete syntax.  Because the original key specifiers lead to a
67 *   simpler implementation, the internal representation of a field in this
68 *   implementation of sort is mostly that given by the obsolete syntax.
69 */
70
71/*
72 * While a key specifier in the obsolete +m ... -n form is being defined (that
73 * is, before the closing -n is seen), a narrower set of options is permitted.
74 * We specify this smaller set of options in OLD_SPEC_OPTIONS_STRING.
75 */
/* Option letters handed to getopt() by options() for ordinary parsing. */
#define	OPTIONS_STRING		"cmuo:T:z:dfiMnrbt:k:S:0123456789"
/*
 * Option letters handed to getopt() by parse_old_field_spec(); the same
 * letters as OPTIONS_STRING, but without the digits.
 */
#define	OLD_SPEC_OPTIONS_STRING	"bdfiMnrcmuo:T:z:t:k:S:"

/* Bits for the flags argument of parse_field_spec(). */
#define	OPTIONS_OLDSPEC		0x1	/* else new-style spec */
#define	OPTIONS_STARTSPEC	0x2	/* else end spec */
81
/*
 * is_number() returns 1 if C is a non-empty string consisting solely of
 * decimal digits, and 0 otherwise.  The empty string is rejected so that an
 * empty argument following -y is not mistaken for a memory size.
 */
static int
is_number(char *C)
{
	const unsigned char *p = (const unsigned char *)C;

	if (*p == '\0')
		return (0);

	/* Single pass; no repeated strlen() in the loop condition. */
	for (; *p != '\0'; p++)
		if (!isdigit(*p))
			return (0);

	return (1);
}
93
/*
 * A key given through -k or the +n syntax that carries modifiers of its own
 * does not inherit the current global field modifiers.  Returns 0 when the
 * leading run of digits, periods and commas in C is exactly length characters
 * long (no modifiers present), and 1 otherwise.
 */
static int
field_spec_has_modifiers(char *C, int length)
{
	int plain_prefix = strspn(C, ",.1234567890");

	return (plain_prefix != length);
}
108
109static void
110field_apply_all(field_t *fc, flag_t flags)
111{
112	field_t *f;
113
114	for (f = fc; f; f = f->f_next)
115		if ((f->f_options & FIELD_MODIFIERS_DEFINED) == 0)
116			f->f_options |= flags;
117}
118
119static int
120parse_field_spec(field_t *F, char *C, int flags, int length)
121{
122	int p_period = MIN(length, strcspn(C, "."));
123	int p_modifiers = MIN(length, strspn(C, ".1234567890"));
124	int p_boundary = MIN(p_period, p_modifiers);
125	int field = 0;
126	int offset = 0;
127	int offset_seen = 0;
128	int i;
129	int blanks_flag = 0;
130
131	for (i = 0; i < p_boundary; i++) {
132		if (isdigit((uchar_t)C[i]))
133			field = (10 * field) + (C[i] - '0');
134		else
135			return (1);
136	}
137
138	if (p_period < p_modifiers) {
139		for (i = p_period + 1; i < p_modifiers; i++) {
140			if (isdigit((uchar_t)C[i])) {
141				offset_seen++;
142				offset = (10 * offset) + (C[i] - '0');
143			} else {
144				return (1);
145			}
146		}
147	}
148
149	if (p_modifiers < length) {
150		for (i = p_modifiers; i < length; i++) {
151			switch (C[i]) {
152				case 'b':
153					blanks_flag = 1;
154					break;
155				case 'd':
156					F->f_options |= FIELD_DICTIONARY_ORDER;
157					break;
158				case 'f':
159					F->f_options |= FIELD_FOLD_UPPERCASE;
160					break;
161				case 'i':
162					F->f_options |=
163					    FIELD_IGNORE_NONPRINTABLES;
164					break;
165				case 'M':
166					F->f_species = MONTH;
167					break;
168				case 'n':
169					F->f_species = NUMERIC;
170					break;
171				case 'r':
172					F->f_options |=
173					    FIELD_REVERSE_COMPARISONS;
174					break;
175				default:
176					usage();
177					break;
178			}
179		}
180	}
181
182	if (flags & OPTIONS_STARTSPEC) {
183		F->f_start_field = field;
184		F->f_start_offset = offset;
185		if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC) {
186			F->f_start_field--;
187			if (offset_seen)
188				F->f_start_offset--;
189		}
190		F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_START : 0;
191	} else {
192		F->f_end_field = field;
193		F->f_end_offset = offset;
194		if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC &&
195		    offset_seen && offset != 0)
196			F->f_end_field--;
197		F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_END : 0;
198	}
199
200	return (0);
201}
202
203static void
204parse_new_field_spec(sort_t *S, char *arg)
205{
206	int length = strlen(arg);
207	int p_comma = MIN(length, strcspn(arg, ","));
208	field_t *nF;
209	int p;
210
211	/*
212	 * New field specifiers do not inherit from the general specifier if
213	 * they have any modifiers set.  (This is specifically tested in the VSC
214	 * test suite, assertion 32 for POSIX.cmd/sort.)
215	 */
216	if (field_spec_has_modifiers(arg, length)) {
217		nF = field_new(NULL);
218		nF->f_options = FIELD_MODIFIERS_DEFINED;
219	} else {
220		nF = field_new(S);
221	}
222	p = parse_field_spec(nF, arg, OPTIONS_STARTSPEC, p_comma);
223
224	if (p != 0)
225		usage();
226
227	if (p_comma < length) {
228		p = parse_field_spec(nF, &(arg[p_comma + 1]), 0,
229		    strlen(&(arg[p_comma + 1])));
230		if (p != 0)
231			usage();
232	}
233
234	if (nF->f_start_field < 0 || nF->f_start_offset < 0) {
235		if (S->m_verbose)
236			warn("-k %s is not a supported field specifier\n", arg);
237	}
238	nF->f_start_field = MAX(nF->f_start_field, 0);
239	nF->f_start_offset = MAX(nF->f_start_offset, 0);
240
241	/*
242	 * If the starting field exceeds a defined ending field, convention
243	 * dictates that the field is ignored.
244	 */
245	if (nF->f_end_field == -1 || nF->f_start_field < nF->f_end_field ||
246	    (nF->f_start_field == nF->f_end_field &&
247	    nF->f_start_offset < nF->f_end_offset)) {
248		field_add_to_chain(&(S->m_fields_head), nF);
249	} else if (S->m_verbose) {
250		warn("illegal field -k %s omitted", arg);
251	}
252}
253
254/*
255 * parse_old_field_spec() is getopt()-aware; it may modify the values of optind,
256 * optarg, and so forth, to correctly determine the characteristics being
257 * assigned to the current field.
258 */
259static int
260parse_old_field_spec(sort_t *S, int argc, char *argv[])
261{
262	field_t *nF;
263	int c, p;
264	char *arg = argv[optind];
265
266	if (field_spec_has_modifiers(arg + 1, strlen(arg + 1))) {
267		nF = field_new(NULL);
268		nF->f_options = FIELD_MODIFIERS_DEFINED;
269	} else {
270		nF = field_new(S);
271	}
272
273	p = parse_field_spec(nF, arg + 1, OPTIONS_OLDSPEC | OPTIONS_STARTSPEC,
274	    strlen(arg + 1));
275
276	if (p != 0) {
277		field_delete(nF);
278		return (0);
279	}
280
281	/*
282	 * In the case that getopt() returns '?' (unrecognized option) or EOF
283	 * (non-option argument), the field is considered closed.
284	 */
285	for (arg = argv[++optind]; optind < argc; arg = argv[optind]) {
286		if (strlen(arg) >= 2 && *arg == '-' &&
287		    isdigit(*(uchar_t *)(arg + 1))) {
288			(void) parse_field_spec(nF, arg + 1,
289			    OPTIONS_OLDSPEC, strlen(arg) - 1);
290			field_add_to_chain(&(S->m_fields_head), nF);
291			optind++;
292			return (1);
293		}
294
295		if ((c = getopt(argc, argv, OLD_SPEC_OPTIONS_STRING)) != EOF) {
296			switch (c) {
297			case 'b':
298				nF->f_options |= FIELD_IGNORE_BLANKS_START;
299				break;
300			case 'd':
301				nF->f_options |= FIELD_DICTIONARY_ORDER;
302				break;
303			case 'f':
304				nF->f_options |= FIELD_FOLD_UPPERCASE;
305				break;
306			case 'i':
307				nF->f_options |= FIELD_IGNORE_NONPRINTABLES;
308				break;
309			case 'M':
310				nF->f_species = MONTH;
311				break;
312			case 'n':
313				nF->f_species = NUMERIC;
314				break;
315			case 'r':
316				nF->f_options |= FIELD_REVERSE_COMPARISONS;
317				break;
318			case '?':
319			case 'c':
320			case 'm':
321			case 'u':
322				/*
323				 * Options without arguments.
324				 */
325				optind -= 1;
326				field_add_to_chain(&(S->m_fields_head), nF);
327				return (1);
328				/*NOTREACHED*/
329			case 'o':
330			case 'T':
331			case 'z':
332			case 't':
333			case 'k':
334			case 'S':
335				/*
336				 * Options with arguments.
337				 */
338				if (optarg == argv[optind - 1] + 2) {
339					optind -= 1;
340				} else {
341					optind -= 2;
342				}
343				field_add_to_chain(&(S->m_fields_head), nF);
344				return (1);
345				/*NOTREACHED*/
346			default:
347				die(EMSG_UNKN_OPTION);
348				/*NOTREACHED*/
349			}
350		} else {
351			break;
352		}
353	}
354
355	field_add_to_chain(&(S->m_fields_head), nF);
356	return (1);
357}
358
359int
360options(sort_t *S, int argc, char *argv[])
361{
362	int c;
363
364	optind = 1;
365	while (optind < argc) {
366		if (strncmp("-y", argv[optind], strlen("-y")) == 0) {
367			/*
368			 * The -y [kmem] option violates the standard syntax
369			 * outlined in intro(1).  we have to be a little fancy
370			 * to determine if the next argument is a valid integer.
371			 * (note, of course, that the previous sort(1) had no
372			 * mechanism to resolve a final
373			 *	-y 99999
374			 * into
375			 *	-y, file 99999
376			 * or
377			 *	-y 99999, file stdin
378			 *
379			 * Now one can unambiguously use
380			 *	-y -- 99999
381			 * and
382			 *	-y 99999 -
383			 * to distinguish these cases.
384			 *
385			 * That said, we do not use the information passed using
386			 * -y option in sort(1); we provide the argument to
387			 * preserve compatibility for existing scripts.
388			 */
389			if (strlen(argv[optind]) == strlen("-y") &&
390			    optind + 1 < argc &&
391			    is_number(argv[optind + 1]))
392				optind += 2;
393			else
394				optind += 1;
395		}
396
397		if ((c = getopt(argc, argv, OPTIONS_STRING)) != EOF) {
398			switch (c) {
399			case 'c':
400				S->m_check_if_sorted_only = 1;
401				break;
402
403			case 'm':
404				S->m_merge_only = 1;
405				break;
406
407			case 'u':
408				S->m_unique_lines = 1;
409				break;
410
411			case 'o':
412				S->m_output_filename = optarg;
413				break;
414
415			case 'T':
416				S->m_tmpdir_template = optarg;
417				break;
418
419			case 'z':
420				/*
421				 * ignore optarg -- obsolete
422				 */
423				break;
424
425			case 'd':
426				S->m_field_options |= FIELD_DICTIONARY_ORDER;
427				field_apply_all(S->m_fields_head,
428				    FIELD_DICTIONARY_ORDER);
429				break;
430
431			case 'f':
432				S->m_field_options |= FIELD_FOLD_UPPERCASE;
433				field_apply_all(S->m_fields_head,
434				    FIELD_FOLD_UPPERCASE);
435				break;
436
437			case 'i':
438				S->m_field_options |=
439				    FIELD_IGNORE_NONPRINTABLES;
440				field_apply_all(S->m_fields_head,
441				    FIELD_IGNORE_NONPRINTABLES);
442				break;
443
444			case 'M':
445				S->m_default_species = MONTH;
446				S->m_field_options &=
447				    ~FIELD_IGNORE_BLANKS_START;
448				break;
449
450			case 'n':
451				S->m_default_species = NUMERIC;
452				{
453					field_t *f;
454
455					for (f = S->m_fields_head; f;
456					    f = f->f_next)
457						if ((f->f_options &
458						    FIELD_MODIFIERS_DEFINED) ==
459						    0)
460							f->f_species = NUMERIC;
461				}
462				break;
463
464			case 'b':
465				S->m_field_options |=
466				    FIELD_IGNORE_BLANKS_START |
467				    FIELD_IGNORE_BLANKS_END;
468				break;
469
470			case 'r':
471				S->m_field_options |=
472				    FIELD_REVERSE_COMPARISONS;
473				field_apply_all(S->m_fields_head,
474				    FIELD_REVERSE_COMPARISONS);
475				break;
476
477			case 't':
478				/*
479				 * delimiter
480				 */
481				if (S->m_single_byte_locale) {
482					/*
483					 * Most debuggers can't take tabs as
484					 * input arguments, so we provide an
485					 * escape sequence to allow testing of
486					 * this special case for the DEBUG
487					 * version.
488					 */
489					S->m_field_separator.sc =
490#ifdef DEBUG
491					    xstreql(optarg, "\\t") ? '\t' :
492#endif
493					    optarg[0];
494				} else
495					(void) mbtowc(&S->m_field_separator.wc,
496					    optarg, MB_CUR_MAX);
497				break;
498
499			case 'k':
500				/*
501				 * key
502				 */
503				(void) parse_new_field_spec(S, optarg);
504				break;
505
506			case 'S':
507				S->m_memory_limit = strtomem(optarg);
508#ifdef DEBUG
509				(void) fprintf(stderr, CMDNAME
510				    ": limiting size to %d bytes\n",
511				    S->m_memory_limit);
512#endif /* DEBUG */
513				break;
514
515			/*
516			 * We never take a naked -999; these should always be
517			 * associated with a preceding +000.
518			 */
519			case '0':
520			case '1':
521			case '2':
522			case '3':
523			case '4':
524			case '5':
525			case '6':
526			case '7':
527			case '8':
528			case '9':
529				usage();
530				break;
531			case '?':
532				/* error case */
533				usage();
534				break;
535			}
536
537			/*
538			 * Go back for next argument.
539			 */
540			continue;
541		}
542
543		/*
544		 * There are three (interpretable) possibilities for getopt() to
545		 * return EOF with arguments on the command line: we have seen
546		 * the "end-of-options" token, --, we have encountered the
547		 * old-style field definition, +NNN, or we have found a
548		 * filename.
549		 *
550		 * In the second case, we must also search for the optional -NNN
551		 * field terminal definition.  (since "+joe", for instance, is
552		 * a valid filename, we must handle this pattern as well.)  This
553		 * is performed by parse_old_field_spec().
554		 */
555		if (xstreql(argv[optind - 1], "--")) {
556			/*
557			 * Process all arguments following end-of-options token
558			 * as filenames.
559			 */
560			while (optind < argc) {
561				if (xstreql(argv[optind], "-"))
562					S->m_input_from_stdin = 1;
563				else
564					stream_add_file_to_chain(
565					    &(S->m_input_streams),
566					    argv[optind]);
567				optind++;
568			}
569
570			break;
571		}
572
573		if (optind < argc) {
574			if (xstreql(argv[optind], "-")) {
575				S->m_input_from_stdin = 1;
576				optind++;
577			} else if (*(argv[optind]) != '+' ||
578			    !parse_old_field_spec(S, argc, argv)) {
579				/*
580				 * It's a filename, because it either doesn't
581				 * start with '+', or if it did, it wasn't an
582				 * actual field specifier.
583				 */
584				stream_add_file_to_chain(&(S->m_input_streams),
585				    argv[optind]);
586				optind++;
587			}
588		}
589	}
590
591	if (S->m_input_streams == NULL)
592		S->m_input_from_stdin = 1;
593
594	if (S->m_output_filename == NULL)
595		S->m_output_to_stdout = 1;
596
597	/*
598	 * If no fields, then one great field.  However, if the -b option was
599	 * set globally, be sure to ignore it, as per UNIX98.
600	 */
601	if (S->m_fields_head == NULL) {
602		S->m_field_options &= ~FIELD_IGNORE_BLANKS_START;
603
604		(void) parse_new_field_spec(S, "1");
605		/*
606		 * "Entire line" fast path is only valid if no delimiter has
607		 * been set and no modifiers have been applied.
608		 */
609		if (S->m_field_separator.wc == 0 &&
610		    S->m_default_species == ALPHA &&
611		    S->m_field_options == 0)
612			S->m_entire_line = 1;
613	}
614
615	return (0);
616}
617