xref: /illumos-gate/usr/src/cmd/sort/fields.c (revision 101e15b5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include "fields.h"
27 
28 /*
29  * fields
30  *
31  * Overview
32  *   By a field, we mean the various delimited character sequences within each
33  *   line of the input files.  The sort key consists of an ordered sequence of
34  *   fields, which need not include all possible fields for the given line.
35  *   (Furthermore, not every line need contain sufficient fields for the fields
36  *   given within the sort key.  In fact, none of the lines in the input stream
37  *   need contain sufficient fields.)
38  *
39  *   There are two methods for specifying fields for sort(1); these are
40  *   discussed in options.c.  Here we discuss only the internal representation
41  *   of fields, as used for constructing the collation vector for each line as
42  *   defined by the sort key.
43  *
44  * Representation
45  *   The sort key is a singly-linked list of field specifiers.  At present,
46  *   fields may belong to one of three species:  alphabetical, numerical, or
47  *   monthly; the species (f_species) then indicates the conversion function
48  *   (f_convert) used to transform the raw characters of the character sequence
49  *   to a collatable form.  (In principle, this allows us to consider future
50  *   field species such as hexadecimal.)
51  *
 *   Fields and offsets are numbered such that zero refers to the first field or
 *   character, respectively.  Thus, the interpretation of a key specifier, m.n,
 *   is that the field begins at the nth character beyond the mth occurrence of
 *   the key separator.  If the blanks flag has been specified, then the field
 *   begins at the nth non-blank character past the mth key separator.  If the
 *   key separator is unspecified, then the key separator is defined as one or
 *   more blank characters.
59  *
60  *   In general, the various options afforded by sort may be broken into two
61  *   categories:  field species and field modifiers.  For each field species,
62  *   there is one or more conversion routines that take a delimited character
63  *   sequence and convert it to a character sequence collatable by strcmp() or
64  *   memcmp().  For field species that may be further modified, such as the
65  *   fold-to-uppercase option for alphabetic fields, the conversion routine may
66  *   be aware of how the modifier affects collation.  Finally, the no-modifiers
67  *   case may present an opportunity for a simplified, faster version.
68  *
69  * Code Structure
70  *   The code paths for single-byte and multi-byte locales diverge significantly
71  *   in fields.c.  Most routines have an *_wide() version, which produces an
72  *   equivalent effect for line records whose data field is composed of wide
73  *   characters (wchar_t).  However, the l_collated field of a line record is
74  *   always composed of characters, so that the radix sorts provided in
75  *   internal.c can work in both single- and multi-byte locales.  Thus, in the
76  *   various convert_*_wide() routines, the output is placed in l_collated, with
77  *   a length multiplier of 4.
78  */
79 
/*
 * Scanner states used by the numeric conversion routines.
 */
#define	BEFORE_NUMBER	0x0
#define	IN_NUMBER	0x1

/*
 * Locale-dependent grouping and radix characters, in single-byte and wide
 * form.  A value of zero means "not defined for this locale".
 */
static char	numerical_separator;
static char	numerical_decimal;
static char	monetary_separator;
static char	monetary_decimal;

static wchar_t	w_numerical_separator;
static wchar_t	w_numerical_decimal;
static wchar_t	w_monetary_separator;
static wchar_t	w_monetary_decimal;

#define	MONTHS_IN_YEAR	12
#define	MAX_MON_LEN	20

/*
 * Tokens written to the collation vector by the month conversions: MO_NONE
 * marks a key with no recognisable month, and MO_OFFSET is added to the
 * zero-based month index of a recognised one.
 */
enum { MO_NONE = 1, MO_OFFSET = 2 };

/*
 * Upper-cased abbreviated month names and their lengths, in single-byte and
 * wide form.
 */
static char	*months[MONTHS_IN_YEAR];
static size_t	month_lengths[MONTHS_IN_YEAR];
static wchar_t	*w_months[MONTHS_IN_YEAR];
static size_t	w_month_lengths[MONTHS_IN_YEAR];

#define	DECIMAL_CHAR		(numerical_decimal)
#define	IS_BLANK(x)		(isspace((uchar_t)(x)) && (x) != '\n')
#define	IS_SEPARATOR(x)		\
	((numerical_separator != '\0' && (x) == numerical_separator) || \
	(monetary_separator != '\0' && (x) == monetary_separator))
#define	IS_DECIMAL(x)		\
	((x) == numerical_decimal || \
	(monetary_decimal != '\0' && (x) == monetary_decimal))
#define	W_DECIMAL_CHAR		(w_numerical_decimal)
#define	W_IS_BLANK(x)		(iswspace(x) && (x) != L'\n')
/*
 * The wide predicates are guarded by the wide variable they compare against,
 * so that an undefined (zero) wide separator can never match a null wide
 * character, whatever the state of its single-byte counterpart.
 */
#define	W_IS_SEPARATOR(x)	\
	((w_numerical_separator != L'\0' && (x) == w_numerical_separator) || \
	(w_monetary_separator != L'\0' && (x) == w_monetary_separator))
#define	W_IS_DECIMAL(x)		\
	(((x) == w_numerical_decimal) || \
	(w_monetary_decimal != L'\0' && (x) == w_monetary_decimal))

#define	INTERFIELD_SEPARATOR '\0'
#define	W_INTERFIELD_SEPARATOR L'\0'

#define	INT_SIGN_FLIP_MASK 0x80000000
#define	INT_SIGN_PASS_MASK 0x00000000
125 
/*
 * strx_ops_t, xfrm_len, and C_ncpy:  In the case where we are sorting in the
 * C locale, we want to avoid the expense of transforming strings to collatable
 * forms since, by definition, an arbitrary string in the C locale is already in
 * its collatable form.  Therefore, we construct a small ops vector (the
 * strx_ops) and two wrappers: xfrm_len() to massage the strxfrm(NULL, ...) into
 * strlen()-like behaviour, and C_ncpy() to make strncpy() appear
 * strxfrm()-like.
 */
/*
 * Report the size needed to hold the strxfrm() form of s2, counting the
 * terminating null.  The len argument exists only to match the sx_len
 * signature and is not consulted.
 */
/*ARGSUSED*/
static size_t
xfrm_len(const char *s2, size_t len)
{
	size_t xfrm_size = strxfrm(NULL, s2, 0);

	return (xfrm_size + 1);
}
141 
/*
 * strxfrm()-shaped wrapper around strncpy() for the C locale.  The caller
 * (field_convert_alpha) passes n as the length of s2 plus one for the null
 * character, so the length reported back is n - 1.  Not intended for general
 * use.
 */
static size_t
C_ncpy(char *s1, const char *s2, size_t n)
{
	const size_t copied = n - 1;

	(void) strncpy(s1, s2, n);

	return (copied);
}
154 
/*
 * C-locale counterpart of xfrm_len():  the string is used as is, so the
 * length handed in by the caller is simply handed back.
 */
/*ARGSUSED*/
static size_t
C_len(const char *s, size_t len)
{
	const size_t collated_length = len;

	ASSERT(s != NULL);

	return (collated_length);
}
162 
163 typedef struct _strx_ops {
164 	size_t	(*sx_len)(const char *, size_t);
165 	size_t	(*sx_xfrm)(char *, const char *, size_t);
166 } strx_ops_t;
167 
168 static const strx_ops_t C_ops = { C_len, C_ncpy };
169 static const strx_ops_t SB_ops = { xfrm_len, strxfrm };
170 
171 static const strx_ops_t *xfrm_ops;
172 
173 static void
field_initialize_separator(void)174 field_initialize_separator(void)
175 {
176 	/*
177 	 * A locale need not define all of the cases below:  only decimal_point
178 	 * must be defined.  Furthermore, sort(1) has traditionally not used the
179 	 * positive_sign and negative_sign, grouping, or currency_symbols (or
180 	 * their numeric counterparts, if any).
181 	 */
182 	struct lconv *conv = localeconv();
183 
184 	if (!xstreql(conv->thousands_sep, "")) {
185 		numerical_separator = *conv->thousands_sep;
186 		(void) mbtowc(&w_numerical_separator, conv->thousands_sep,
187 		    MB_CUR_MAX);
188 	} else
189 		numerical_separator = '\0';
190 
191 	if (!xstreql(conv->mon_thousands_sep, "")) {
192 		monetary_separator = *conv->mon_thousands_sep;
193 		(void) mbtowc(&w_monetary_separator, conv->mon_thousands_sep,
194 		    MB_CUR_MAX);
195 	} else
196 		monetary_separator = '\0';
197 
198 	if (!xstreql(conv->mon_decimal_point, "")) {
199 		monetary_decimal = *conv->mon_decimal_point;
200 		(void) mbtowc(&w_monetary_decimal, conv->mon_decimal_point,
201 		    MB_CUR_MAX);
202 	} else
203 		monetary_decimal = '\0';
204 
205 	numerical_decimal = *conv->decimal_point;
206 	(void) mbtowc(&w_numerical_decimal, conv->decimal_point, MB_CUR_MAX);
207 }
208 
209 static void
field_initialize_month(int is_c_locale)210 field_initialize_month(int is_c_locale)
211 {
212 	int i;
213 	int j;
214 	struct tm this_month;
215 	const char *c_months[MONTHS_IN_YEAR] = {
216 		"JAN", "FEB", "MAR", "APR", "MAY", "JUN",
217 		"JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
218 	};
219 
220 	char month_name[MAX_MON_LEN * MB_LEN_MAX];
221 	wchar_t	w_month_name[MAX_MON_LEN];
222 
223 	if (is_c_locale) {
224 		for (i = 0; i < MONTHS_IN_YEAR; i++) {
225 			months[i] = (char *)c_months[i];
226 			month_lengths[i] = strlen(c_months[i]);
227 		}
228 		/*
229 		 * We don't need to initialize the wide version of the month
230 		 * names.
231 		 */
232 		return;
233 	}
234 
235 	(void) memset(&this_month, 0, sizeof (this_month));
236 
237 	for (i = 0; i < MONTHS_IN_YEAR; i++) {
238 		this_month.tm_mon = i;
239 
240 		(void) strftime(month_name, sizeof (month_name),
241 		    "%b", &this_month);
242 
243 		for (j = 0; j < strlen(month_name); j++)
244 			month_name[j] = toupper(month_name[j]);
245 		(void) mbstowcs(w_month_name, month_name, MAX_MON_LEN);
246 
247 		months[i] = strdup(month_name);
248 		month_lengths[i] = strlen(month_name);
249 		w_months[i] = wsdup(w_month_name);
250 		w_month_lengths[i] = wslen(w_month_name);
251 	}
252 }
253 
254 void
field_initialize(sort_t * S)255 field_initialize(sort_t *S)
256 {
257 	field_initialize_month(S->m_c_locale);
258 	field_initialize_separator();
259 
260 	if (S->m_c_locale)
261 		xfrm_ops = &C_ops;
262 	else
263 		xfrm_ops = &SB_ops;
264 }
265 
266 field_t *
field_new(sort_t * S)267 field_new(sort_t *S)
268 {
269 	field_t	*F = safe_realloc(NULL, sizeof (field_t));
270 
271 	F->f_start_field = -1;
272 	F->f_start_offset = -1;
273 	F->f_end_field = -1;
274 	F->f_end_offset = -1;
275 	F->f_next = NULL;
276 
277 	if (S == NULL) {
278 		F->f_species = ALPHA;
279 		F->f_options = 0;
280 	} else {
281 		F->f_species = S->m_default_species;
282 		F->f_options = S->m_field_options;
283 	}
284 
285 	return (F);
286 }
287 
288 void
field_delete(field_t * F)289 field_delete(field_t *F)
290 {
291 	free(F);
292 }
293 
294 /*
295  * The recursive implementation of field_add_to_chain() given below is
296  * inappropriate if function calls are expensive, or a truly large number of
297  * fields are anticipated.
298  */
299 void
field_add_to_chain(field_t ** F,field_t * A)300 field_add_to_chain(field_t **F, field_t *A)
301 {
302 	if (*F == NULL)
303 		*F = A;
304 	else
305 		field_add_to_chain(&((*F)->f_next), A);
306 }
307 
#ifdef DEBUG
#ifndef _LP64
#define	FIELD_FMT \
"\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n"
#else /* !_LP64 */
#define	FIELD_FMT \
"\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n"
#endif /* !_LP64 */

/*
 * field_print is used only for debugging purposes.  It writes the species,
 * the option flags that are set (or NO_MODIFIERS when none is), and the four
 * boundary values of F to stderr.  Every option name is followed by a space
 * so that adjacent names do not run together.
 */
void
field_print(field_t *F)
{
	char *field_names[] = {"ALPHA", "MONTH", "NUMERIC"};
	int status = 0;

	(void) fprintf(stderr, "Type: %s", field_names[F->f_species]);
	(void) fprintf(stderr, "\tOptions: ");

	if (F->f_options & FIELD_REVERSE_COMPARISONS) {
		(void) fprintf(stderr, "REVERSE ");
		status++;
	}
	if (F->f_options & FIELD_DICTIONARY_ORDER) {
		(void) fprintf(stderr, "DICTIONARY ");
		status++;
	}
	if (F->f_options & FIELD_FOLD_UPPERCASE) {
		(void) fprintf(stderr, "UPPERCASE ");
		status++;
	}
	if (F->f_options & FIELD_IGNORE_NONPRINTABLES) {
		(void) fprintf(stderr, "PRINTABLES ");
		status++;
	}
	if (F->f_options & FIELD_IGNORE_BLANKS_START) {
		(void) fprintf(stderr, "BLANKS_START ");
		status++;
	}
	if (F->f_options & FIELD_IGNORE_BLANKS_END) {
		(void) fprintf(stderr, "BLANKS_END ");
		status++;
	}

	if (status == 0)
		(void) fprintf(stderr, "NO_MODIFIERS");

	(void) fprintf(stderr, FIELD_FMT, F->f_start_field, F->f_start_offset,
	    F->f_end_field, F->f_end_offset);
}
#endif /* DEBUG */
361 
362 static ssize_t
field_boundary(field_t * F,line_rec_t * L,int is_end,int is_blanks)363 field_boundary(field_t *F, line_rec_t *L, int is_end, int is_blanks)
364 {
365 	char *S = L->l_data.sp;
366 	char *T = S;
367 	char *eol = S + L->l_data_length;
368 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
369 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
370 	ssize_t ret;
371 
372 	ASSERT(is_end || field > -1);
373 
374 	if (is_end && field == -1)
375 		return (L->l_data_length);
376 
377 	while (field-- > 0) {
378 		while (T < eol && IS_BLANK(*T))
379 			T++;
380 
381 		while (T < eol && !IS_BLANK(*T))
382 			T++;
383 	}
384 
385 	if ((!is_end || offset > 0) && is_blanks) {
386 		while (IS_BLANK(*T))
387 			T++;
388 	}
389 
390 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
391 		return (L->l_data_length);
392 
393 	return (ret);
394 }
395 
396 static void
field_delimit(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end)397 field_delimit(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
398 {
399 	ASSERT(F->f_start_field > -1);
400 
401 	*start = field_boundary(F, L, 0,
402 	    F->f_options & FIELD_IGNORE_BLANKS_START);
403 	*end = field_boundary(F, L, 1,
404 	    F->f_options & FIELD_IGNORE_BLANKS_END);
405 }
406 
407 static ssize_t
field_boundary_wide(field_t * F,line_rec_t * L,int is_end,int is_blanks)408 field_boundary_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks)
409 {
410 	wchar_t *S = L->l_data.wp;
411 	wchar_t *T = S;
412 	wchar_t *eol = S + L->l_data_length;
413 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
414 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
415 	ssize_t ret;
416 
417 	ASSERT(is_end || field > -1);
418 
419 	if (is_end && field == -1)
420 		return (L->l_data_length);
421 
422 	while (field-- > 0) {
423 		while (T < eol && W_IS_BLANK(*T))
424 			T++;
425 
426 		while (T < eol && !W_IS_BLANK(*T))
427 			T++;
428 	}
429 
430 	if ((!is_end || offset > 0) && is_blanks) {
431 		while (W_IS_BLANK(*T))
432 			T++;
433 	}
434 
435 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
436 		return (L->l_data_length);
437 
438 	return (ret);
439 }
440 
441 static void
field_delimit_wide(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end)442 field_delimit_wide(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
443 {
444 	ASSERT(F->f_start_field > -1);
445 
446 	*start = field_boundary_wide(F, L, 0,
447 	    F->f_options & FIELD_IGNORE_BLANKS_START);
448 	*end = field_boundary_wide(F, L, 1,
449 	    F->f_options & FIELD_IGNORE_BLANKS_END);
450 }
451 
452 static ssize_t
field_boundary_tabbed(field_t * F,line_rec_t * L,int is_end,int is_blanks,vchar_t delimiter)453 field_boundary_tabbed(field_t *F, line_rec_t *L, int is_end, int is_blanks,
454     vchar_t delimiter)
455 {
456 	char *S = L->l_data.sp;
457 	char *T = S;
458 	char *eol = S + L->l_data_length;
459 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
460 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
461 	ssize_t ret;
462 
463 	ASSERT(is_end || field > -1);
464 
465 	if (is_end && field == -1)
466 		return (L->l_data_length);
467 
468 	while (field-- > 0) {
469 		T = xstrnchr(T, delimiter.sc, eol - T);
470 		if (T == NULL || T > eol)
471 			return (L->l_data_length);
472 
473 		T++;
474 	}
475 
476 	if ((!is_end || offset != 0) && is_blanks) {
477 		while (IS_BLANK(*T))
478 			T++;
479 	}
480 
481 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
482 		if (L->l_data_length <= 0)
483 			return (0);
484 		if (S[L->l_data_length - 1] == delimiter.sc) {
485 			return (L->l_data_length - 1);
486 		} else {
487 			return (L->l_data_length);
488 		}
489 	}
490 
491 	if (is_end && offset == 0)
492 		ret--;
493 
494 	return (ret);
495 }
496 
497 /*
498  * field_delimit_tabbed() is called when a field separator has been defined
499  * using the -t option.  The character at the offset, start, is either one or
500  * more character positions past the delimiter marking the start of the
501  * field, or at the end of the line.
502  */
503 static void
field_delimit_tabbed(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end,vchar_t delimiter)504 field_delimit_tabbed(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end,
505     vchar_t delimiter)
506 {
507 	ASSERT(F->f_start_field > -1);
508 
509 	*start = field_boundary_tabbed(F, L, 0, F->f_options &
510 	    FIELD_IGNORE_BLANKS_START, delimiter);
511 	*end = field_boundary_tabbed(F, L, 1, F->f_options &
512 	    FIELD_IGNORE_BLANKS_END, delimiter);
513 }
514 
515 static ssize_t
field_boundary_tabbed_wide(field_t * F,line_rec_t * L,int is_end,int is_blanks,vchar_t delimiter)516 field_boundary_tabbed_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks,
517     vchar_t delimiter)
518 {
519 	wchar_t *S = L->l_data.wp;
520 	wchar_t *T = S;
521 	wchar_t *eol = S + L->l_data_length;
522 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
523 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
524 	ssize_t ret;
525 
526 	ASSERT(is_end || field > -1);
527 
528 	if (is_end && field == -1)
529 		return (L->l_data_length);
530 
531 	while (field-- > 0) {
532 		T = xwsnchr(T, delimiter.wc, eol - T);
533 		if (T == NULL || T > eol)
534 			return (L->l_data_length);
535 
536 		T++;
537 	}
538 
539 	if ((!is_end || offset != 0) && is_blanks) {
540 		while (W_IS_BLANK(*T))
541 			T++;
542 	}
543 
544 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
545 		if (L->l_data_length <= 0)
546 			return (0);
547 		if (S[L->l_data_length - 1] == delimiter.wc) {
548 			return (L->l_data_length - 1);
549 		} else {
550 			return (L->l_data_length);
551 		}
552 	}
553 
554 	if (is_end && offset == 0)
555 		ret--;
556 
557 	return (ret);
558 }
559 
560 static void
field_delimit_tabbed_wide(field_t * F,line_rec_t * L,ssize_t * start,ssize_t * end,vchar_t delimiter)561 field_delimit_tabbed_wide(field_t *F, line_rec_t *L, ssize_t *start,
562     ssize_t *end, vchar_t delimiter)
563 {
564 	ASSERT(F->f_start_field > -1);
565 
566 	*start = field_boundary_tabbed_wide(F, L, 0, F->f_options &
567 	    FIELD_IGNORE_BLANKS_START, delimiter);
568 	*end = field_boundary_tabbed_wide(F, L, 1, F->f_options &
569 	    FIELD_IGNORE_BLANKS_END, delimiter);
570 }
571 
572 /*ARGSUSED*/
573 ssize_t
field_convert_month(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)574 field_convert_month(field_t *F, line_rec_t *L, vchar_t delimiter,
575     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
576 {
577 	int j;
578 	ssize_t	val;
579 	char month_candidate[MAX_MON_LEN * MB_LEN_MAX];
580 	ssize_t month_length = data_length;
581 	ssize_t month_offset = data_offset;
582 
583 	if (sizeof (char) > L->l_collate_bufsize - coll_offset)
584 		return (-1);
585 
586 	(void) memset(month_candidate, 0, MAX_MON_LEN * MB_LEN_MAX);
587 
588 
589 	/*
590 	 * The month field formally begins with the first non-blank character.
591 	 */
592 	while (IS_BLANK(*(L->l_data.sp + month_offset))) {
593 		month_offset++;
594 		month_length--;
595 	}
596 
597 	for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
598 		month_candidate[j] = toupper((L->l_data.sp + month_offset)[j]);
599 
600 	for (j = 0; j < MONTHS_IN_YEAR; j++) {
601 		if (xstrneql(month_candidate, months[j], month_lengths[j])) {
602 			*(L->l_collate.sp + coll_offset) = '\0' + j + MO_OFFSET;
603 			return (1);
604 		}
605 	}
606 
607 	/*
608 	 * no matching month; copy string into field.  required behaviour is
609 	 * that "month-free" keys sort before month-sortable keys, so insert
610 	 * a "will sort first" token.
611 	 */
612 	*(L->l_collate.sp + coll_offset) = '\0' + MO_NONE;
613 
614 	val = field_convert_alpha_simple(F, L, delimiter, data_offset,
615 	    data_length, coll_offset + 1);
616 
617 	if (val < 0)
618 		return (-1);
619 	else
620 		return (val + 1);
621 }
622 
623 /*ARGSUSED*/
624 ssize_t
field_convert_month_wide(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)625 field_convert_month_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
626     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
627 {
628 	ssize_t j;
629 	ssize_t val;
630 	wchar_t month_candidate[MAX_MON_LEN];
631 	wchar_t *month;
632 	wchar_t *buffer = L->l_collate.wp + coll_offset;
633 	ssize_t month_length = data_length;
634 	ssize_t month_offset = data_offset;
635 
636 	if (L->l_collate_bufsize - coll_offset * sizeof (wchar_t) <
637 	    sizeof (wchar_t))
638 		return (-1);
639 
640 	(void) memset(month_candidate, 0, MAX_MON_LEN * sizeof (wchar_t));
641 
642 
643 	while (W_IS_BLANK(*(L->l_data.wp + month_offset))) {
644 		month_offset++;
645 		month_length--;
646 	}
647 
648 	month = L->l_data.wp + month_offset;
649 
650 	for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
651 		month_candidate[j] = towupper(month[j]);
652 
653 	for (j = 0; j < MONTHS_IN_YEAR; j++)
654 		if (xwcsneql(month_candidate, w_months[j],
655 		    w_month_lengths[j])) {
656 			*buffer = L'\0' + j + MO_OFFSET;
657 			return (1);
658 		}
659 
660 	*buffer = L'\0' + MO_NONE;
661 
662 	val = field_convert_alpha_wide(F, L, delimiter, data_offset,
663 	    data_length, coll_offset + sizeof (wchar_t));
664 
665 	if (val < 0)
666 		return (-1);
667 	else
668 		return (val + 1);
669 }
670 
671 /*
672  * field_convert_alpha() always fails with return value -1 if the converted
673  * string would cause l_collate_length to exceed l_collate_bufsize
674  */
675 /*ARGSUSED*/
676 ssize_t
field_convert_alpha(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)677 field_convert_alpha(field_t *F, line_rec_t *L, vchar_t delimiter,
678     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
679 {
680 	static char *compose;
681 	static ssize_t compose_length;
682 
683 	ssize_t	clength = 0;
684 	ssize_t	dlength;
685 	ssize_t	i;
686 
687 	if (compose_length < (data_length + 1)) {
688 		compose_length = data_length + 1;
689 		compose = safe_realloc(compose, compose_length * sizeof (char));
690 	}
691 
692 	for (i = data_offset; i < data_offset + data_length; i++) {
693 		char t = (L->l_data.sp)[i];
694 
695 		if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) &&
696 		    !isprint((uchar_t)t))
697 			continue;
698 
699 		if ((F->f_options & FIELD_DICTIONARY_ORDER) &&
700 		    !isalnum((uchar_t)t) && !isspace((uchar_t)t))
701 			continue;
702 
703 		if (F->f_options & FIELD_FOLD_UPPERCASE)
704 			t = toupper(t);
705 
706 		compose[clength++] = t;
707 	}
708 	compose[clength] = '\0';
709 
710 	if ((dlength = xfrm_ops->sx_len(compose, clength)) <
711 	    L->l_collate_bufsize - coll_offset)
712 		return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
713 		    compose, dlength + 1));
714 	else
715 		return ((ssize_t)-1);
716 }
717 
718 /*ARGSUSED*/
719 ssize_t
field_convert_alpha_simple(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)720 field_convert_alpha_simple(field_t *F, line_rec_t *L, vchar_t delimiter,
721     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
722 {
723 	static char *compose;
724 	static ssize_t compose_length;
725 
726 	ssize_t	clength;
727 	ssize_t	dlength;
728 
729 	if (compose_length < (data_length + 1)) {
730 		compose_length = data_length + 1;
731 		compose = safe_realloc(compose, compose_length * sizeof (char));
732 	}
733 
734 	(void) memcpy(compose, L->l_data.sp + data_offset, data_length);
735 	clength = data_length;
736 	compose[clength] = '\0';
737 
738 	if ((dlength = xfrm_ops->sx_len(compose, clength)) <
739 	    L->l_collate_bufsize - coll_offset)
740 		return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
741 		    compose, dlength + 1));
742 	else
743 		return ((ssize_t)-1);
744 }
745 
746 /*ARGSUSED*/
747 ssize_t
field_convert_alpha_wide(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)748 field_convert_alpha_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
749     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
750 {
751 	wchar_t	*compose = safe_realloc(NULL, (data_length + 1) *
752 	    sizeof (wchar_t));
753 	ssize_t	clength = 0;
754 	ssize_t	dlength;
755 	ssize_t	i;
756 	ssize_t ret;
757 
758 	for (i = data_offset; i < data_offset + data_length; i++) {
759 		wchar_t	t = (L->l_data.wp)[i];
760 
761 		if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && !iswprint(t))
762 			continue;
763 
764 		if ((F->f_options & FIELD_DICTIONARY_ORDER) && !iswalnum(t) &&
765 		    !iswspace(t))
766 			continue;
767 
768 		if (F->f_options & FIELD_FOLD_UPPERCASE)
769 			t = towupper(t);
770 
771 		compose[clength++] = t;
772 	}
773 	compose[clength] = L'\0';
774 
775 	dlength = wcsxfrm(NULL, compose, (size_t)0);
776 	if ((dlength * sizeof (wchar_t)) < L->l_collate_bufsize -
777 	    coll_offset * sizeof (wchar_t)) {
778 		ret = (ssize_t)wcsxfrm(L->l_collate.wp + coll_offset, compose,
779 		    (size_t)dlength + 1);
780 	} else {
781 		ret = (ssize_t)-1;
782 	}
783 
784 	safe_free(compose);
785 
786 	return (ret);
787 }
788 
789 /*
790  * field_convert_numeric() converts the given field into a collatable numerical
791  * sequence.  The sequence is ordered as { log, integer, separator, fraction },
792  * with an optional sentinel component at the sequence end.
793  */
794 /*ARGSUSED*/
795 ssize_t
field_convert_numeric(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)796 field_convert_numeric(field_t *F, line_rec_t *L, vchar_t delimiter,
797     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
798 {
799 	char *number;
800 	char *buffer = L->l_collate.sp + coll_offset;
801 	ssize_t length;
802 
803 	char sign = '2';
804 	int log_ten;
805 	char *digits = buffer + 1 + sizeof (int) / sizeof (char);
806 	size_t j = 0;
807 	size_t i;
808 
809 	int state = BEFORE_NUMBER;
810 
811 	number = L->l_data.sp + data_offset;
812 	length = data_length;
813 
814 	/*
815 	 * Eat leading blanks, if any.
816 	 */
817 	for (i = 0; i < length; i++)
818 		if (!IS_BLANK(number[i]))
819 			break;
820 
821 	/*
822 	 * Test that there is sufficient size in the collation buffer for our
823 	 * number.  In addition to the possible remaining characters in the
824 	 * field, we also require space for the sign (char), logarithm (int),
825 	 * separator (char), and as many as two string terminators (for reverse
826 	 * sorts).
827 	 */
828 	if (((length - i) + 4 * sizeof (char) + sizeof (int)) >
829 	    (L->l_collate_bufsize - coll_offset))
830 		return ((ssize_t)-1);
831 
832 	/*
833 	 * If negative, set sign.
834 	 */
835 	if (number[i] == '-') {
836 		i++;
837 		sign = '0';
838 	}
839 
840 	/*
841 	 * Scan integer part; eat leading zeros.
842 	 */
843 	for (; i < length; i++) {
844 		if (IS_SEPARATOR(number[i]))
845 			continue;
846 
847 		if (number[i] == '0' && !(state & IN_NUMBER))
848 			continue;
849 
850 		if (!isdigit((uchar_t)number[i]))
851 			break;
852 
853 		state |= IN_NUMBER;
854 		if (sign == '0')
855 			digits[j++] = '0' + '9' - number[i];
856 		else
857 			digits[j++] = number[i];
858 	}
859 
860 	if (i < length && IS_DECIMAL(number[i])) {
861 		/*
862 		 * Integer part terminated by decimal.
863 		 */
864 		digits[j] = DECIMAL_CHAR;
865 		log_ten = j++;
866 
867 		/*
868 		 * Scan fractional part.
869 		 */
870 		for (++i; i < length; i++) {
871 			if (IS_SEPARATOR(number[i]))
872 				continue;
873 
874 			if (!isdigit((uchar_t)number[i]))
875 				break;
876 
877 			if (number[i] != '0')
878 				state |= IN_NUMBER;
879 
880 			if (sign == '0')
881 				digits[j++] = '0' + '9' - number[i];
882 			else
883 				digits[j++] = number[i];
884 		}
885 
886 		if (sign == '0')
887 			digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
888 	} else {
889 		/*
890 		 * Nondigit or end of string seen.
891 		 */
892 		log_ten = (int)j;
893 		if (sign == '0')
894 			digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
895 		else
896 			digits[j] = INTERFIELD_SEPARATOR;
897 	}
898 
899 	if ((state & IN_NUMBER) == 0) {
900 		/*
901 		 * A non-zero number was not detected; treat as defined zero.
902 		 */
903 		sign = '1';
904 		log_ten = 0;
905 		digits[0] = '0';
906 		j = 1;
907 	}
908 
909 	/*
910 	 * We subtract a constant from the log of negative values so that
911 	 * they will correctly precede positive values with a zero logarithm.
912 	 */
913 	if (sign == '0') {
914 		if (j != 0)
915 			log_ten = -log_ten - 2;
916 		else
917 			/*
918 			 * Special case for -0.
919 			 */
920 			log_ten = -1;
921 	}
922 
923 	buffer[0] = sign;
924 
925 	/*
926 	 * Place logarithm in big-endian form.
927 	 */
928 	for (i = 0; i < sizeof (int); i++)
929 		buffer[i + 1] = (log_ten << (i * NBBY))
930 		    >> ((sizeof (int) - 1) * NBBY);
931 
932 	if (j + sizeof (char) + sizeof (int) <
933 	    L->l_collate_bufsize - coll_offset)
934 		return (j + 1 + sizeof (int));
935 	else
936 		return ((ssize_t)-1);
937 }
938 
939 /*ARGSUSED*/
940 ssize_t
field_convert_numeric_wide(field_t * F,line_rec_t * L,vchar_t delimiter,ssize_t data_offset,ssize_t data_length,ssize_t coll_offset)941 field_convert_numeric_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
942     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
943 {
944 	wchar_t *number;
945 	wchar_t *buffer = L->l_collate.wp + coll_offset;
946 	char *lbuffer;
947 	ssize_t length;
948 
949 	wchar_t	sign = L'2';
950 	int log_ten;
951 	wchar_t	*digits = buffer + 1 + sizeof (int)/sizeof (wchar_t);
952 	size_t j = 0;
953 	size_t i;
954 
955 	int state = BEFORE_NUMBER;
956 
957 	number = L->l_data.wp + data_offset;
958 	length = data_length;
959 
960 	for (i = 0; i < length; i++)
961 		if (!W_IS_BLANK(number[i]))
962 			break;
963 
964 	if (((length - i) * sizeof (wchar_t) + 4 * sizeof (wchar_t) +
965 	    sizeof (int)) > (L->l_collate_bufsize - coll_offset))
966 		return ((ssize_t)-1);
967 
968 	if (number[i] == L'-') {
969 		i++;
970 		sign = L'0';
971 	}
972 
973 	for (; i < length; i++) {
974 		if (W_IS_SEPARATOR(number[i]))
975 			continue;
976 
977 		if (number[i] == L'0' && !(state & IN_NUMBER))
978 			continue;
979 
980 		if (!iswdigit(number[i]))
981 			break;
982 
983 		state |= IN_NUMBER;
984 		if (sign == L'0')
985 			digits[j++] = L'0' + L'9' - number[i];
986 		else
987 			digits[j++] = number[i];
988 	}
989 
990 	if (i < length && W_IS_DECIMAL(number[i])) {
991 		digits[j] = W_DECIMAL_CHAR;
992 		log_ten = j++;
993 
994 		for (++i; i < length; i++) {
995 			if (W_IS_SEPARATOR(number[i]))
996 				continue;
997 
998 			if (!iswdigit(number[i]))
999 				break;
1000 
1001 			if (number[i] != L'0')
1002 				state |= IN_NUMBER;
1003 
1004 			if (sign == L'0')
1005 				digits[j++] = L'0' + L'9' - number[i];
1006 			else
1007 				digits[j++] = number[i];
1008 		}
1009 
1010 		if (sign == L'0')
1011 			digits[j++] = (wchar_t)(WCHAR_MAX -
1012 			    W_INTERFIELD_SEPARATOR);
1013 	} else {
1014 		log_ten = (int)j;
1015 		if (sign == L'0')
1016 			digits[j++] = (wchar_t)(WCHAR_MAX -
1017 			    W_INTERFIELD_SEPARATOR);
1018 		else
1019 			digits[j] = W_INTERFIELD_SEPARATOR;
1020 	}
1021 
1022 	if ((state & IN_NUMBER) == 0) {
1023 		sign = L'1';
1024 		log_ten = 0;
1025 		digits[0] = L'0';
1026 		j = 1;
1027 	}
1028 
1029 	if (sign == L'0') {
1030 		if (j != 0)
1031 			log_ten = -log_ten - 2;
1032 		else
1033 			log_ten = -1;
1034 	}
1035 
1036 	buffer[0] = sign;
1037 	/*
1038 	 * Place logarithm in big-endian form.
1039 	 */
1040 	lbuffer = (char *)(buffer + 1);
1041 	for (i = 0; i < sizeof (int); i++)
1042 		lbuffer[i] = (log_ten << (i * NBBY))
1043 		    >> ((sizeof (int) - 1) * NBBY);
1044 
1045 	if ((j + 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) <
1046 	    L->l_collate_bufsize - coll_offset * sizeof (wchar_t))
1047 		return (j + 1 + sizeof (int) / sizeof (wchar_t));
1048 	else
1049 		return ((ssize_t)-1);
1050 }
1051 
1052 /*
1053  * flags contains one of CV_REALLOC, CV_FAIL, specifying the preferred behaviour
1054  * when coll_offset exceeds l_collate_bufsize.
1055  */
1056 ssize_t
field_convert(field_t * F,line_rec_t * L,int flags,vchar_t field_separator)1057 field_convert(field_t *F, line_rec_t *L, int flags, vchar_t field_separator)
1058 {
1059 	ssize_t coll_offset = 0;
1060 	ssize_t	start, end, distance;
1061 	field_t *cur_fieldp = F;
1062 
1063 	while (cur_fieldp != NULL) {
1064 		/*
1065 		 * delimit field
1066 		 */
1067 		if (!field_separator.sc)
1068 			field_delimit(cur_fieldp, L, &start, &end);
1069 		else
1070 			field_delimit_tabbed(cur_fieldp, L, &start, &end,
1071 			    field_separator);
1072 
1073 		distance = 0;
1074 		if (end - start > 0 ||
1075 		    (end - start == 0 && F->f_species == NUMERIC)) {
1076 			/*
1077 			 * Convert field, appending to collated field of line
1078 			 * record.
1079 			 */
1080 			distance = cur_fieldp->f_convert(cur_fieldp, L,
1081 			    field_separator, start, end - start, coll_offset);
1082 
1083 			/*
1084 			 * branch should execute comparatively rarely
1085 			 */
1086 			if (distance == -1) {
1087 				if (flags & FCV_REALLOC) {
1088 					ASSERT(L->l_collate_bufsize > 0);
1089 					L->l_collate_bufsize *= 2;
1090 					L->l_collate.sp =
1091 					    safe_realloc(L->l_collate.sp,
1092 					    L->l_collate_bufsize);
1093 
1094 					__S(stats_incr_convert_reallocs());
1095 					continue;
1096 				} else {
1097 					/*
1098 					 * FCV_FAIL has been set.
1099 					 */
1100 					return (-1);
1101 				}
1102 			}
1103 		}
1104 
1105 		if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
1106 			xstrninv(L->l_collate.sp, coll_offset, distance);
1107 			*(L->l_collate.sp + coll_offset + distance) =
1108 			    (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
1109 			distance++;
1110 		}
1111 
1112 		ASSERT(distance >= 0);
1113 		coll_offset += distance;
1114 		if (coll_offset >= L->l_collate_bufsize) {
1115 			if (flags & FCV_REALLOC) {
1116 				ASSERT(L->l_collate_bufsize > 0);
1117 				L->l_collate_bufsize *= 2;
1118 				L->l_collate.sp = safe_realloc(L->l_collate.sp,
1119 				    L->l_collate_bufsize);
1120 
1121 				__S(stats_incr_convert_reallocs());
1122 			} else {
1123 				return (-1);
1124 			}
1125 		}
1126 		*(L->l_collate.sp + coll_offset) = INTERFIELD_SEPARATOR;
1127 		coll_offset++;
1128 
1129 		cur_fieldp = cur_fieldp->f_next;
1130 	}
1131 
1132 	L->l_collate_length = coll_offset;
1133 
1134 	return (L->l_collate_length);
1135 }
1136 
1137 ssize_t
field_convert_wide(field_t * F,line_rec_t * L,int flags,vchar_t field_separator)1138 field_convert_wide(field_t *F, line_rec_t *L, int flags,
1139     vchar_t field_separator)
1140 {
1141 	ssize_t coll_offset = 0;
1142 	ssize_t	start, end, distance;
1143 	field_t *cur_fieldp = F;
1144 
1145 	while (cur_fieldp != NULL) {
1146 		if (!field_separator.wc)
1147 			field_delimit_wide(cur_fieldp, L, &start, &end);
1148 		else
1149 			field_delimit_tabbed_wide(cur_fieldp, L, &start, &end,
1150 			    field_separator);
1151 
1152 		distance = 0;
1153 		if (end - start > 0 ||
1154 		    end - start == 0 && F->f_species == NUMERIC) {
1155 			distance = cur_fieldp->f_convert(cur_fieldp, L,
1156 			    field_separator, start, end - start, coll_offset);
1157 
1158 			if (distance == -1) {
1159 				if (flags & FCV_REALLOC) {
1160 					ASSERT(L->l_collate_bufsize > 0);
1161 					L->l_collate_bufsize *= 2;
1162 					L->l_collate.wp = safe_realloc(
1163 					    L->l_collate.wp,
1164 					    L->l_collate_bufsize);
1165 
1166 					__S(stats_incr_convert_reallocs());
1167 					continue;
1168 				} else {
1169 					return (-1);
1170 				}
1171 			}
1172 		}
1173 
1174 		if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
1175 			xwcsninv(L->l_collate.wp, coll_offset, distance);
1176 			*(L->l_collate.wp + coll_offset + distance) =
1177 			    WCHAR_MAX - INTERFIELD_SEPARATOR;
1178 			distance++;
1179 		}
1180 
1181 		ASSERT(distance >= 0);
1182 		coll_offset += distance;
1183 		if (coll_offset * sizeof (wchar_t) >= L->l_collate_bufsize) {
1184 			if (flags & FCV_REALLOC) {
1185 				ASSERT(L->l_collate_bufsize > 0);
1186 				L->l_collate_bufsize *= 2;
1187 				L->l_collate.wp = safe_realloc(L->l_collate.wp,
1188 				    L->l_collate_bufsize);
1189 
1190 				__S(stats_incr_convert_reallocs());
1191 			} else {
1192 				return (-1);
1193 			}
1194 		}
1195 		*(L->l_collate.wp + coll_offset) = W_INTERFIELD_SEPARATOR;
1196 		coll_offset++;
1197 
1198 		cur_fieldp = cur_fieldp->f_next;
1199 	}
1200 
1201 	L->l_collate_length = coll_offset * sizeof (wchar_t);
1202 #ifdef _LITTLE_ENDIAN
1203 	xwcsntomsb(L->l_collate.wp, coll_offset);
1204 #endif /* _LITTLE_ENDIAN */
1205 
1206 	return (L->l_collate_length);
1207 }
1208 
1209 /*
1210  * line_convert() and line_convert_wide() are called when the collation vector
1211  * of a given line has been exhausted, and we are performing the final,
1212  * full-line comparison required by the sort specification.  Because we do not
1213  * have a guarantee that l_data is null-terminated, we create an explicitly
1214  * null-terminated copy suitable for transformation to a collatable form for the
1215  * current locale.
1216  */
1217 static void
line_convert(line_rec_t * L)1218 line_convert(line_rec_t *L)
1219 {
1220 	static ssize_t bufsize;
1221 	static char *buffer;
1222 
1223 	if (L->l_raw_collate.sp != NULL)
1224 		return;
1225 
1226 	if (L->l_data_length + 1 > bufsize) {
1227 		buffer = safe_realloc(buffer, L->l_data_length + 1);
1228 		bufsize = L->l_data_length + 1;
1229 	}
1230 
1231 	(void) strncpy(buffer, L->l_data.sp, L->l_data_length);
1232 	buffer[L->l_data_length] = '\0';
1233 
1234 	L->l_raw_collate.sp = safe_realloc(L->l_raw_collate.sp,
1235 	    xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
1236 	xfrm_ops->sx_xfrm(L->l_raw_collate.sp, buffer,
1237 	    xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
1238 
1239 	__S(stats_incr_line_conversions());
1240 }
1241 
1242 static void
line_convert_wide(line_rec_t * L)1243 line_convert_wide(line_rec_t *L)
1244 {
1245 	static wchar_t *buffer;
1246 	static ssize_t bufsize;
1247 
1248 	ssize_t dlength;
1249 
1250 	if (L->l_raw_collate.wp != NULL)
1251 		return;
1252 
1253 	if (L->l_data_length + 1 > bufsize) {
1254 		buffer = safe_realloc(buffer, (L->l_data_length + 1) *
1255 		    sizeof (wchar_t));
1256 		bufsize = L->l_data_length + 1;
1257 	}
1258 
1259 	(void) wcsncpy(buffer, L->l_data.wp, L->l_data_length);
1260 	buffer[L->l_data_length] = L'\0';
1261 
1262 	dlength = wcsxfrm(NULL, buffer, 0) + 1;
1263 	L->l_raw_collate.wp = safe_realloc(L->l_raw_collate.wp, dlength *
1264 	    sizeof (wchar_t));
1265 	(void) wcsxfrm(L->l_raw_collate.wp, buffer, dlength);
1266 
1267 	__S(stats_incr_line_conversions());
1268 }
1269 
1270 /*
1271  * Our convention for collation is
1272  *
1273  *	A > B  => r > 0,
1274  *	A == B => r = 0,
1275  *	A < B  => r < 0
1276  *
1277  * This convention is consistent with the definition of memcmp(), strcmp(), and
1278  * strncmp() in the C locale.  collated() and collated_wide() have two optional
1279  * behaviours, which can be activated by setting the appropriate values in
1280  * coll_flag:  COLL_UNIQUE, which returns 0 if the l_collate fields of the line
1281  * records being compared are identical; COLL_DATA_ONLY, which ignores the
1282  * l_collate field for the current comparison; and COLL_REVERSE, which flips the
1283  * result for comparisons that fall through to an actual data comparison (since
1284  * the collated vector should already reflect reverse ordering from field
1285  * conversion).
1286  */
1287 int
collated(line_rec_t * A,line_rec_t * B,ssize_t depth,flag_t coll_flag)1288 collated(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
1289 {
1290 	ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
1291 	int r;
1292 	int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
1293 	    INT_SIGN_PASS_MASK;
1294 	ssize_t la, lb;
1295 
1296 	if (!(coll_flag & COLL_DATA_ONLY)) {
1297 		if (ml > 0) {
1298 			r = memcmp(A->l_collate.sp + depth,
1299 			    B->l_collate.sp + depth, ml);
1300 
1301 			if (r)
1302 				return (r);
1303 		}
1304 
1305 		if (A->l_collate_length < B->l_collate_length)
1306 			return (-1);
1307 
1308 		if (A->l_collate_length > B->l_collate_length)
1309 			return (1);
1310 	}
1311 
1312 	/*
1313 	 * This is where we cut out, if we know that the current sort is over
1314 	 * the entire line.
1315 	 */
1316 	if (coll_flag & COLL_UNIQUE)
1317 		return (0);
1318 
1319 	line_convert(A);
1320 	line_convert(B);
1321 
1322 	la = strlen(A->l_raw_collate.sp);
1323 	lb = strlen(B->l_raw_collate.sp);
1324 
1325 	r = memcmp(A->l_raw_collate.sp, B->l_raw_collate.sp, MIN(la, lb));
1326 
1327 	if (r)
1328 		return (r ^ mask);
1329 
1330 	if (la < lb)
1331 		return (-1 ^ mask);
1332 
1333 	if (la > lb)
1334 		return (1 ^ mask);
1335 
1336 	return (0);
1337 }
1338 
1339 int
collated_wide(line_rec_t * A,line_rec_t * B,ssize_t depth,flag_t coll_flag)1340 collated_wide(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
1341 {
1342 	ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
1343 	int r;
1344 	int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
1345 	    INT_SIGN_PASS_MASK;
1346 	ssize_t la, lb;
1347 
1348 	if (!(coll_flag & COLL_DATA_ONLY)) {
1349 		if (ml > 0) {
1350 			r = memcmp(A->l_collate.sp + depth,
1351 			    B->l_collate.sp + depth, ml);
1352 
1353 			if (r)
1354 				return (r);
1355 		}
1356 		if (A->l_collate_length < B->l_collate_length)
1357 			return (-1);
1358 
1359 		if (A->l_collate_length > B->l_collate_length)
1360 			return (1);
1361 	}
1362 
1363 	if (coll_flag & COLL_UNIQUE)
1364 		return (0);
1365 
1366 	line_convert_wide(A);
1367 	line_convert_wide(B);
1368 
1369 	la = wcslen(A->l_raw_collate.wp);
1370 	lb = wcslen(B->l_raw_collate.wp);
1371 
1372 	r = wmemcmp(A->l_raw_collate.wp, B->l_raw_collate.wp,
1373 	    (size_t)MIN(la, lb));
1374 
1375 	if (r)
1376 		return (r ^ mask);
1377 
1378 	if (la < lb)
1379 		return (-1 ^ mask);
1380 
1381 	if (la > lb)
1382 		return (1 ^ mask);
1383 
1384 	return (0);
1385 }
1386