xref: /illumos-gate/usr/src/cmd/localedef/collate.c (revision bc09504f)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
14  */
15 
16 /*
17  * LC_COLLATE database generation routines for localedef.
18  */
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <errno.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <sys/avl.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <wchar.h>
29 #include <widec.h>
30 #include <limits.h>
31 #include "localedef.h"
32 #include "parser.tab.h"
33 #include "collatefile.h"
34 
35 /*
36  * Design notes.
37  *
38  * It will be extremely helpful to the reader if they have access to
39  * the localedef and locale file format specifications available.
40  * Latest versions of these are available from www.opengroup.org.
41  *
42  * The design for the collation code is a bit complex.  The goal is a
43  * single collation database as described in collate.h (in
44  * libc/port/locale).  However, there are some other tidbits:
45  *
46  * a) The substitution entries are now a directly indexable array.  A
47  * priority elsewhere in the table is taken as an index into the
48  * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
49  * set.  (The bit is cleared and the result is the index into the
50  * table.
51  *
52  * b) We eliminate duplicate entries into the substitution table.
53  * This saves a lot of space.
54  *
55  * c) The priorities for each level are "compressed", so that each
56  * sorting level has consecutively numbered priorities starting at 1.
57  * (O is reserved for the ignore priority.)  This means sort levels
58  * which only have a few distinct priorities can represent the
59  * priority level in fewer bits, which makes the strxfrm output
60  * smaller.
61  *
62  * d) We record the total number of priorities so that strxfrm can
63  * figure out how many bytes to expand a numeric priority into.
64  *
65  * e) For the UNDEFINED pass (the last pass), we record the maximum
66  * number of bits needed to uniquely prioritize these entries, so that
67  * the last pass can also use smaller strxfrm output when possible.
68  *
69  * f) Priorities with the sign bit set are verboten.  This works out
70  * because no active character set needs that bit to carry significant
71  * information once the character is in wide form.
72  *
73  * To process the entire data to make the database, we actually run
74  * multiple passes over the data.
75  *
76  * The first pass, which is done at parse time, identifies elements,
77  * substitutions, and such, and records them in priority order.  As
78  * some priorities can refer to other priorities, using forward
79  * references, we use a table of references indicating whether the
80  * priority's value has been resolved, or whether it is still a
81  * reference.
82  *
83  * The second pass walks over all the items in priority order, noting
84  * that they are used directly, and not just an indirect reference.
85  * This is done by creating a "weight" structure for the item.  The
86  * weights are stashed in an AVL tree sorted by relative "priority".
87  *
88  * The third pass walks over all the weight structures, in priority
89  * order, and assigns a new monotonically increasing (per sort level)
90  * weight value to them.  These are the values that will actually be
91  * written to the file.
92  *
93  * The fourth pass just writes the data out.
94  */
95 
96 /*
97  * In order to resolve the priorities, we create a table of priorities.
98  * Entries in the table can be in one of three states.
99  *
100  * UNKNOWN is for newly allocated entries, and indicates that nothing
101  * is known about the priority.  (For example, when new entries are created
102  * for collating-symbols, this is the value assigned for them until the
103  * collating symbol's order has been determined.
104  *
105  * RESOLVED is used for an entry where the priority indicates the final
106  * numeric weight.
107  *
108  * REFER is used for entries that reference other entries.  Typically
109  * this is used for forward references.  A collating-symbol can never
110  * have this value.
111  *
112  * The "pass" field is used during final resolution to aid in detection
113  * of referencing loops.  (For example <A> depends on <B>, but <B> has its
114  * priority dependent on <A>.)
115  */
116 typedef enum {
117 	UNKNOWN,	/* priority is totally unknown */
118 	RESOLVED,	/* priority value fully resolved */
119 	REFER		/* priority is a reference (index) */
120 } res_t;
121 
122 typedef struct weight {
123 	int32_t		pri;
124 	int		opt;
125 	avl_node_t	avl;
126 } weight_t;
127 
128 typedef struct priority {
129 	res_t		res;
130 	int32_t		pri;
131 	int		pass;
132 	int		lineno;
133 } collpri_t;
134 
135 #define	NUM_WT	collinfo.directive_count
136 
137 /*
138  * These are the abstract collating symbols, which are just a symbolic
139  * way to reference a priority.
140  */
141 struct collsym {
142 	char		*name;
143 	int32_t		ref;
144 	avl_node_t	avl;
145 };
146 
147 /*
148  * These are also abstract collating symbols, but we allow them to have
149  * different priorities at different levels.
150  */
151 typedef struct collundef {
152 	char		*name;
153 	int32_t		ref[COLL_WEIGHTS_MAX];
154 	avl_node_t	avl;
155 } collundef_t;
156 
157 /*
158  * These are called "chains" in libc.  This records the fact that two
159  * more characters should be treated as a single collating entity when
160  * they appear together.  For example, in Spanish <C><h> gets collated
161  * as a character between <C> and <D>.
162  */
163 struct collelem {
164 	char		*symbol;
165 	wchar_t		*expand;
166 	int32_t		ref[COLL_WEIGHTS_MAX];
167 	avl_node_t	avl_bysymbol;
168 	avl_node_t	avl_byexpand;
169 };
170 
171 /*
172  * Individual characters have a sequence of weights as well.
173  */
174 typedef struct collchar {
175 	wchar_t		wc;
176 	int32_t		ref[COLL_WEIGHTS_MAX];
177 	avl_node_t	avl;
178 } collchar_t;
179 
180 /*
181  * Substitution entries.  The key is itself a priority.  Note that
182  * when we create one of these, we *automatically* wind up with a
183  * fully resolved priority for the key, because creation of
184  * substitutions creates a resolved priority at the same time.
185  */
186 typedef struct {
187 	int32_t		key;
188 	int32_t		ref[COLLATE_STR_LEN];
189 	avl_node_t	avl;
190 	avl_node_t	avl_ref;
191 } subst_t;
192 
193 static avl_tree_t	collsyms;
194 static avl_tree_t	collundefs;
195 static avl_tree_t	elem_by_symbol;
196 static avl_tree_t	elem_by_expand;
197 static avl_tree_t	collchars;
198 static avl_tree_t	substs[COLL_WEIGHTS_MAX];
199 static avl_tree_t	substs_ref[COLL_WEIGHTS_MAX];
200 static avl_tree_t	weights[COLL_WEIGHTS_MAX];
201 static int32_t		nweight[COLL_WEIGHTS_MAX];
202 
203 /*
204  * This is state tracking for the ellipsis token.  Note that we start
205  * the initial values so that the ellipsis logic will think we got a
206  * magic starting value of NUL.  It starts at minus one because the
207  * starting point is exclusive -- i.e. the starting point is not
208  * itself handled by the ellipsis code.
209  */
210 static int currorder = EOF;
211 static int lastorder = EOF;
212 static collelem_t *currelem;
213 static collchar_t *currchar;
214 static collundef_t *currundef;
215 static wchar_t ellipsis_start = 0;
216 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
217 
218 /*
219  * We keep a running tally of weights.
220  */
221 static int nextpri = 1;
222 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
223 
224 /*
225  * This array collects up the weights for each level.
226  */
227 static int32_t order_weights[COLL_WEIGHTS_MAX];
228 static int curr_weight = 0;
229 static int32_t subst_weights[COLLATE_STR_LEN];
230 static int curr_subst = 0;
231 
232 /*
233  * Some initial priority values.
234  */
235 static int32_t pri_undefined[COLL_WEIGHTS_MAX];
236 static int32_t pri_ignore;
237 
238 static collate_info_t collinfo;
239 
240 static collpri_t	*prilist = NULL;
241 static int		numpri = 0;
242 static int		maxpri = 0;
243 
244 static void start_order(int);
245 
246 static int32_t
247 new_pri(void)
248 {
249 	int i;
250 
251 	if (numpri >= maxpri) {
252 		maxpri = maxpri ? maxpri * 2 : 1024;
253 		prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
254 		if (prilist == NULL) {
255 			errf(_("out of memory"));
256 			return (-1);
257 		}
258 		for (i = numpri; i < maxpri; i++) {
259 			prilist[i].res = UNKNOWN;
260 			prilist[i].pri = 0;
261 			prilist[i].pass = 0;
262 		}
263 	}
264 	return (numpri++);
265 }
266 
267 static collpri_t *
268 get_pri(int32_t ref)
269 {
270 	if ((ref < 0) || (ref > numpri)) {
271 		INTERR;
272 		return (NULL);
273 	}
274 	return (&prilist[ref]);
275 }
276 
277 static void
278 set_pri(int32_t ref, int32_t v, res_t res)
279 {
280 	collpri_t	*pri;
281 
282 	pri = get_pri(ref);
283 
284 	if ((res == REFER) && ((v < 0) || (v >= numpri))) {
285 		INTERR;
286 	}
287 
288 	/* Resolve self references */
289 	if ((res == REFER) && (ref == v)) {
290 		v = nextpri;
291 		res = RESOLVED;
292 	}
293 
294 	if (pri->res != UNKNOWN) {
295 		warn(_("repeated item in order list (first on %d)"),
296 		    pri->lineno);
297 		return;
298 	}
299 	pri->lineno = lineno;
300 	pri->pri = v;
301 	pri->res = res;
302 }
303 
304 static int32_t
305 resolve_pri(int32_t ref)
306 {
307 	collpri_t	*pri;
308 	static int32_t	pass = 0;
309 
310 	pri = get_pri(ref);
311 	pass++;
312 	while (pri->res == REFER) {
313 		if (pri->pass == pass) {
314 			/* report a line with the circular symbol */
315 			lineno = pri->lineno;
316 			errf(_("circular reference in order list"));
317 			return (-1);
318 		}
319 		if ((pri->pri < 0) || (pri->pri >= numpri)) {
320 			INTERR;
321 			return (-1);
322 		}
323 		pri->pass = pass;
324 		pri = &prilist[pri->pri];
325 	}
326 
327 	if (pri->res == UNKNOWN) {
328 		return (-1);
329 	}
330 	if (pri->res != RESOLVED)
331 		INTERR;
332 
333 	return (pri->pri);
334 }
335 
336 static int
337 weight_compare(const void *n1, const void *n2)
338 {
339 	int32_t	k1 = ((const weight_t *)n1)->pri;
340 	int32_t	k2 = ((const weight_t *)n2)->pri;
341 
342 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
343 }
344 
345 static int
346 collsym_compare(const void *n1, const void *n2)
347 {
348 	const collsym_t *c1 = n1;
349 	const collsym_t *c2 = n2;
350 	int rv;
351 
352 	rv = strcmp(c1->name, c2->name);
353 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
354 }
355 
356 static int
357 collundef_compare(const void *n1, const void *n2)
358 {
359 	const collundef_t *c1 = n1;
360 	const collundef_t *c2 = n2;
361 	int rv;
362 
363 	rv = strcmp(c1->name, c2->name);
364 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
365 }
366 
367 static int
368 element_compare_symbol(const void *n1, const void *n2)
369 {
370 	const collelem_t *c1 = n1;
371 	const collelem_t *c2 = n2;
372 	int rv;
373 
374 	rv = strcmp(c1->symbol, c2->symbol);
375 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
376 }
377 
378 static int
379 element_compare_expand(const void *n1, const void *n2)
380 {
381 	const collelem_t *c1 = n1;
382 	const collelem_t *c2 = n2;
383 	int rv;
384 
385 	rv = wcscmp(c1->expand, c2->expand);
386 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
387 }
388 
389 static int
390 collchar_compare(const void *n1, const void *n2)
391 {
392 	wchar_t	k1 = ((const collchar_t *)n1)->wc;
393 	wchar_t	k2 = ((const collchar_t *)n2)->wc;
394 
395 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
396 }
397 
398 static int
399 subst_compare(const void *n1, const void *n2)
400 {
401 	int32_t	k1 = ((const subst_t *)n1)->key;
402 	int32_t	k2 = ((const subst_t *)n2)->key;
403 
404 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
405 }
406 
407 static int
408 subst_compare_ref(const void *n1, const void *n2)
409 {
410 	int32_t *c1 = ((subst_t *)n1)->ref;
411 	int32_t *c2 = ((subst_t *)n2)->ref;
412 	int rv;
413 
414 	rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
415 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
416 }
417 
418 void
419 init_collate(void)
420 {
421 	int i;
422 
423 	avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
424 	    offsetof(collsym_t, avl));
425 
426 	avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
427 	    offsetof(collundef_t, avl));
428 
429 	avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
430 	    offsetof(collelem_t, avl_bysymbol));
431 	avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
432 	    offsetof(collelem_t, avl_byexpand));
433 
434 	avl_create(&collchars, collchar_compare, sizeof (collchar_t),
435 	    offsetof(collchar_t, avl));
436 
437 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
438 		avl_create(&substs[i], subst_compare, sizeof (subst_t),
439 		    offsetof(subst_t, avl));
440 		avl_create(&substs_ref[i], subst_compare_ref,
441 		    sizeof (subst_t), offsetof(subst_t, avl_ref));
442 		avl_create(&weights[i], weight_compare, sizeof (weight_t),
443 		    offsetof(weight_t, avl));
444 		nweight[i] = 1;
445 	}
446 
447 	(void) memset(&collinfo, 0, sizeof (collinfo));
448 
449 	/* allocate some initial priorities */
450 	pri_ignore = new_pri();
451 
452 	set_pri(pri_ignore, 0, RESOLVED);
453 
454 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
455 		pri_undefined[i] = new_pri();
456 
457 		/* we will override this later */
458 		set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
459 	}
460 }
461 
462 void
463 define_collsym(char *name)
464 {
465 	collsym_t	*sym;
466 	avl_index_t	where;
467 
468 	if ((sym = calloc(sizeof (*sym), 1)) == NULL) {
469 		errf(_("out of memory"));
470 		return;
471 	}
472 	sym->name = name;
473 	sym->ref = new_pri();
474 
475 	if (avl_find(&collsyms, sym, &where) != NULL) {
476 		/*
477 		 * This should never happen because we are only called
478 		 * for undefined symbols.
479 		 */
480 		INTERR;
481 		return;
482 	}
483 	avl_insert(&collsyms, sym, where);
484 }
485 
486 collsym_t *
487 lookup_collsym(char *name)
488 {
489 	collsym_t	srch;
490 
491 	srch.name = name;
492 	return (avl_find(&collsyms, &srch, NULL));
493 }
494 
495 collelem_t *
496 lookup_collelem(char *symbol)
497 {
498 	collelem_t	srch;
499 
500 	srch.symbol = symbol;
501 	return (avl_find(&elem_by_symbol, &srch, NULL));
502 }
503 
504 static collundef_t *
505 get_collundef(char *name)
506 {
507 	collundef_t	srch;
508 	collundef_t	*ud;
509 	avl_index_t	where;
510 	int		i;
511 
512 	srch.name = name;
513 	if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
514 		if (((ud = calloc(sizeof (*ud), 1)) == NULL) ||
515 		    ((ud->name = strdup(name)) == NULL)) {
516 			errf(_("out of memory"));
517 			return (NULL);
518 		}
519 		for (i = 0; i < NUM_WT; i++) {
520 			ud->ref[i] = new_pri();
521 		}
522 		avl_insert(&collundefs, ud, where);
523 	}
524 	add_charmap_undefined(name);
525 	return (ud);
526 }
527 
528 static collchar_t *
529 get_collchar(wchar_t wc, int create)
530 {
531 	collchar_t	srch;
532 	collchar_t	*cc;
533 	avl_index_t	where;
534 	int		i;
535 
536 	srch.wc = wc;
537 	cc = avl_find(&collchars, &srch, &where);
538 	if ((cc == NULL) && create) {
539 		if ((cc = calloc(sizeof (*cc), 1)) == NULL) {
540 			errf(_("out of memory"));
541 			return (NULL);
542 		}
543 		for (i = 0; i < NUM_WT; i++) {
544 			cc->ref[i] = new_pri();
545 		}
546 		cc->wc = wc;
547 		avl_insert(&collchars, cc, where);
548 	}
549 	return (cc);
550 }
551 
552 void
553 end_order_collsym(collsym_t *sym)
554 {
555 	start_order(T_COLLSYM);
556 	/* update the weight */
557 
558 	set_pri(sym->ref, nextpri, RESOLVED);
559 	nextpri++;
560 }
561 
562 void
563 end_order(void)
564 {
565 	int		i;
566 	int32_t		pri;
567 	int32_t		ref;
568 	collpri_t	*p;
569 
570 	/* advance the priority/weight */
571 	pri = nextpri;
572 
573 	switch (currorder) {
574 	case T_CHAR:
575 		for (i = 0; i < NUM_WT; i++) {
576 			if (((ref = order_weights[i]) < 0) ||
577 			    ((p = get_pri(ref)) == NULL) ||
578 			    (p->pri == -1)) {
579 				/* unspecified weight is a self reference */
580 				set_pri(currchar->ref[i], pri, RESOLVED);
581 			} else {
582 				set_pri(currchar->ref[i], ref, REFER);
583 			}
584 			order_weights[i] = -1;
585 		}
586 
587 		/* leave a cookie trail in case next symbol is ellipsis */
588 		ellipsis_start = currchar->wc + 1;
589 		currchar = NULL;
590 		break;
591 
592 	case T_ELLIPSIS:
593 		/* save off the weights were we can find them */
594 		for (i = 0; i < NUM_WT; i++) {
595 			ellipsis_weights[i] = order_weights[i];
596 			order_weights[i] = -1;
597 		}
598 		break;
599 
600 	case T_COLLELEM:
601 		if (currelem == NULL) {
602 			INTERR;
603 		} else {
604 			for (i = 0; i < NUM_WT; i++) {
605 
606 				if (((ref = order_weights[i]) < 0) ||
607 				    ((p = get_pri(ref)) == NULL) ||
608 				    (p->pri == -1)) {
609 					set_pri(currelem->ref[i], pri,
610 					    RESOLVED);
611 				} else {
612 					set_pri(currelem->ref[i], ref, REFER);
613 				}
614 				order_weights[i] = -1;
615 			}
616 		}
617 		break;
618 
619 	case T_UNDEFINED:
620 		for (i = 0; i < NUM_WT; i++) {
621 			if (((ref = order_weights[i]) < 0) ||
622 			    ((p = get_pri(ref)) == NULL) ||
623 			    (p->pri == -1)) {
624 				set_pri(pri_undefined[i], -1, RESOLVED);
625 			} else {
626 				set_pri(pri_undefined[i], ref, REFER);
627 			}
628 			order_weights[i] = -1;
629 		}
630 		break;
631 
632 	case T_SYMBOL:
633 		for (i = 0; i < NUM_WT; i++) {
634 			if (((ref = order_weights[i]) < 0) ||
635 			    ((p = get_pri(ref)) == NULL) ||
636 			    (p->pri == -1)) {
637 				set_pri(currundef->ref[i], pri, RESOLVED);
638 			} else {
639 				set_pri(currundef->ref[i], ref, REFER);
640 			}
641 			order_weights[i] = -1;
642 		}
643 		break;
644 
645 	default:
646 		INTERR;
647 	}
648 
649 	nextpri++;
650 }
651 
652 static void
653 start_order(int type)
654 {
655 	int	i;
656 
657 	lastorder = currorder;
658 	currorder = type;
659 
660 	/* this is used to protect ELLIPSIS processing */
661 	if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
662 		errf(_("character value expected"));
663 	}
664 
665 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
666 		order_weights[i] = -1;
667 	}
668 	curr_weight = 0;
669 }
670 
671 void
672 start_order_undefined(void)
673 {
674 	start_order(T_UNDEFINED);
675 }
676 
677 void
678 start_order_symbol(char *name)
679 {
680 	currundef = get_collundef(name);
681 	start_order(T_SYMBOL);
682 }
683 
684 void
685 start_order_char(wchar_t wc)
686 {
687 	collchar_t	*cc;
688 	int32_t		ref;
689 
690 	start_order(T_CHAR);
691 
692 	/*
693 	 * If we last saw an ellipsis, then we need to close the range.
694 	 * Handle that here.  Note that we have to be careful because the
695 	 * items *inside* the range are treated exclusiveley to the items
696 	 * outside of the range.  The ends of the range can have quite
697 	 * different weights than the range members.
698 	 */
699 	if (lastorder == T_ELLIPSIS) {
700 		int		i;
701 
702 		if (wc < ellipsis_start) {
703 			errf(_("malformed range!"));
704 			return;
705 		}
706 		while (ellipsis_start < wc) {
707 			/*
708 			 * pick all of the saved weights for the
709 			 * ellipsis.  note that -1 encodes for the
710 			 * ellipsis itself, which means to take the
711 			 * current relative priority.
712 			 */
713 			if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
714 				INTERR;
715 				return;
716 			}
717 			for (i = 0; i < NUM_WT; i++) {
718 				collpri_t *p;
719 				if (((ref = ellipsis_weights[i]) == -1) ||
720 				    ((p = get_pri(ref)) == NULL) ||
721 				    (p->pri == -1)) {
722 					set_pri(cc->ref[i], nextpri, RESOLVED);
723 				} else {
724 					set_pri(cc->ref[i], ref, REFER);
725 				}
726 				ellipsis_weights[i] = 0;
727 			}
728 			ellipsis_start++;
729 			nextpri++;
730 		}
731 	}
732 
733 	currchar = get_collchar(wc, 1);
734 }
735 
736 void
737 start_order_collelem(collelem_t *e)
738 {
739 	start_order(T_COLLELEM);
740 	currelem = e;
741 }
742 
743 void
744 start_order_ellipsis(void)
745 {
746 	int	i;
747 
748 	start_order(T_ELLIPSIS);
749 
750 	if (lastorder != T_CHAR) {
751 		errf(_("illegal starting point for range"));
752 		return;
753 	}
754 
755 	for (i = 0; i < NUM_WT; i++) {
756 		ellipsis_weights[i] = order_weights[i];
757 	}
758 }
759 
760 void
761 define_collelem(char *name, wchar_t *wcs)
762 {
763 	collelem_t	*e;
764 	avl_index_t	where1;
765 	avl_index_t	where2;
766 	int		i;
767 
768 	if (wcslen(wcs) >= COLLATE_STR_LEN) {
769 		errf(_("expanded collation element too long"));
770 		return;
771 	}
772 
773 	if ((e = calloc(sizeof (*e), 1)) == NULL) {
774 		errf(_("out of memory"));
775 		return;
776 	}
777 	e->expand = wcs;
778 	e->symbol = name;
779 
780 	/*
781 	 * This is executed before the order statement, so we don't
782 	 * know how many priorities we *really* need.  We allocate one
783 	 * for each possible weight.  Not a big deal, as collating-elements
784 	 * prove to be quite rare.
785 	 */
786 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
787 		e->ref[i] = new_pri();
788 	}
789 
790 	/* A character sequence can only reduce to one element. */
791 	if ((avl_find(&elem_by_symbol, e, &where1) != NULL) ||
792 	    (avl_find(&elem_by_expand, e, &where2) != NULL)) {
793 		errf(_("duplicate collating element definition"));
794 		return;
795 	}
796 	avl_insert(&elem_by_symbol, e, where1);
797 	avl_insert(&elem_by_expand, e, where2);
798 }
799 
800 void
801 add_order_bit(int kw)
802 {
803 	uint8_t bit = DIRECTIVE_UNDEF;
804 
805 	switch (kw) {
806 	case T_FORWARD:
807 		bit = DIRECTIVE_FORWARD;
808 		break;
809 	case T_BACKWARD:
810 		bit = DIRECTIVE_BACKWARD;
811 		break;
812 	case T_POSITION:
813 		bit = DIRECTIVE_POSITION;
814 		break;
815 	default:
816 		INTERR;
817 		break;
818 	}
819 	collinfo.directive[collinfo.directive_count] |= bit;
820 }
821 
822 void
823 add_order_directive(void)
824 {
825 	if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
826 		errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX);
827 	}
828 	collinfo.directive_count++;
829 }
830 
831 static void
832 add_order_pri(int32_t ref)
833 {
834 	if (curr_weight >= NUM_WT) {
835 		errf(_("too many weights (max %d)"), NUM_WT);
836 		return;
837 	}
838 	order_weights[curr_weight] = ref;
839 	curr_weight++;
840 }
841 
842 void
843 add_order_collsym(collsym_t *s)
844 {
845 	add_order_pri(s->ref);
846 }
847 
848 void
849 add_order_char(wchar_t wc)
850 {
851 	collchar_t *cc;
852 
853 	if ((cc = get_collchar(wc, 1)) == NULL) {
854 		INTERR;
855 		return;
856 	}
857 
858 	add_order_pri(cc->ref[curr_weight]);
859 }
860 
861 void
862 add_order_collelem(collelem_t *e)
863 {
864 	add_order_pri(e->ref[curr_weight]);
865 }
866 
867 void
868 add_order_ignore(void)
869 {
870 	add_order_pri(pri_ignore);
871 }
872 
873 void
874 add_order_symbol(char *sym)
875 {
876 	collundef_t *c;
877 	if ((c = get_collundef(sym)) == NULL) {
878 		INTERR;
879 		return;
880 	}
881 	add_order_pri(c->ref[curr_weight]);
882 }
883 
884 void
885 add_order_ellipsis(void)
886 {
887 	/* special 0 value indicates self reference */
888 	add_order_pri(0);
889 }
890 
891 void
892 add_order_subst(void)
893 {
894 	subst_t srch;
895 	subst_t	*s;
896 	avl_index_t where;
897 	int i;
898 
899 	(void) memset(&srch, 0, sizeof (srch));
900 	for (i = 0; i < curr_subst; i++) {
901 		srch.ref[i] = subst_weights[i];
902 		subst_weights[i] = 0;
903 	}
904 	s = avl_find(&substs_ref[curr_weight], &srch, &where);
905 
906 	if (s == NULL) {
907 		if ((s = calloc(sizeof (*s), 1)) == NULL) {
908 			errf(_("out of memory"));
909 			return;
910 		}
911 		s->key = new_pri();
912 
913 		/*
914 		 * We use a self reference for our key, but we set a
915 		 * high bit to indicate that this is a substitution
916 		 * reference.  This will expedite table lookups later,
917 		 * and prevent table lookups for situations that don't
918 		 * require it.  (In short, its a big win, because we
919 		 * can skip a lot of binary searching.)
920 		 */
921 		set_pri(s->key,
922 		    (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
923 		    RESOLVED);
924 		nextsubst[curr_weight] += 1;
925 
926 		for (i = 0; i < curr_subst; i++) {
927 			s->ref[i] = srch.ref[i];
928 		}
929 
930 		avl_insert(&substs_ref[curr_weight], s, where);
931 
932 		if (avl_find(&substs[curr_weight], s, &where) != NULL) {
933 			INTERR;
934 			return;
935 		}
936 		avl_insert(&substs[curr_weight], s, where);
937 	}
938 	curr_subst = 0;
939 
940 
941 	/*
942 	 * We are using the current (unique) priority as a search key
943 	 * in the substitution table.
944 	 */
945 	add_order_pri(s->key);
946 }
947 
948 static void
949 add_subst_pri(int32_t ref)
950 {
951 	if (curr_subst >= COLLATE_STR_LEN) {
952 		errf(_("substitution string is too long"));
953 		return;
954 	}
955 	subst_weights[curr_subst] = ref;
956 	curr_subst++;
957 }
958 
959 void
960 add_subst_char(wchar_t wc)
961 {
962 	collchar_t *cc;
963 
964 
965 	if (((cc = get_collchar(wc, 1)) == NULL) ||
966 	    (cc->wc != wc)) {
967 		INTERR;
968 		return;
969 	}
970 	/* we take the weight for the character at that position */
971 	add_subst_pri(cc->ref[curr_weight]);
972 }
973 
974 void
975 add_subst_collelem(collelem_t *e)
976 {
977 	add_subst_pri(e->ref[curr_weight]);
978 }
979 
980 void
981 add_subst_collsym(collsym_t *s)
982 {
983 	add_subst_pri(s->ref);
984 }
985 
986 void
987 add_subst_symbol(char *ptr)
988 {
989 	collundef_t *cu;
990 
991 	if ((cu = get_collundef(ptr)) != NULL) {
992 		add_subst_pri(cu->ref[curr_weight]);
993 	}
994 }
995 
996 void
997 add_weight(int32_t ref, int pass)
998 {
999 	weight_t srch;
1000 	weight_t *w;
1001 	avl_index_t where;
1002 
1003 	srch.pri = resolve_pri(ref);
1004 
1005 	/* No translation of ignores */
1006 	if (srch.pri == 0)
1007 		return;
1008 
1009 	/* Substitution priorities are not weights */
1010 	if (srch.pri & COLLATE_SUBST_PRIORITY)
1011 		return;
1012 
1013 	if (avl_find(&weights[pass], &srch, &where) != NULL)
1014 		return;
1015 
1016 	if ((w = calloc(sizeof (*w), 1)) == NULL) {
1017 		errf(_("out of memory"));
1018 		return;
1019 	}
1020 	w->pri = srch.pri;
1021 	avl_insert(&weights[pass], w, where);
1022 }
1023 
1024 void
1025 add_weights(int32_t *refs)
1026 {
1027 	int i;
1028 	for (i = 0; i < NUM_WT; i++) {
1029 		add_weight(refs[i], i);
1030 	}
1031 }
1032 
1033 int32_t
1034 get_weight(int32_t ref, int pass)
1035 {
1036 	weight_t	srch;
1037 	weight_t	*w;
1038 	int32_t		pri;
1039 
1040 	pri = resolve_pri(ref);
1041 	if (pri & COLLATE_SUBST_PRIORITY) {
1042 		return (pri);
1043 	}
1044 	if (pri <= 0) {
1045 		return (pri);
1046 	}
1047 	srch.pri = pri;
1048 	if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
1049 		INTERR;
1050 		return (-1);
1051 	}
1052 	return (w->opt);
1053 }
1054 
1055 void
1056 dump_collate(void)
1057 {
1058 	FILE			*f;
1059 	int			i, j, n;
1060 	size_t			sz;
1061 	int32_t			pri;
1062 	collelem_t		*ce;
1063 	collchar_t		*cc;
1064 	subst_t			*sb;
1065 	char			vers[COLLATE_STR_LEN];
1066 	collate_char_t		chars[UCHAR_MAX + 1];
1067 	collate_large_t		*large;
1068 	collate_subst_t		*subst[COLL_WEIGHTS_MAX];
1069 	collate_chain_t		*chain;
1070 
1071 	/*
1072 	 * We have to run throught a preliminary pass to identify all the
1073 	 * weights that we use for each sorting level.
1074 	 */
1075 	for (i = 0; i < NUM_WT; i++) {
1076 		add_weight(pri_ignore, i);
1077 	}
1078 	for (i = 0; i < NUM_WT; i++) {
1079 		for (sb = avl_first(&substs[i]); sb;
1080 		    sb = AVL_NEXT(&substs[i], sb)) {
1081 			for (j = 0; sb->ref[j]; j++) {
1082 				add_weight(sb->ref[j], i);
1083 			}
1084 		}
1085 	}
1086 	for (ce = avl_first(&elem_by_expand);
1087 	    ce != NULL;
1088 	    ce = AVL_NEXT(&elem_by_expand, ce)) {
1089 		add_weights(ce->ref);
1090 	}
1091 	for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1092 		add_weights(cc->ref);
1093 	}
1094 
1095 	/*
1096 	 * Now we walk the entire set of weights, removing the gaps
1097 	 * in the weights.  This gives us optimum usage.  The walk
1098 	 * occurs in priority.
1099 	 */
1100 	for (i = 0; i < NUM_WT; i++) {
1101 		weight_t *w;
1102 		for (w = avl_first(&weights[i]); w;
1103 		    w = AVL_NEXT(&weights[i], w)) {
1104 			w->opt = nweight[i];
1105 			nweight[i] += 1;
1106 		}
1107 	}
1108 
1109 	(void) memset(&chars, 0, sizeof (chars));
1110 	(void) memset(vers, 0, COLLATE_STR_LEN);
1111 	(void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1112 
1113 	/*
1114 	 * We need to make sure we arrange for the UNDEFINED field
1115 	 * to show up.  Also, set the total weight counts.
1116 	 */
1117 	for (i = 0; i < NUM_WT; i++) {
1118 		if (resolve_pri(pri_undefined[i]) == -1) {
1119 			set_pri(pri_undefined[i], -1, RESOLVED);
1120 			/* they collate at the end of everything else */
1121 			collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1122 		}
1123 		collinfo.pri_count[i] = nweight[i];
1124 	}
1125 
1126 	collinfo.pri_count[NUM_WT] = max_wide();
1127 	collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1128 	collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1129 
1130 	/*
1131 	 * Ordinary character priorities
1132 	 */
1133 	for (i = 0; i <= UCHAR_MAX; i++) {
1134 		if ((cc = get_collchar(i, 0)) != NULL) {
1135 			for (j = 0; j < NUM_WT; j++) {
1136 				chars[i].pri[j] = get_weight(cc->ref[j], j);
1137 			}
1138 		} else {
1139 			for (j = 0; j < NUM_WT; j++) {
1140 				chars[i].pri[j] =
1141 				    get_weight(pri_undefined[j], j);
1142 			}
1143 			/*
1144 			 * Per POSIX, for undefined characters, we
1145 			 * also have to add a last item, which is the
1146 			 * character code.
1147 			 */
1148 			chars[i].pri[NUM_WT] = i;
1149 		}
1150 	}
1151 
1152 	/*
1153 	 * Substitution tables
1154 	 */
1155 	for (i = 0; i < NUM_WT; i++) {
1156 		collate_subst_t *st = NULL;
1157 		n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
1158 		if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
1159 			errf(_("out of memory"));
1160 			return;
1161 		}
1162 		n = 0;
1163 		for (sb = avl_first(&substs[i]); sb;
1164 		    sb = AVL_NEXT(&substs[i], sb)) {
1165 			if ((st[n].key = resolve_pri(sb->key)) < 0) {
1166 				/* by definition these resolve! */
1167 				INTERR;
1168 			}
1169 			if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1170 				INTERR;
1171 			}
1172 			for (j = 0; sb->ref[j]; j++) {
1173 				st[n].pri[j] = get_weight(sb->ref[j], i);
1174 			}
1175 			n++;
1176 		}
1177 		if (n != collinfo.subst_count[i])
1178 			INTERR;
1179 		subst[i] = st;
1180 	}
1181 
1182 
1183 	/*
1184 	 * Chains, i.e. collating elements
1185 	 */
1186 	collinfo.chain_count = avl_numnodes(&elem_by_expand);
1187 	chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
1188 	if (chain == NULL) {
1189 		errf(_("out of memory"));
1190 		return;
1191 	}
1192 	for (n = 0, ce = avl_first(&elem_by_expand);
1193 	    ce != NULL;
1194 	    ce = AVL_NEXT(&elem_by_expand, ce), n++) {
1195 		(void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1196 		for (i = 0; i < NUM_WT; i++) {
1197 			chain[n].pri[i] = get_weight(ce->ref[i], i);
1198 		}
1199 	}
1200 	if (n != collinfo.chain_count)
1201 		INTERR;
1202 
1203 	/*
1204 	 * Large (> UCHAR_MAX) character priorities
1205 	 */
1206 	large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
1207 	if (large == NULL) {
1208 		errf(_("out of memory"));
1209 		return;
1210 	}
1211 
1212 	i = 0;
1213 	for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1214 		int	undef = 0;
1215 		/* we already gathered those */
1216 		if (cc->wc <= UCHAR_MAX)
1217 			continue;
1218 		for (j = 0; j < NUM_WT; j++) {
1219 			if ((pri = get_weight(cc->ref[j], j)) < 0) {
1220 				undef = 1;
1221 			}
1222 			if (undef && (pri >= 0)) {
1223 				/* if undefined, then all priorities are */
1224 				INTERR;
1225 			} else {
1226 				large[i].pri.pri[j] = pri;
1227 			}
1228 		}
1229 		if (!undef) {
1230 			large[i].val = cc->wc;
1231 			collinfo.large_count = i++;
1232 		}
1233 	}
1234 
1235 	if ((f = open_category()) == NULL) {
1236 		return;
1237 	}
1238 
1239 	/* Time to write the entire data set out */
1240 
1241 	if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1242 	    (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1243 	    (wr_category(&chars, sizeof (chars), f) < 0)) {
1244 		return;
1245 	}
1246 
1247 	for (i = 0; i < NUM_WT; i++) {
1248 		sz =  sizeof (collate_subst_t) * collinfo.subst_count[i];
1249 		if (wr_category(subst[i], sz, f) < 0) {
1250 			return;
1251 		}
1252 	}
1253 	sz = sizeof (collate_chain_t) * collinfo.chain_count;
1254 	if (wr_category(chain, sz, f) < 0) {
1255 		return;
1256 	}
1257 	sz = sizeof (collate_large_t) * collinfo.large_count;
1258 	if (wr_category(large, sz, f) < 0) {
1259 		return;
1260 	}
1261 
1262 	close_category(f);
1263 }
1264