/* xref: /illumos-gate/usr/src/cmd/tr/cset.c (revision 84cf253f) */
/*
 * Copyright (c) 2004 Tim J. Robbins.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * "Set of characters" ADT implemented as a splay tree of extents, with
 * a lookup table cache to simplify looking up the first bunch of
 * characters (which are presumably more common than others).
 */

#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>
#include "cset.h"

/*
 * Forward declarations for the file-local tree helpers defined at the
 * bottom of this file.
 */
static struct csnode	*cset_delete(struct csnode *t, wchar_t ch);
static int		cset_rangecmp(struct csnode *t, wchar_t ch);
static struct csnode	*cset_splay(struct csnode *t, wchar_t ch);

43163bd69bSGarrett D'Amore /*
44163bd69bSGarrett D'Amore  * cset_alloc --
45163bd69bSGarrett D'Amore  *	Allocate a set of characters.
46163bd69bSGarrett D'Amore  */
47163bd69bSGarrett D'Amore struct cset *
cset_alloc(void)48163bd69bSGarrett D'Amore cset_alloc(void)
49163bd69bSGarrett D'Amore {
50163bd69bSGarrett D'Amore 	struct cset *cs;
51163bd69bSGarrett D'Amore 
52163bd69bSGarrett D'Amore 	if ((cs = malloc(sizeof (*cs))) == NULL)
53163bd69bSGarrett D'Amore 		return (NULL);
54163bd69bSGarrett D'Amore 	cs->cs_root = NULL;
55163bd69bSGarrett D'Amore 	cs->cs_classes = NULL;
56163bd69bSGarrett D'Amore 	cs->cs_havecache = false;
57163bd69bSGarrett D'Amore 	cs->cs_invert = false;
58163bd69bSGarrett D'Amore 	return (cs);
59163bd69bSGarrett D'Amore }
60163bd69bSGarrett D'Amore 
61163bd69bSGarrett D'Amore /*
62163bd69bSGarrett D'Amore  * cset_add --
63163bd69bSGarrett D'Amore  *	Add a character to the set.
64163bd69bSGarrett D'Amore  */
65163bd69bSGarrett D'Amore bool
cset_add(struct cset * cs,wchar_t ch)66163bd69bSGarrett D'Amore cset_add(struct cset *cs, wchar_t ch)
67163bd69bSGarrett D'Amore {
68163bd69bSGarrett D'Amore 	struct csnode *csn, *ncsn;
69163bd69bSGarrett D'Amore 	wchar_t oval;
70163bd69bSGarrett D'Amore 
71163bd69bSGarrett D'Amore 	cs->cs_havecache = false;
72163bd69bSGarrett D'Amore 
73163bd69bSGarrett D'Amore 	/*
74163bd69bSGarrett D'Amore 	 * Inserting into empty tree; new item becomes the root.
75163bd69bSGarrett D'Amore 	 */
76163bd69bSGarrett D'Amore 	if (cs->cs_root == NULL) {
77163bd69bSGarrett D'Amore 		csn = malloc(sizeof (*cs->cs_root));
78163bd69bSGarrett D'Amore 		if (csn == NULL)
79163bd69bSGarrett D'Amore 			return (false);
80163bd69bSGarrett D'Amore 		csn->csn_left = csn->csn_right = NULL;
81163bd69bSGarrett D'Amore 		csn->csn_min = csn->csn_max = ch;
82163bd69bSGarrett D'Amore 		cs->cs_root = csn;
83163bd69bSGarrett D'Amore 		return (true);
84163bd69bSGarrett D'Amore 	}
85163bd69bSGarrett D'Amore 
86163bd69bSGarrett D'Amore 	/*
87163bd69bSGarrett D'Amore 	 * Splay to check whether the item already exists, and otherwise,
88163bd69bSGarrett D'Amore 	 * where we should put it.
89163bd69bSGarrett D'Amore 	 */
90163bd69bSGarrett D'Amore 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
91163bd69bSGarrett D'Amore 
92163bd69bSGarrett D'Amore 	/*
93163bd69bSGarrett D'Amore 	 * Avoid adding duplicate nodes.
94163bd69bSGarrett D'Amore 	 */
95163bd69bSGarrett D'Amore 	if (cset_rangecmp(csn, ch) == 0)
96163bd69bSGarrett D'Amore 		return (true);
97163bd69bSGarrett D'Amore 
98163bd69bSGarrett D'Amore 	/*
99163bd69bSGarrett D'Amore 	 * Allocate a new node and make it the new root.
100163bd69bSGarrett D'Amore 	 */
101163bd69bSGarrett D'Amore 	ncsn = malloc(sizeof (*ncsn));
102163bd69bSGarrett D'Amore 	if (ncsn == NULL)
103163bd69bSGarrett D'Amore 		return (false);
104163bd69bSGarrett D'Amore 	ncsn->csn_min = ncsn->csn_max = ch;
105163bd69bSGarrett D'Amore 	if (cset_rangecmp(csn, ch) < 0) {
106163bd69bSGarrett D'Amore 		ncsn->csn_left = csn->csn_left;
107163bd69bSGarrett D'Amore 		ncsn->csn_right = csn;
108163bd69bSGarrett D'Amore 		csn->csn_left = NULL;
109163bd69bSGarrett D'Amore 	} else {
110163bd69bSGarrett D'Amore 		ncsn->csn_right = csn->csn_right;
111163bd69bSGarrett D'Amore 		ncsn->csn_left = csn;
112163bd69bSGarrett D'Amore 		csn->csn_right = NULL;
113163bd69bSGarrett D'Amore 	}
114163bd69bSGarrett D'Amore 	cs->cs_root = ncsn;
115163bd69bSGarrett D'Amore 
116163bd69bSGarrett D'Amore 	/*
117163bd69bSGarrett D'Amore 	 * Coalesce with left and right neighbours if possible.
118163bd69bSGarrett D'Amore 	 */
119163bd69bSGarrett D'Amore 	if (ncsn->csn_left != NULL) {
120163bd69bSGarrett D'Amore 		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
121163bd69bSGarrett D'Amore 		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
122163bd69bSGarrett D'Amore 			oval = ncsn->csn_left->csn_min;
123163bd69bSGarrett D'Amore 			ncsn->csn_left = cset_delete(ncsn->csn_left,
124163bd69bSGarrett D'Amore 			    ncsn->csn_left->csn_min);
125163bd69bSGarrett D'Amore 			ncsn->csn_min = oval;
126163bd69bSGarrett D'Amore 		}
127163bd69bSGarrett D'Amore 	}
128163bd69bSGarrett D'Amore 	if (ncsn->csn_right != NULL) {
129163bd69bSGarrett D'Amore 		ncsn->csn_right = cset_splay(ncsn->csn_right,
130163bd69bSGarrett D'Amore 		    ncsn->csn_max + 1);
131163bd69bSGarrett D'Amore 		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
132163bd69bSGarrett D'Amore 			oval = ncsn->csn_right->csn_max;
133163bd69bSGarrett D'Amore 			ncsn->csn_right = cset_delete(ncsn->csn_right,
134163bd69bSGarrett D'Amore 			    ncsn->csn_right->csn_min);
135163bd69bSGarrett D'Amore 			ncsn->csn_max = oval;
136163bd69bSGarrett D'Amore 		}
137163bd69bSGarrett D'Amore 	}
138163bd69bSGarrett D'Amore 
139163bd69bSGarrett D'Amore 	return (true);
140163bd69bSGarrett D'Amore }
141163bd69bSGarrett D'Amore 
142163bd69bSGarrett D'Amore /*
143163bd69bSGarrett D'Amore  * cset_in_hard --
144163bd69bSGarrett D'Amore  *	Determine whether a character is in the set without using
145163bd69bSGarrett D'Amore  *	the cache.
146163bd69bSGarrett D'Amore  */
147163bd69bSGarrett D'Amore bool
cset_in_hard(struct cset * cs,wchar_t ch)148163bd69bSGarrett D'Amore cset_in_hard(struct cset *cs, wchar_t ch)
149163bd69bSGarrett D'Amore {
150163bd69bSGarrett D'Amore 	struct csclass *csc;
151163bd69bSGarrett D'Amore 
152163bd69bSGarrett D'Amore 	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
153*84cf253fSRichard Lowe 		if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
154163bd69bSGarrett D'Amore 			return (cs->cs_invert ^ true);
155163bd69bSGarrett D'Amore 	if (cs->cs_root != NULL) {
156163bd69bSGarrett D'Amore 		cs->cs_root = cset_splay(cs->cs_root, ch);
157*84cf253fSRichard Lowe 		return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
158163bd69bSGarrett D'Amore 	}
159163bd69bSGarrett D'Amore 	return (cs->cs_invert ^ false);
160163bd69bSGarrett D'Amore }
161163bd69bSGarrett D'Amore 
162163bd69bSGarrett D'Amore /*
163163bd69bSGarrett D'Amore  * cset_cache --
164163bd69bSGarrett D'Amore  *	Update the cache.
165163bd69bSGarrett D'Amore  */
166163bd69bSGarrett D'Amore void
cset_cache(struct cset * cs)167163bd69bSGarrett D'Amore cset_cache(struct cset *cs)
168163bd69bSGarrett D'Amore {
169163bd69bSGarrett D'Amore 	wchar_t i;
170163bd69bSGarrett D'Amore 
171163bd69bSGarrett D'Amore 	for (i = 0; i < CS_CACHE_SIZE; i++)
172163bd69bSGarrett D'Amore 		cs->cs_cache[i] = cset_in_hard(cs, i);
173163bd69bSGarrett D'Amore 
174163bd69bSGarrett D'Amore 	cs->cs_havecache = true;
175163bd69bSGarrett D'Amore }
176163bd69bSGarrett D'Amore 
177163bd69bSGarrett D'Amore /*
178163bd69bSGarrett D'Amore  * cset_invert --
179163bd69bSGarrett D'Amore  *	Invert the character set.
180163bd69bSGarrett D'Amore  */
181163bd69bSGarrett D'Amore void
cset_invert(struct cset * cs)182163bd69bSGarrett D'Amore cset_invert(struct cset *cs)
183163bd69bSGarrett D'Amore {
184163bd69bSGarrett D'Amore 
185163bd69bSGarrett D'Amore 	cs->cs_invert ^= true;
186163bd69bSGarrett D'Amore 	cs->cs_havecache = false;
187163bd69bSGarrett D'Amore }
188163bd69bSGarrett D'Amore 
189163bd69bSGarrett D'Amore /*
190163bd69bSGarrett D'Amore  * cset_addclass --
191163bd69bSGarrett D'Amore  *	Add a wctype()-style character class to the set, optionally
192163bd69bSGarrett D'Amore  *	inverting it.
193163bd69bSGarrett D'Amore  */
194163bd69bSGarrett D'Amore bool
cset_addclass(struct cset * cs,wctype_t type,bool invert)195163bd69bSGarrett D'Amore cset_addclass(struct cset *cs, wctype_t type, bool invert)
196163bd69bSGarrett D'Amore {
197163bd69bSGarrett D'Amore 	struct csclass *csc;
198163bd69bSGarrett D'Amore 
199163bd69bSGarrett D'Amore 	csc = malloc(sizeof (*csc));
200163bd69bSGarrett D'Amore 	if (csc == NULL)
201163bd69bSGarrett D'Amore 		return (false);
202163bd69bSGarrett D'Amore 	csc->csc_type = type;
203163bd69bSGarrett D'Amore 	csc->csc_invert = invert;
204163bd69bSGarrett D'Amore 	csc->csc_next = cs->cs_classes;
205163bd69bSGarrett D'Amore 	cs->cs_classes = csc;
206163bd69bSGarrett D'Amore 	cs->cs_havecache = false;
207163bd69bSGarrett D'Amore 	return (true);
208163bd69bSGarrett D'Amore }
209163bd69bSGarrett D'Amore 
210163bd69bSGarrett D'Amore static int
cset_rangecmp(struct csnode * t,wchar_t ch)211163bd69bSGarrett D'Amore cset_rangecmp(struct csnode *t, wchar_t ch)
212163bd69bSGarrett D'Amore {
213163bd69bSGarrett D'Amore 
214163bd69bSGarrett D'Amore 	if (ch < t->csn_min)
215163bd69bSGarrett D'Amore 		return (-1);
216163bd69bSGarrett D'Amore 	if (ch > t->csn_max)
217163bd69bSGarrett D'Amore 		return (1);
218163bd69bSGarrett D'Amore 	return (0);
219163bd69bSGarrett D'Amore }
220163bd69bSGarrett D'Amore 
221163bd69bSGarrett D'Amore static struct csnode *
cset_splay(struct csnode * t,wchar_t ch)222163bd69bSGarrett D'Amore cset_splay(struct csnode *t, wchar_t ch)
223163bd69bSGarrett D'Amore {
224163bd69bSGarrett D'Amore 	struct csnode N, *l, *r, *y;
225163bd69bSGarrett D'Amore 
226163bd69bSGarrett D'Amore 	/*
227163bd69bSGarrett D'Amore 	 * Based on public domain code from Sleator.
228163bd69bSGarrett D'Amore 	 */
229163bd69bSGarrett D'Amore 
230163bd69bSGarrett D'Amore 	assert(t != NULL);
231163bd69bSGarrett D'Amore 
232163bd69bSGarrett D'Amore 	N.csn_left = N.csn_right = NULL;
233163bd69bSGarrett D'Amore 	l = r = &N;
234163bd69bSGarrett D'Amore 	for (;;) {
235163bd69bSGarrett D'Amore 		if (cset_rangecmp(t, ch) < 0) {
236163bd69bSGarrett D'Amore 			if (t->csn_left != NULL &&
237163bd69bSGarrett D'Amore 			    cset_rangecmp(t->csn_left, ch) < 0) {
238163bd69bSGarrett D'Amore 				y = t->csn_left;
239163bd69bSGarrett D'Amore 				t->csn_left = y->csn_right;
240163bd69bSGarrett D'Amore 				y->csn_right = t;
241163bd69bSGarrett D'Amore 				t = y;
242163bd69bSGarrett D'Amore 			}
243163bd69bSGarrett D'Amore 			if (t->csn_left == NULL)
244163bd69bSGarrett D'Amore 				break;
245163bd69bSGarrett D'Amore 			r->csn_left = t;
246163bd69bSGarrett D'Amore 			r = t;
247163bd69bSGarrett D'Amore 			t = t->csn_left;
248163bd69bSGarrett D'Amore 		} else if (cset_rangecmp(t, ch) > 0) {
249163bd69bSGarrett D'Amore 			if (t->csn_right != NULL &&
250163bd69bSGarrett D'Amore 			    cset_rangecmp(t->csn_right, ch) > 0) {
251163bd69bSGarrett D'Amore 				y = t->csn_right;
252163bd69bSGarrett D'Amore 				t->csn_right = y->csn_left;
253163bd69bSGarrett D'Amore 				y->csn_left = t;
254163bd69bSGarrett D'Amore 				t = y;
255163bd69bSGarrett D'Amore 			}
256163bd69bSGarrett D'Amore 			if (t->csn_right == NULL)
257163bd69bSGarrett D'Amore 				break;
258163bd69bSGarrett D'Amore 			l->csn_right = t;
259163bd69bSGarrett D'Amore 			l = t;
260163bd69bSGarrett D'Amore 			t = t->csn_right;
261163bd69bSGarrett D'Amore 		} else
262163bd69bSGarrett D'Amore 			break;
263163bd69bSGarrett D'Amore 	}
264163bd69bSGarrett D'Amore 	l->csn_right = t->csn_left;
265163bd69bSGarrett D'Amore 	r->csn_left = t->csn_right;
266163bd69bSGarrett D'Amore 	t->csn_left = N.csn_right;
267163bd69bSGarrett D'Amore 	t->csn_right = N.csn_left;
268163bd69bSGarrett D'Amore 	return (t);
269163bd69bSGarrett D'Amore }
270163bd69bSGarrett D'Amore 
271163bd69bSGarrett D'Amore static struct csnode *
cset_delete(struct csnode * t,wchar_t ch)272163bd69bSGarrett D'Amore cset_delete(struct csnode *t, wchar_t ch)
273163bd69bSGarrett D'Amore {
274163bd69bSGarrett D'Amore 	struct csnode *x;
275163bd69bSGarrett D'Amore 
276163bd69bSGarrett D'Amore 	assert(t != NULL);
277163bd69bSGarrett D'Amore 	t = cset_splay(t, ch);
278163bd69bSGarrett D'Amore 	assert(cset_rangecmp(t, ch) == 0);
279163bd69bSGarrett D'Amore 	if (t->csn_left == NULL)
280163bd69bSGarrett D'Amore 		x = t->csn_right;
281163bd69bSGarrett D'Amore 	else {
282163bd69bSGarrett D'Amore 		x = cset_splay(t->csn_left, ch);
283163bd69bSGarrett D'Amore 		x->csn_right = t->csn_right;
284163bd69bSGarrett D'Amore 	}
285163bd69bSGarrett D'Amore 	free(t);
286163bd69bSGarrett D'Amore 	return (x);
287163bd69bSGarrett D'Amore }
288