/* Source: illumos-gate, usr/src/cmd/tr/cset.c (revision 84cf253f) */
1 /*
2  * Copyright (c) 2004 Tim J. Robbins.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * "Set of characters" ADT implemented as a splay tree of extents, with
28  * a lookup table cache to simplify looking up the first bunch of
29  * characters (which are presumably more common than others).
30  */
31 
32 #include <assert.h>
33 #include <stdbool.h>
34 #include <stdlib.h>
35 #include <wchar.h>
36 #include <wctype.h>
37 #include "cset.h"
38 
/*
 * Internal helpers that operate directly on the splay tree of extents.
 */
static struct csnode	*cset_delete(struct csnode *t, wchar_t ch);
static int		cset_rangecmp(struct csnode *t, wchar_t ch);
static struct csnode	*cset_splay(struct csnode *t, wchar_t ch);
42 
43 /*
44  * cset_alloc --
45  *	Allocate a set of characters.
46  */
47 struct cset *
cset_alloc(void)48 cset_alloc(void)
49 {
50 	struct cset *cs;
51 
52 	if ((cs = malloc(sizeof (*cs))) == NULL)
53 		return (NULL);
54 	cs->cs_root = NULL;
55 	cs->cs_classes = NULL;
56 	cs->cs_havecache = false;
57 	cs->cs_invert = false;
58 	return (cs);
59 }
60 
61 /*
62  * cset_add --
63  *	Add a character to the set.
64  */
65 bool
cset_add(struct cset * cs,wchar_t ch)66 cset_add(struct cset *cs, wchar_t ch)
67 {
68 	struct csnode *csn, *ncsn;
69 	wchar_t oval;
70 
71 	cs->cs_havecache = false;
72 
73 	/*
74 	 * Inserting into empty tree; new item becomes the root.
75 	 */
76 	if (cs->cs_root == NULL) {
77 		csn = malloc(sizeof (*cs->cs_root));
78 		if (csn == NULL)
79 			return (false);
80 		csn->csn_left = csn->csn_right = NULL;
81 		csn->csn_min = csn->csn_max = ch;
82 		cs->cs_root = csn;
83 		return (true);
84 	}
85 
86 	/*
87 	 * Splay to check whether the item already exists, and otherwise,
88 	 * where we should put it.
89 	 */
90 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
91 
92 	/*
93 	 * Avoid adding duplicate nodes.
94 	 */
95 	if (cset_rangecmp(csn, ch) == 0)
96 		return (true);
97 
98 	/*
99 	 * Allocate a new node and make it the new root.
100 	 */
101 	ncsn = malloc(sizeof (*ncsn));
102 	if (ncsn == NULL)
103 		return (false);
104 	ncsn->csn_min = ncsn->csn_max = ch;
105 	if (cset_rangecmp(csn, ch) < 0) {
106 		ncsn->csn_left = csn->csn_left;
107 		ncsn->csn_right = csn;
108 		csn->csn_left = NULL;
109 	} else {
110 		ncsn->csn_right = csn->csn_right;
111 		ncsn->csn_left = csn;
112 		csn->csn_right = NULL;
113 	}
114 	cs->cs_root = ncsn;
115 
116 	/*
117 	 * Coalesce with left and right neighbours if possible.
118 	 */
119 	if (ncsn->csn_left != NULL) {
120 		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
121 		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
122 			oval = ncsn->csn_left->csn_min;
123 			ncsn->csn_left = cset_delete(ncsn->csn_left,
124 			    ncsn->csn_left->csn_min);
125 			ncsn->csn_min = oval;
126 		}
127 	}
128 	if (ncsn->csn_right != NULL) {
129 		ncsn->csn_right = cset_splay(ncsn->csn_right,
130 		    ncsn->csn_max + 1);
131 		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
132 			oval = ncsn->csn_right->csn_max;
133 			ncsn->csn_right = cset_delete(ncsn->csn_right,
134 			    ncsn->csn_right->csn_min);
135 			ncsn->csn_max = oval;
136 		}
137 	}
138 
139 	return (true);
140 }
141 
142 /*
143  * cset_in_hard --
144  *	Determine whether a character is in the set without using
145  *	the cache.
146  */
147 bool
cset_in_hard(struct cset * cs,wchar_t ch)148 cset_in_hard(struct cset *cs, wchar_t ch)
149 {
150 	struct csclass *csc;
151 
152 	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
153 		if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
154 			return (cs->cs_invert ^ true);
155 	if (cs->cs_root != NULL) {
156 		cs->cs_root = cset_splay(cs->cs_root, ch);
157 		return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
158 	}
159 	return (cs->cs_invert ^ false);
160 }
161 
162 /*
163  * cset_cache --
164  *	Update the cache.
165  */
166 void
cset_cache(struct cset * cs)167 cset_cache(struct cset *cs)
168 {
169 	wchar_t i;
170 
171 	for (i = 0; i < CS_CACHE_SIZE; i++)
172 		cs->cs_cache[i] = cset_in_hard(cs, i);
173 
174 	cs->cs_havecache = true;
175 }
176 
177 /*
178  * cset_invert --
179  *	Invert the character set.
180  */
181 void
cset_invert(struct cset * cs)182 cset_invert(struct cset *cs)
183 {
184 
185 	cs->cs_invert ^= true;
186 	cs->cs_havecache = false;
187 }
188 
189 /*
190  * cset_addclass --
191  *	Add a wctype()-style character class to the set, optionally
192  *	inverting it.
193  */
194 bool
cset_addclass(struct cset * cs,wctype_t type,bool invert)195 cset_addclass(struct cset *cs, wctype_t type, bool invert)
196 {
197 	struct csclass *csc;
198 
199 	csc = malloc(sizeof (*csc));
200 	if (csc == NULL)
201 		return (false);
202 	csc->csc_type = type;
203 	csc->csc_invert = invert;
204 	csc->csc_next = cs->cs_classes;
205 	cs->cs_classes = csc;
206 	cs->cs_havecache = false;
207 	return (true);
208 }
209 
210 static int
cset_rangecmp(struct csnode * t,wchar_t ch)211 cset_rangecmp(struct csnode *t, wchar_t ch)
212 {
213 
214 	if (ch < t->csn_min)
215 		return (-1);
216 	if (ch > t->csn_max)
217 		return (1);
218 	return (0);
219 }
220 
221 static struct csnode *
cset_splay(struct csnode * t,wchar_t ch)222 cset_splay(struct csnode *t, wchar_t ch)
223 {
224 	struct csnode N, *l, *r, *y;
225 
226 	/*
227 	 * Based on public domain code from Sleator.
228 	 */
229 
230 	assert(t != NULL);
231 
232 	N.csn_left = N.csn_right = NULL;
233 	l = r = &N;
234 	for (;;) {
235 		if (cset_rangecmp(t, ch) < 0) {
236 			if (t->csn_left != NULL &&
237 			    cset_rangecmp(t->csn_left, ch) < 0) {
238 				y = t->csn_left;
239 				t->csn_left = y->csn_right;
240 				y->csn_right = t;
241 				t = y;
242 			}
243 			if (t->csn_left == NULL)
244 				break;
245 			r->csn_left = t;
246 			r = t;
247 			t = t->csn_left;
248 		} else if (cset_rangecmp(t, ch) > 0) {
249 			if (t->csn_right != NULL &&
250 			    cset_rangecmp(t->csn_right, ch) > 0) {
251 				y = t->csn_right;
252 				t->csn_right = y->csn_left;
253 				y->csn_left = t;
254 				t = y;
255 			}
256 			if (t->csn_right == NULL)
257 				break;
258 			l->csn_right = t;
259 			l = t;
260 			t = t->csn_right;
261 		} else
262 			break;
263 	}
264 	l->csn_right = t->csn_left;
265 	r->csn_left = t->csn_right;
266 	t->csn_left = N.csn_right;
267 	t->csn_right = N.csn_left;
268 	return (t);
269 }
270 
271 static struct csnode *
cset_delete(struct csnode * t,wchar_t ch)272 cset_delete(struct csnode *t, wchar_t ch)
273 {
274 	struct csnode *x;
275 
276 	assert(t != NULL);
277 	t = cset_splay(t, ch);
278 	assert(cset_rangecmp(t, ch) == 0);
279 	if (t->csn_left == NULL)
280 		x = t->csn_right;
281 	else {
282 		x = cset_splay(t->csn_left, ch);
283 		x->csn_right = t->csn_right;
284 	}
285 	free(t);
286 	return (x);
287 }
288