/*
 * Copyright (c) 2004 Tim J. Robbins.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * "Set of characters" ADT implemented as a splay tree of extents, with
 * a lookup table cache to speed up lookups of the first bunch of
 * characters (which are presumably more common than others).
 */
31163bd69bSGarrett D'Amore
32163bd69bSGarrett D'Amore #include <assert.h>
33163bd69bSGarrett D'Amore #include <stdbool.h>
34163bd69bSGarrett D'Amore #include <stdlib.h>
35163bd69bSGarrett D'Amore #include <wchar.h>
36163bd69bSGarrett D'Amore #include <wctype.h>
37163bd69bSGarrett D'Amore #include "cset.h"
38163bd69bSGarrett D'Amore
/* Internal splay-tree helpers; defined at the bottom of this file. */
static struct csnode *cset_delete(struct csnode *t, wchar_t ch);
static int cset_rangecmp(struct csnode *t, wchar_t ch);
static struct csnode *cset_splay(struct csnode *t, wchar_t ch);
42163bd69bSGarrett D'Amore
43163bd69bSGarrett D'Amore /*
44163bd69bSGarrett D'Amore * cset_alloc --
45163bd69bSGarrett D'Amore * Allocate a set of characters.
46163bd69bSGarrett D'Amore */
47163bd69bSGarrett D'Amore struct cset *
cset_alloc(void)48163bd69bSGarrett D'Amore cset_alloc(void)
49163bd69bSGarrett D'Amore {
50163bd69bSGarrett D'Amore struct cset *cs;
51163bd69bSGarrett D'Amore
52163bd69bSGarrett D'Amore if ((cs = malloc(sizeof (*cs))) == NULL)
53163bd69bSGarrett D'Amore return (NULL);
54163bd69bSGarrett D'Amore cs->cs_root = NULL;
55163bd69bSGarrett D'Amore cs->cs_classes = NULL;
56163bd69bSGarrett D'Amore cs->cs_havecache = false;
57163bd69bSGarrett D'Amore cs->cs_invert = false;
58163bd69bSGarrett D'Amore return (cs);
59163bd69bSGarrett D'Amore }
60163bd69bSGarrett D'Amore
61163bd69bSGarrett D'Amore /*
62163bd69bSGarrett D'Amore * cset_add --
63163bd69bSGarrett D'Amore * Add a character to the set.
64163bd69bSGarrett D'Amore */
65163bd69bSGarrett D'Amore bool
cset_add(struct cset * cs,wchar_t ch)66163bd69bSGarrett D'Amore cset_add(struct cset *cs, wchar_t ch)
67163bd69bSGarrett D'Amore {
68163bd69bSGarrett D'Amore struct csnode *csn, *ncsn;
69163bd69bSGarrett D'Amore wchar_t oval;
70163bd69bSGarrett D'Amore
71163bd69bSGarrett D'Amore cs->cs_havecache = false;
72163bd69bSGarrett D'Amore
73163bd69bSGarrett D'Amore /*
74163bd69bSGarrett D'Amore * Inserting into empty tree; new item becomes the root.
75163bd69bSGarrett D'Amore */
76163bd69bSGarrett D'Amore if (cs->cs_root == NULL) {
77163bd69bSGarrett D'Amore csn = malloc(sizeof (*cs->cs_root));
78163bd69bSGarrett D'Amore if (csn == NULL)
79163bd69bSGarrett D'Amore return (false);
80163bd69bSGarrett D'Amore csn->csn_left = csn->csn_right = NULL;
81163bd69bSGarrett D'Amore csn->csn_min = csn->csn_max = ch;
82163bd69bSGarrett D'Amore cs->cs_root = csn;
83163bd69bSGarrett D'Amore return (true);
84163bd69bSGarrett D'Amore }
85163bd69bSGarrett D'Amore
86163bd69bSGarrett D'Amore /*
87163bd69bSGarrett D'Amore * Splay to check whether the item already exists, and otherwise,
88163bd69bSGarrett D'Amore * where we should put it.
89163bd69bSGarrett D'Amore */
90163bd69bSGarrett D'Amore csn = cs->cs_root = cset_splay(cs->cs_root, ch);
91163bd69bSGarrett D'Amore
92163bd69bSGarrett D'Amore /*
93163bd69bSGarrett D'Amore * Avoid adding duplicate nodes.
94163bd69bSGarrett D'Amore */
95163bd69bSGarrett D'Amore if (cset_rangecmp(csn, ch) == 0)
96163bd69bSGarrett D'Amore return (true);
97163bd69bSGarrett D'Amore
98163bd69bSGarrett D'Amore /*
99163bd69bSGarrett D'Amore * Allocate a new node and make it the new root.
100163bd69bSGarrett D'Amore */
101163bd69bSGarrett D'Amore ncsn = malloc(sizeof (*ncsn));
102163bd69bSGarrett D'Amore if (ncsn == NULL)
103163bd69bSGarrett D'Amore return (false);
104163bd69bSGarrett D'Amore ncsn->csn_min = ncsn->csn_max = ch;
105163bd69bSGarrett D'Amore if (cset_rangecmp(csn, ch) < 0) {
106163bd69bSGarrett D'Amore ncsn->csn_left = csn->csn_left;
107163bd69bSGarrett D'Amore ncsn->csn_right = csn;
108163bd69bSGarrett D'Amore csn->csn_left = NULL;
109163bd69bSGarrett D'Amore } else {
110163bd69bSGarrett D'Amore ncsn->csn_right = csn->csn_right;
111163bd69bSGarrett D'Amore ncsn->csn_left = csn;
112163bd69bSGarrett D'Amore csn->csn_right = NULL;
113163bd69bSGarrett D'Amore }
114163bd69bSGarrett D'Amore cs->cs_root = ncsn;
115163bd69bSGarrett D'Amore
116163bd69bSGarrett D'Amore /*
117163bd69bSGarrett D'Amore * Coalesce with left and right neighbours if possible.
118163bd69bSGarrett D'Amore */
119163bd69bSGarrett D'Amore if (ncsn->csn_left != NULL) {
120163bd69bSGarrett D'Amore ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
121163bd69bSGarrett D'Amore if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
122163bd69bSGarrett D'Amore oval = ncsn->csn_left->csn_min;
123163bd69bSGarrett D'Amore ncsn->csn_left = cset_delete(ncsn->csn_left,
124163bd69bSGarrett D'Amore ncsn->csn_left->csn_min);
125163bd69bSGarrett D'Amore ncsn->csn_min = oval;
126163bd69bSGarrett D'Amore }
127163bd69bSGarrett D'Amore }
128163bd69bSGarrett D'Amore if (ncsn->csn_right != NULL) {
129163bd69bSGarrett D'Amore ncsn->csn_right = cset_splay(ncsn->csn_right,
130163bd69bSGarrett D'Amore ncsn->csn_max + 1);
131163bd69bSGarrett D'Amore if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
132163bd69bSGarrett D'Amore oval = ncsn->csn_right->csn_max;
133163bd69bSGarrett D'Amore ncsn->csn_right = cset_delete(ncsn->csn_right,
134163bd69bSGarrett D'Amore ncsn->csn_right->csn_min);
135163bd69bSGarrett D'Amore ncsn->csn_max = oval;
136163bd69bSGarrett D'Amore }
137163bd69bSGarrett D'Amore }
138163bd69bSGarrett D'Amore
139163bd69bSGarrett D'Amore return (true);
140163bd69bSGarrett D'Amore }
141163bd69bSGarrett D'Amore
142163bd69bSGarrett D'Amore /*
143163bd69bSGarrett D'Amore * cset_in_hard --
144163bd69bSGarrett D'Amore * Determine whether a character is in the set without using
145163bd69bSGarrett D'Amore * the cache.
146163bd69bSGarrett D'Amore */
147163bd69bSGarrett D'Amore bool
cset_in_hard(struct cset * cs,wchar_t ch)148163bd69bSGarrett D'Amore cset_in_hard(struct cset *cs, wchar_t ch)
149163bd69bSGarrett D'Amore {
150163bd69bSGarrett D'Amore struct csclass *csc;
151163bd69bSGarrett D'Amore
152163bd69bSGarrett D'Amore for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
153*84cf253fSRichard Lowe if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
154163bd69bSGarrett D'Amore return (cs->cs_invert ^ true);
155163bd69bSGarrett D'Amore if (cs->cs_root != NULL) {
156163bd69bSGarrett D'Amore cs->cs_root = cset_splay(cs->cs_root, ch);
157*84cf253fSRichard Lowe return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
158163bd69bSGarrett D'Amore }
159163bd69bSGarrett D'Amore return (cs->cs_invert ^ false);
160163bd69bSGarrett D'Amore }
161163bd69bSGarrett D'Amore
162163bd69bSGarrett D'Amore /*
163163bd69bSGarrett D'Amore * cset_cache --
164163bd69bSGarrett D'Amore * Update the cache.
165163bd69bSGarrett D'Amore */
166163bd69bSGarrett D'Amore void
cset_cache(struct cset * cs)167163bd69bSGarrett D'Amore cset_cache(struct cset *cs)
168163bd69bSGarrett D'Amore {
169163bd69bSGarrett D'Amore wchar_t i;
170163bd69bSGarrett D'Amore
171163bd69bSGarrett D'Amore for (i = 0; i < CS_CACHE_SIZE; i++)
172163bd69bSGarrett D'Amore cs->cs_cache[i] = cset_in_hard(cs, i);
173163bd69bSGarrett D'Amore
174163bd69bSGarrett D'Amore cs->cs_havecache = true;
175163bd69bSGarrett D'Amore }
176163bd69bSGarrett D'Amore
177163bd69bSGarrett D'Amore /*
178163bd69bSGarrett D'Amore * cset_invert --
179163bd69bSGarrett D'Amore * Invert the character set.
180163bd69bSGarrett D'Amore */
181163bd69bSGarrett D'Amore void
cset_invert(struct cset * cs)182163bd69bSGarrett D'Amore cset_invert(struct cset *cs)
183163bd69bSGarrett D'Amore {
184163bd69bSGarrett D'Amore
185163bd69bSGarrett D'Amore cs->cs_invert ^= true;
186163bd69bSGarrett D'Amore cs->cs_havecache = false;
187163bd69bSGarrett D'Amore }
188163bd69bSGarrett D'Amore
189163bd69bSGarrett D'Amore /*
190163bd69bSGarrett D'Amore * cset_addclass --
191163bd69bSGarrett D'Amore * Add a wctype()-style character class to the set, optionally
192163bd69bSGarrett D'Amore * inverting it.
193163bd69bSGarrett D'Amore */
194163bd69bSGarrett D'Amore bool
cset_addclass(struct cset * cs,wctype_t type,bool invert)195163bd69bSGarrett D'Amore cset_addclass(struct cset *cs, wctype_t type, bool invert)
196163bd69bSGarrett D'Amore {
197163bd69bSGarrett D'Amore struct csclass *csc;
198163bd69bSGarrett D'Amore
199163bd69bSGarrett D'Amore csc = malloc(sizeof (*csc));
200163bd69bSGarrett D'Amore if (csc == NULL)
201163bd69bSGarrett D'Amore return (false);
202163bd69bSGarrett D'Amore csc->csc_type = type;
203163bd69bSGarrett D'Amore csc->csc_invert = invert;
204163bd69bSGarrett D'Amore csc->csc_next = cs->cs_classes;
205163bd69bSGarrett D'Amore cs->cs_classes = csc;
206163bd69bSGarrett D'Amore cs->cs_havecache = false;
207163bd69bSGarrett D'Amore return (true);
208163bd69bSGarrett D'Amore }
209163bd69bSGarrett D'Amore
210163bd69bSGarrett D'Amore static int
cset_rangecmp(struct csnode * t,wchar_t ch)211163bd69bSGarrett D'Amore cset_rangecmp(struct csnode *t, wchar_t ch)
212163bd69bSGarrett D'Amore {
213163bd69bSGarrett D'Amore
214163bd69bSGarrett D'Amore if (ch < t->csn_min)
215163bd69bSGarrett D'Amore return (-1);
216163bd69bSGarrett D'Amore if (ch > t->csn_max)
217163bd69bSGarrett D'Amore return (1);
218163bd69bSGarrett D'Amore return (0);
219163bd69bSGarrett D'Amore }
220163bd69bSGarrett D'Amore
221163bd69bSGarrett D'Amore static struct csnode *
cset_splay(struct csnode * t,wchar_t ch)222163bd69bSGarrett D'Amore cset_splay(struct csnode *t, wchar_t ch)
223163bd69bSGarrett D'Amore {
224163bd69bSGarrett D'Amore struct csnode N, *l, *r, *y;
225163bd69bSGarrett D'Amore
226163bd69bSGarrett D'Amore /*
227163bd69bSGarrett D'Amore * Based on public domain code from Sleator.
228163bd69bSGarrett D'Amore */
229163bd69bSGarrett D'Amore
230163bd69bSGarrett D'Amore assert(t != NULL);
231163bd69bSGarrett D'Amore
232163bd69bSGarrett D'Amore N.csn_left = N.csn_right = NULL;
233163bd69bSGarrett D'Amore l = r = &N;
234163bd69bSGarrett D'Amore for (;;) {
235163bd69bSGarrett D'Amore if (cset_rangecmp(t, ch) < 0) {
236163bd69bSGarrett D'Amore if (t->csn_left != NULL &&
237163bd69bSGarrett D'Amore cset_rangecmp(t->csn_left, ch) < 0) {
238163bd69bSGarrett D'Amore y = t->csn_left;
239163bd69bSGarrett D'Amore t->csn_left = y->csn_right;
240163bd69bSGarrett D'Amore y->csn_right = t;
241163bd69bSGarrett D'Amore t = y;
242163bd69bSGarrett D'Amore }
243163bd69bSGarrett D'Amore if (t->csn_left == NULL)
244163bd69bSGarrett D'Amore break;
245163bd69bSGarrett D'Amore r->csn_left = t;
246163bd69bSGarrett D'Amore r = t;
247163bd69bSGarrett D'Amore t = t->csn_left;
248163bd69bSGarrett D'Amore } else if (cset_rangecmp(t, ch) > 0) {
249163bd69bSGarrett D'Amore if (t->csn_right != NULL &&
250163bd69bSGarrett D'Amore cset_rangecmp(t->csn_right, ch) > 0) {
251163bd69bSGarrett D'Amore y = t->csn_right;
252163bd69bSGarrett D'Amore t->csn_right = y->csn_left;
253163bd69bSGarrett D'Amore y->csn_left = t;
254163bd69bSGarrett D'Amore t = y;
255163bd69bSGarrett D'Amore }
256163bd69bSGarrett D'Amore if (t->csn_right == NULL)
257163bd69bSGarrett D'Amore break;
258163bd69bSGarrett D'Amore l->csn_right = t;
259163bd69bSGarrett D'Amore l = t;
260163bd69bSGarrett D'Amore t = t->csn_right;
261163bd69bSGarrett D'Amore } else
262163bd69bSGarrett D'Amore break;
263163bd69bSGarrett D'Amore }
264163bd69bSGarrett D'Amore l->csn_right = t->csn_left;
265163bd69bSGarrett D'Amore r->csn_left = t->csn_right;
266163bd69bSGarrett D'Amore t->csn_left = N.csn_right;
267163bd69bSGarrett D'Amore t->csn_right = N.csn_left;
268163bd69bSGarrett D'Amore return (t);
269163bd69bSGarrett D'Amore }
270163bd69bSGarrett D'Amore
271163bd69bSGarrett D'Amore static struct csnode *
cset_delete(struct csnode * t,wchar_t ch)272163bd69bSGarrett D'Amore cset_delete(struct csnode *t, wchar_t ch)
273163bd69bSGarrett D'Amore {
274163bd69bSGarrett D'Amore struct csnode *x;
275163bd69bSGarrett D'Amore
276163bd69bSGarrett D'Amore assert(t != NULL);
277163bd69bSGarrett D'Amore t = cset_splay(t, ch);
278163bd69bSGarrett D'Amore assert(cset_rangecmp(t, ch) == 0);
279163bd69bSGarrett D'Amore if (t->csn_left == NULL)
280163bd69bSGarrett D'Amore x = t->csn_right;
281163bd69bSGarrett D'Amore else {
282163bd69bSGarrett D'Amore x = cset_splay(t->csn_left, ch);
283163bd69bSGarrett D'Amore x->csn_right = t->csn_right;
284163bd69bSGarrett D'Amore }
285163bd69bSGarrett D'Amore free(t);
286163bd69bSGarrett D'Amore return (x);
287163bd69bSGarrett D'Amore }
288