1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1992-2011 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                 Eclipse Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *          http://www.eclipse.org/org/documents/epl-v10.html           *
11 *         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                                                                      *
20 ***********************************************************************/
21 /*
22  * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
23  */
24 #pragma prototyped
25 /*
26  * David Korn
27  * AT&T Bell Laboratories
28  *
29  * library interface for word count
30  */
31 
32 #include <cmd.h>
33 #include <wc.h>
34 #include <ctype.h>
35 
36 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
37 
38 #include <wchar.h>
39 #include <wctype.h>
40 #include <lc.h>
41 
42 #else
43 
44 #ifndef iswspace
45 #define iswspace(x)	isspace(x)
46 #endif
47 
48 #endif
49 
/*
 * flag bits kept in the per-byte classification table Wc_t.type[]
 * as filled in by wc_init(); for a UTF-8 lead byte the low three
 * bits of the entry hold the number of continuation bytes that
 * follow, and they are 0 for a continuation byte
 */
#define	WC_SP		0x08	/* white space byte */
#define	WC_NL		0x10	/* new-line byte */
#define	WC_MB		0x20	/* byte belongs to a multibyte sequence */
#define	WC_ERR		0x40	/* byte wc_init() marks as never valid */

/* tests on a type[] entry; each yields the flag bit or 0 */
#define eol(c)		((c)&WC_NL)
#define mbc(c)		((c)&WC_MB)
#define spc(c)		((c)&WC_SP)

/*
 * call the ast multibyte to wide character hook on the <n> bytes
 * at <p>, storing into <w>; the arguments are parenthesized so that
 * expressions can be passed without changing how they bind
 */
#define mb2wc(w,p,n)	((*ast.mb_towc)(&(w),(char*)(p),(n)))
59 
wc_init(int mode)60 Wc_t* wc_init(int mode)
61 {
62 	register int	n;
63 	register int	w;
64 	Wc_t*		wp;
65 
66 	if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
67 		return 0;
68 	if (!mbwide())
69 		wp->mb = 0;
70 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
71 	else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8))
72 		wp->mb = 1;
73 #endif
74 	else
75 		wp->mb = -1;
76 	w = mode & WC_WORDS;
77 	for (n = (1<<CHAR_BIT); --n >= 0;)
78 		wp->type[n] = (w && isspace(n)) ? WC_SP : 0;
79 	wp->type['\n'] = WC_SP|WC_NL;
80 	if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0)
81 	{
82 		for (n = 0; n < 64; n++)
83 		{
84 			wp->type[0x80+n] |= WC_MB;
85 			if (n<32)
86 				wp->type[0xc0+n] |= WC_MB+1;
87 			else if (n<48)
88 				wp->type[0xc0+n] |= WC_MB+2;
89 			else if (n<56)
90 				wp->type[0xc0+n] |= WC_MB+3;
91 			else if (n<60)
92 				wp->type[0xc0+n] |= WC_MB+4;
93 			else if (n<62)
94 				wp->type[0xc0+n] |= WC_MB+5;
95 		}
96 		wp->type[0xc0] = WC_MB|WC_ERR;
97 		wp->type[0xc1] = WC_MB|WC_ERR;
98 		wp->type[0xfe] = WC_MB|WC_ERR;
99 		wp->type[0xff] = WC_MB|WC_ERR;
100 	}
101 	wp->mode = mode;
102 	return wp;
103 }
104 
invalid(const char * file,int nlines)105 static int invalid(const char *file, int nlines)
106 {
107 	error_info.file = (char*)file;
108 	error_info.line = nlines;
109 	error(ERROR_SYSTEM|1, "invalid multibyte character");
110 	error_info.file = 0;
111 	error_info.line = 0;
112 	return nlines;
113 }
114 
/*
 * handle utf space characters
 *
 * advance the small state machine that wc_count() uses to recognize
 * multibyte UTF-8 white space one byte at a time; <state> is the
 * current state and <c> the next byte of the sequence
 *
 *	1	lead byte 0xE1 seen
 *	2	lead byte 0xE2 seen
 *	3	lead byte 0xE3 seen
 *	4	E1 9A seen	(E1 9A 80 is U+1680)
 *	5	E3 80 seen	(E3 80 80 is U+3000)
 *	6	E2 80 seen	(U+2000 block)
 *	7	E2 81 seen	(E2 81 9F is U+205F)
 *	8	lead byte 0xC2 seen, <c> is the code point itself
 *
 * returns the next state: 10 when the sequence is white space, 0 when
 * it cannot be; any other <state> is returned unchanged
 */

static int chkstate(int state, register unsigned int c)
{
	switch(state)
	{
	case 1:
		state = (c==0x9a?4:0);
		break;
	case 2:
		/* 0x80 -> state 6, 0x81 -> state 7 */
		state = ((c==0x80||c==0x81)?6+(c&1):0);
		break;
	case 3:
		state = (c==0x80?5:0);
		break;
	case 4:
		state = (c==0x80?10:0);
		break;
	case 5:
		state = (c==0x80?10:0);
		break;
	case 6:
		state = 0;
		/*
		 * E2 80 A8 and E2 80 A9 are U+2028 LINE SEPARATOR and
		 * U+2029 PARAGRAPH SEPARATOR; the bytes 0xA0 and 0xA1
		 * would be U+2020 and U+2021 (dagger, double dagger),
		 * which are not white space
		 */
		if(c==0xa8 || c==0xa9)
			return(10);
		else if((c&0xf0)== 0x80)
		{
			/* U+2007 FIGURE SPACE counts only if the locale says so */
			if((c&=0xf)==7)
				return(iswspace(0x2007)?10:0);
			/* NOTE(review): 0xb admits U+200B ZERO WIDTH SPACE; confirm this is wanted */
			if(c<=0xb)
				return(10);
		}
		/* U+202F NARROW NO-BREAK SPACE, again subject to the locale */
		else if(c==0xaf && iswspace(0x202f))
			return(10);
		break;
	case 7:
		state = (c==0x9f?10:0);
		break;
	case 8:
		return (iswspace(c)?10:0);
	}
	return state;
}
160 
161 /*
162  * compute the line, word, and character count for file <fd>
163  */
164 
wc_count(Wc_t * wp,Sfio_t * fd,const char * file)165 int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
166 {
167 	register char*		type = wp->type;
168 	register unsigned char*	cp;
169 	register Sfoff_t	nbytes;
170 	register Sfoff_t	nchars;
171 	register Sfoff_t	nwords;
172 	register Sfoff_t	nlines;
173 	register Sfoff_t	eline = -1;
174 	register Sfoff_t	longest = 0;
175 	register ssize_t	c;
176 	register unsigned char*	endbuff;
177 	register int		lasttype = WC_SP;
178 	unsigned int		lastchar;
179 	ssize_t			n;
180 	ssize_t			o;
181 	unsigned char*		buff;
182 	wchar_t			x;
183 	unsigned char		side[32];
184 
185 	sfset(fd,SF_WRITE,1);
186 	nlines = nwords = nchars = nbytes = 0;
187 	wp->longest = 0;
188 	if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
189 	{
190 		cp = buff = endbuff = 0;
191 		for (;;)
192 		{
193 			if (cp >= endbuff || (n = mb2wc(x, cp, endbuff-cp)) < 0)
194 			{
195 				if ((o = endbuff-cp) < sizeof(side))
196 				{
197 					if (buff)
198 					{
199 						if (o)
200 							memcpy(side, cp, o);
201 						mbinit();
202 					}
203 					else
204 						o = 0;
205 					cp = side + o;
206 					if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0)
207 					{
208 						if ((nchars - longest) > wp->longest)
209 							wp->longest = nchars - longest;
210 						break;
211 					}
212 					nbytes += n;
213 					if ((c = sizeof(side) - o) > n)
214 						c = n;
215 					if (c)
216 						memcpy(cp, buff, c);
217 					endbuff = buff + n;
218 					cp = side;
219 					x = mbchar(cp);
220 					if ((cp-side) < o)
221 					{
222 						cp = buff;
223 						nchars += (cp-side) - 1;
224 					}
225 					else
226 						cp = buff + (cp-side) - o;
227 				}
228 				else
229 				{
230 					cp++;
231 					x = -1;
232 				}
233 				if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET))
234 					eline = invalid(file, nlines);
235 			}
236 			else
237 				cp += n ? n : 1;
238 			if (x == '\n')
239 			{
240 				if ((nchars - longest) > wp->longest)
241 					wp->longest = nchars - longest;
242 				longest = nchars + 1;
243 				nlines++;
244 				lasttype = 1;
245 			}
246 			else if (iswspace(x))
247 				lasttype = 1;
248 			else if (lasttype)
249 			{
250 				lasttype = 0;
251 				nwords++;
252 			}
253 			nchars++;
254 		}
255 		if (!(wp->mode & WC_MBYTE))
256 			nchars = nbytes;
257 	}
258 	else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
259 	{
260 		if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
261 		{
262 			while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
263 			{
264 				nchars += c;
265 				endbuff = cp + c;
266 				if (*--endbuff == '\n')
267 					nlines++;
268 				else
269 					*endbuff = '\n';
270 				for (;;)
271 					if (*cp++ == '\n')
272 					{
273 						if (cp > endbuff)
274 							break;
275 						nlines++;
276 					}
277 			}
278 		}
279 		else
280 		{
281 			while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
282 			{
283 				nchars += c;
284 				/* check to see whether first character terminates word */
285 				if (c==1)
286 				{
287 					if (eol(lasttype))
288 						nlines++;
289 					if ((c = type[*cp]) && !lasttype)
290 						nwords++;
291 					lasttype = c;
292 					continue;
293 				}
294 				if (!lasttype && type[*cp])
295 					nwords++;
296 				lastchar = cp[--c];
297 				*(endbuff = cp+c) = '\n';
298 				c = lasttype;
299 				/* process each buffer */
300 				for (;;)
301 				{
302 					/* process spaces and new-lines */
303 					do
304 					{
305 						if (eol(c))
306 							for (;;)
307 							{
308 								/* check for end of buffer */
309 								if (cp > endbuff)
310 									goto beob;
311 								nlines++;
312 								if (*cp != '\n')
313 									break;
314 								cp++;
315 							}
316 					} while (c = type[*cp++]);
317 					/* skip over word characters */
318 					while (!(c = type[*cp++]));
319 					nwords++;
320 				}
321 			beob:
322 				if ((cp -= 2) >= buff)
323 					c = type[*cp];
324 				else
325 					c = lasttype;
326 				lasttype = type[lastchar];
327 				/* see if was in word */
328 				if (!c && !lasttype)
329 					nwords--;
330 			}
331 			if (eol(lasttype))
332 				nlines++;
333 			else if (!lasttype)
334 				nwords++;
335 		}
336 	}
337 	else
338 	{
339 		int		lineoff=0;
340 		int		skip=0;
341 		int		adjust=0;
342 		int		state=0;
343 		int		oldc;
344 		int		xspace;
345 		int		wasspace = 1;
346 		unsigned char*	start;
347 		int             flagm = 0;
348 
349 
350 		lastchar = 0;
351 		start = (endbuff = side) + 1;
352 		xspace = iswspace(0xa0) || iswspace(0x85);
353 		while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
354 		{
355 			nbytes += c;
356 			nchars += c;
357 			start = cp-lineoff;
358 			/* check to see whether first character terminates word */
359 			if(c==1)
360 			{
361 				if(eol(lasttype))
362 					nlines++;
363 				if((c = type[*cp]) && !lasttype)
364 					nwords++;
365 				lasttype = c;
366 				endbuff = start;
367 				continue;
368 			}
369 			lastchar = cp[--c];
370 			endbuff = cp+c;
371 			cp[c] = '\n';
372 			if(mbc(lasttype))
373 			{
374 				c = lasttype;
375 				flagm = 1;
376 				goto mbyte;
377 			}
378 			if(!lasttype && spc(type[*cp]))
379 				nwords++;
380 			c = lasttype;
381 			/* process each buffer */
382 			for (;;)
383 			{
384 				/* process spaces and new-lines */
385 			spaces:
386 				do
387 				{
388 					if (eol(c))
389 					{
390 						/* check for end of buffer */
391 						if (cp > endbuff)
392 							goto eob;
393 						if(wp->mode&WC_LONGEST)
394 						{
395 							if((cp-start)-adjust > longest)
396 								longest = (cp-start)-adjust-1;
397 							start = cp;
398 						}
399 						nlines++;
400 						nchars -= adjust;
401 						adjust = 0;
402 					}
403 				} while (spc(c = type[*cp++]));
404 				wasspace=1;
405 				if(mbc(c))
406 				{
407 				mbyte:
408 					do
409 					{
410 						if(c&WC_ERR)
411 							goto err;
412 						if(skip && (c&7))
413 							break;
414 						if(!skip)
415 						{
416 							if(!(c&7))
417 							{
418 								skip=1;
419 								break;
420 							}
421 							skip = (c&7);
422 							adjust += skip;
423 							state = 0;
424 							if (flagm == 1) {
425 								flagm = 0;
426 								oldc = *cp;
427 								if (xspace && (
428 								    iswspace
429 								    (*cp)
430 								    == 1)) {
431 									state
432 									    = 8;
433 								}
434 								continue;
435 							}
436 							if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3)))
437 								oldc = *cp;
438 							else if(xspace && cp[-1]==0xc2)
439 							{
440 								state = 8;
441 								oldc = *cp;
442 							}
443 						}
444 						else
445 						{
446 							skip--;
447 							if(state && (state=chkstate(state,oldc)))
448 							{
449 								if(state==10)
450 								{
451 									if(!wasspace)
452 										nwords++;
453 									wasspace = 1;
454 									state=0;
455 									goto spaces;
456 								}
457 								oldc = *cp;
458 							}
459 						}
460 					} while (mbc(c = type[*cp++]));
461 					wasspace = 0;
462 					if(skip)
463 					{
464 						if(eol(c) && (cp > endbuff))
465 							goto eob;
466 				err:
467 						skip = 0;
468 						state = 0;
469 						if(eline!=nlines && !(wp->mode & WC_QUIET))
470 							eline = invalid(file, nlines);
471 						while(mbc(c) && ((c|WC_ERR) || (c&7)==0))
472 							c=type[*cp++];
473 						if(eol(c) && (cp > endbuff))
474 						{
475 							c = WC_MB|WC_ERR;
476 							goto eob;
477 						}
478 						if(mbc(c))
479 							goto mbyte;
480 						else if(c&WC_SP)
481 							goto spaces;
482 					}
483 					if(spc(c))
484 					{
485 						nwords++;
486 						continue;
487 					}
488 				}
489 				/* skip over word characters */
490 				while(!(c = type[*cp++]));
491 				if(mbc(c))
492 					goto mbyte;
493 				nwords++;
494 			}
495 		eob:
496 			lineoff = cp-start;
497 			if((cp -= 2) >= buff)
498 				c = type[*cp];
499 			else
500 				c = lasttype;
501 			lasttype = type[lastchar];
502 			/* see if was in word */
503 			if(!c && !lasttype)
504 				nwords--;
505 		}
506 		if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest)
507 			longest = (endbuff + 1 - start) - adjust - (lastchar == '\n');
508 		wp->longest = longest;
509 		if (eol(lasttype))
510 			nlines++;
511 		else if (!lasttype)
512 			nwords++;
513 		if (wp->mode & WC_MBYTE)
514 			nchars -= adjust;
515 		else
516 			nchars = nbytes;
517 	}
518 	wp->chars = nchars;
519 	wp->words = nwords;
520 	wp->lines = nlines;
521 	return 0;
522 }
523 
524