xref: /illumos-gate/usr/src/lib/libc/port/locale/utf8.c (revision eda3ef2d)
14297a3b0SGarrett D'Amore /*
22d08521bSGarrett D'Amore  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3475b496bSGarrett D'Amore  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
44297a3b0SGarrett D'Amore  * Copyright (c) 2002-2004 Tim J. Robbins
54297a3b0SGarrett D'Amore  * All rights reserved.
64297a3b0SGarrett D'Amore  *
74297a3b0SGarrett D'Amore  * Redistribution and use in source and binary forms, with or without
84297a3b0SGarrett D'Amore  * modification, are permitted provided that the following conditions
94297a3b0SGarrett D'Amore  * are met:
104297a3b0SGarrett D'Amore  * 1. Redistributions of source code must retain the above copyright
114297a3b0SGarrett D'Amore  *    notice, this list of conditions and the following disclaimer.
124297a3b0SGarrett D'Amore  * 2. Redistributions in binary form must reproduce the above copyright
134297a3b0SGarrett D'Amore  *    notice, this list of conditions and the following disclaimer in the
144297a3b0SGarrett D'Amore  *    documentation and/or other materials provided with the distribution.
154297a3b0SGarrett D'Amore  *
164297a3b0SGarrett D'Amore  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
174297a3b0SGarrett D'Amore  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
184297a3b0SGarrett D'Amore  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
194297a3b0SGarrett D'Amore  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
204297a3b0SGarrett D'Amore  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
214297a3b0SGarrett D'Amore  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
224297a3b0SGarrett D'Amore  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
234297a3b0SGarrett D'Amore  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
244297a3b0SGarrett D'Amore  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
254297a3b0SGarrett D'Amore  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
264297a3b0SGarrett D'Amore  * SUCH DAMAGE.
274297a3b0SGarrett D'Amore  */
284297a3b0SGarrett D'Amore 
294297a3b0SGarrett D'Amore #include "lint.h"
304297a3b0SGarrett D'Amore #include <errno.h>
314297a3b0SGarrett D'Amore #include <limits.h>
324297a3b0SGarrett D'Amore #include <stdlib.h>
334297a3b0SGarrett D'Amore #include <string.h>
344297a3b0SGarrett D'Amore #include <wchar.h>
354297a3b0SGarrett D'Amore #include "mblocal.h"
362d08521bSGarrett D'Amore #include "lctype.h"
374297a3b0SGarrett D'Amore 
384297a3b0SGarrett D'Amore static size_t	_UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
394297a3b0SGarrett D'Amore 		    const char *_RESTRICT_KYWD,
40*0ac311baSRobert Mustacchi 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
414297a3b0SGarrett D'Amore static int	_UTF8_mbsinit(const mbstate_t *);
424297a3b0SGarrett D'Amore static size_t	_UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
434297a3b0SGarrett D'Amore 		    const char **_RESTRICT_KYWD, size_t, size_t,
444297a3b0SGarrett D'Amore 		    mbstate_t *_RESTRICT_KYWD);
454297a3b0SGarrett D'Amore static size_t	_UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
464297a3b0SGarrett D'Amore 		    mbstate_t *_RESTRICT_KYWD);
474297a3b0SGarrett D'Amore static size_t	_UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
484297a3b0SGarrett D'Amore 		    const wchar_t **_RESTRICT_KYWD,
494297a3b0SGarrett D'Amore 		    size_t, size_t, mbstate_t *_RESTRICT_KYWD);
504297a3b0SGarrett D'Amore 
512d08521bSGarrett D'Amore void
_UTF8_init(struct lc_ctype * lct)522d08521bSGarrett D'Amore _UTF8_init(struct lc_ctype *lct)
534297a3b0SGarrett D'Amore {
542d08521bSGarrett D'Amore 	lct->lc_mbrtowc = _UTF8_mbrtowc;
552d08521bSGarrett D'Amore 	lct->lc_wcrtomb = _UTF8_wcrtomb;
562d08521bSGarrett D'Amore 	lct->lc_mbsinit = _UTF8_mbsinit;
572d08521bSGarrett D'Amore 	lct->lc_mbsnrtowcs = _UTF8_mbsnrtowcs;
582d08521bSGarrett D'Amore 	lct->lc_wcsnrtombs = _UTF8_wcsnrtombs;
592d08521bSGarrett D'Amore 	lct->lc_is_ascii = 0;
602d08521bSGarrett D'Amore 	lct->lc_max_mblen = 4;
614297a3b0SGarrett D'Amore }
624297a3b0SGarrett D'Amore 
634297a3b0SGarrett D'Amore static int
_UTF8_mbsinit(const mbstate_t * ps)644297a3b0SGarrett D'Amore _UTF8_mbsinit(const mbstate_t *ps)
654297a3b0SGarrett D'Amore {
664297a3b0SGarrett D'Amore 
674297a3b0SGarrett D'Amore 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
684297a3b0SGarrett D'Amore }
694297a3b0SGarrett D'Amore 
704297a3b0SGarrett D'Amore static size_t
_UTF8_mbrtowc(wchar_t * _RESTRICT_KYWD pwc,const char * _RESTRICT_KYWD s,size_t n,mbstate_t * _RESTRICT_KYWD ps,boolean_t zero)714297a3b0SGarrett D'Amore _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
72*0ac311baSRobert Mustacchi     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
734297a3b0SGarrett D'Amore {
744297a3b0SGarrett D'Amore 	_UTF8State *us;
754297a3b0SGarrett D'Amore 	int ch, i, mask, want;
764297a3b0SGarrett D'Amore 	wchar_t lbound, wch;
774297a3b0SGarrett D'Amore 
784297a3b0SGarrett D'Amore 	us = (_UTF8State *)ps;
794297a3b0SGarrett D'Amore 
804297a3b0SGarrett D'Amore 	if (us->want < 0 || us->want > 6) {
814297a3b0SGarrett D'Amore 		errno = EINVAL;
824297a3b0SGarrett D'Amore 		return ((size_t)-1);
834297a3b0SGarrett D'Amore 	}
844297a3b0SGarrett D'Amore 
854297a3b0SGarrett D'Amore 	if (s == NULL) {
864297a3b0SGarrett D'Amore 		s = "";
874297a3b0SGarrett D'Amore 		n = 1;
884297a3b0SGarrett D'Amore 		pwc = NULL;
894297a3b0SGarrett D'Amore 	}
904297a3b0SGarrett D'Amore 
914297a3b0SGarrett D'Amore 	if (n == 0)
924297a3b0SGarrett D'Amore 		/* Incomplete multibyte sequence */
934297a3b0SGarrett D'Amore 		return ((size_t)-2);
944297a3b0SGarrett D'Amore 
954297a3b0SGarrett D'Amore 	if (us->want == 0) {
964297a3b0SGarrett D'Amore 		/*
974297a3b0SGarrett D'Amore 		 * Determine the number of octets that make up this character
984297a3b0SGarrett D'Amore 		 * from the first octet, and a mask that extracts the
994297a3b0SGarrett D'Amore 		 * interesting bits of the first octet. We already know
1004297a3b0SGarrett D'Amore 		 * the character is at least two bytes long.
1014297a3b0SGarrett D'Amore 		 *
1024297a3b0SGarrett D'Amore 		 * We also specify a lower bound for the character code to
1034297a3b0SGarrett D'Amore 		 * detect redundant, non-"shortest form" encodings. For
1044297a3b0SGarrett D'Amore 		 * example, the sequence C0 80 is _not_ a legal representation
1054297a3b0SGarrett D'Amore 		 * of the null character. This enforces a 1-to-1 mapping
1064297a3b0SGarrett D'Amore 		 * between character codes and their multibyte representations.
1074297a3b0SGarrett D'Amore 		 */
1084297a3b0SGarrett D'Amore 		ch = (unsigned char)*s;
1094297a3b0SGarrett D'Amore 		if ((ch & 0x80) == 0) {
110475b496bSGarrett D'Amore 			/* Fast path for plain ASCII characters. */
111475b496bSGarrett D'Amore 			if (pwc != NULL)
112475b496bSGarrett D'Amore 				*pwc = ch;
113*0ac311baSRobert Mustacchi 			if (zero || ch != '\0') {
114*0ac311baSRobert Mustacchi 				return (1);
115*0ac311baSRobert Mustacchi 			} else {
116*0ac311baSRobert Mustacchi 				return (0);
117*0ac311baSRobert Mustacchi 			}
118475b496bSGarrett D'Amore 		}
119475b496bSGarrett D'Amore 		if ((ch & 0xe0) == 0xc0) {
1204297a3b0SGarrett D'Amore 			mask = 0x1f;
1214297a3b0SGarrett D'Amore 			want = 2;
1224297a3b0SGarrett D'Amore 			lbound = 0x80;
1234297a3b0SGarrett D'Amore 		} else if ((ch & 0xf0) == 0xe0) {
1244297a3b0SGarrett D'Amore 			mask = 0x0f;
1254297a3b0SGarrett D'Amore 			want = 3;
1264297a3b0SGarrett D'Amore 			lbound = 0x800;
1274297a3b0SGarrett D'Amore 		} else if ((ch & 0xf8) == 0xf0) {
1284297a3b0SGarrett D'Amore 			mask = 0x07;
1294297a3b0SGarrett D'Amore 			want = 4;
1304297a3b0SGarrett D'Amore 			lbound = 0x10000;
1314297a3b0SGarrett D'Amore #if 0
1324297a3b0SGarrett D'Amore 		/* These would be illegal in the UTF-8 space */
1334297a3b0SGarrett D'Amore 
1344297a3b0SGarrett D'Amore 		} else if ((ch & 0xfc) == 0xf8) {
1354297a3b0SGarrett D'Amore 			mask = 0x03;
1364297a3b0SGarrett D'Amore 			want = 5;
1374297a3b0SGarrett D'Amore 			lbound = 0x200000;
1384297a3b0SGarrett D'Amore 		} else if ((ch & 0xfe) == 0xfc) {
1394297a3b0SGarrett D'Amore 			mask = 0x01;
1404297a3b0SGarrett D'Amore 			want = 6;
1414297a3b0SGarrett D'Amore 			lbound = 0x4000000;
1424297a3b0SGarrett D'Amore #endif
1434297a3b0SGarrett D'Amore 		} else {
1444297a3b0SGarrett D'Amore 			/*
1454297a3b0SGarrett D'Amore 			 * Malformed input; input is not UTF-8.
1464297a3b0SGarrett D'Amore 			 */
1474297a3b0SGarrett D'Amore 			errno = EILSEQ;
1484297a3b0SGarrett D'Amore 			return ((size_t)-1);
1494297a3b0SGarrett D'Amore 		}
1504297a3b0SGarrett D'Amore 	} else {
1514297a3b0SGarrett D'Amore 		want = us->want;
1524297a3b0SGarrett D'Amore 		lbound = us->lbound;
1534297a3b0SGarrett D'Amore 	}
1544297a3b0SGarrett D'Amore 
1554297a3b0SGarrett D'Amore 	/*
1564297a3b0SGarrett D'Amore 	 * Decode the octet sequence representing the character in chunks
1574297a3b0SGarrett D'Amore 	 * of 6 bits, most significant first.
1584297a3b0SGarrett D'Amore 	 */
1594297a3b0SGarrett D'Amore 	if (us->want == 0)
1604297a3b0SGarrett D'Amore 		wch = (unsigned char)*s++ & mask;
1614297a3b0SGarrett D'Amore 	else
1624297a3b0SGarrett D'Amore 		wch = us->ch;
1634297a3b0SGarrett D'Amore 
1644297a3b0SGarrett D'Amore 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
1654297a3b0SGarrett D'Amore 		if ((*s & 0xc0) != 0x80) {
1664297a3b0SGarrett D'Amore 			/*
1674297a3b0SGarrett D'Amore 			 * Malformed input; bad characters in the middle
1684297a3b0SGarrett D'Amore 			 * of a character.
1694297a3b0SGarrett D'Amore 			 */
1704297a3b0SGarrett D'Amore 			errno = EILSEQ;
1714297a3b0SGarrett D'Amore 			return ((size_t)-1);
1724297a3b0SGarrett D'Amore 		}
1734297a3b0SGarrett D'Amore 		wch <<= 6;
1744297a3b0SGarrett D'Amore 		wch |= *s++ & 0x3f;
1754297a3b0SGarrett D'Amore 	}
1764297a3b0SGarrett D'Amore 	if (i < want) {
1774297a3b0SGarrett D'Amore 		/* Incomplete multibyte sequence. */
1784297a3b0SGarrett D'Amore 		us->want = want - i;
1794297a3b0SGarrett D'Amore 		us->lbound = lbound;
1804297a3b0SGarrett D'Amore 		us->ch = wch;
1814297a3b0SGarrett D'Amore 		return ((size_t)-2);
1824297a3b0SGarrett D'Amore 	}
1834297a3b0SGarrett D'Amore 	if (wch < lbound) {
1844297a3b0SGarrett D'Amore 		/*
1854297a3b0SGarrett D'Amore 		 * Malformed input; redundant encoding.
1864297a3b0SGarrett D'Amore 		 */
1874297a3b0SGarrett D'Amore 		errno = EILSEQ;
1884297a3b0SGarrett D'Amore 		return ((size_t)-1);
1894297a3b0SGarrett D'Amore 	}
1904297a3b0SGarrett D'Amore 	if (pwc != NULL)
1914297a3b0SGarrett D'Amore 		*pwc = wch;
1924297a3b0SGarrett D'Amore 	us->want = 0;
193*0ac311baSRobert Mustacchi 	if (zero || wch != L'\0') {
194*0ac311baSRobert Mustacchi 		return (want);
195*0ac311baSRobert Mustacchi 	} else {
196*0ac311baSRobert Mustacchi 		return (0);
197*0ac311baSRobert Mustacchi 	}
1984297a3b0SGarrett D'Amore }
1994297a3b0SGarrett D'Amore 
2004297a3b0SGarrett D'Amore static size_t
_UTF8_mbsnrtowcs(wchar_t * _RESTRICT_KYWD dst,const char ** _RESTRICT_KYWD src,size_t nms,size_t len,mbstate_t * _RESTRICT_KYWD ps)2014297a3b0SGarrett D'Amore _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
2024297a3b0SGarrett D'Amore     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
2034297a3b0SGarrett D'Amore {
2044297a3b0SGarrett D'Amore 	_UTF8State *us;
2054297a3b0SGarrett D'Amore 	const char *s;
2064297a3b0SGarrett D'Amore 	size_t nchr;
2074297a3b0SGarrett D'Amore 	wchar_t wc;
2084297a3b0SGarrett D'Amore 	size_t nb;
2094297a3b0SGarrett D'Amore 
2104297a3b0SGarrett D'Amore 	us = (_UTF8State *)ps;
2114297a3b0SGarrett D'Amore 
2124297a3b0SGarrett D'Amore 	s = *src;
2134297a3b0SGarrett D'Amore 	nchr = 0;
2144297a3b0SGarrett D'Amore 
2154297a3b0SGarrett D'Amore 	if (dst == NULL) {
2164297a3b0SGarrett D'Amore 		/*
2174297a3b0SGarrett D'Amore 		 * The fast path in the loop below is not safe if an ASCII
2184297a3b0SGarrett D'Amore 		 * character appears as anything but the first byte of a
2194297a3b0SGarrett D'Amore 		 * multibyte sequence. Check now to avoid doing it in the loop.
2204297a3b0SGarrett D'Amore 		 */
2214297a3b0SGarrett D'Amore 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
2224297a3b0SGarrett D'Amore 			errno = EILSEQ;
2234297a3b0SGarrett D'Amore 			return ((size_t)-1);
2244297a3b0SGarrett D'Amore 		}
2254297a3b0SGarrett D'Amore 		for (;;) {
226*0ac311baSRobert Mustacchi 			if (nms > 0 && (signed char)*s > 0) {
2274297a3b0SGarrett D'Amore 				/*
2284297a3b0SGarrett D'Amore 				 * Fast path for plain ASCII characters
2294297a3b0SGarrett D'Amore 				 * excluding NUL.
2304297a3b0SGarrett D'Amore 				 */
2314297a3b0SGarrett D'Amore 				nb = 1;
232*0ac311baSRobert Mustacchi 			} else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps,
233*0ac311baSRobert Mustacchi 			    B_FALSE)) == (size_t)-1) {
2344297a3b0SGarrett D'Amore 				/* Invalid sequence - mbrtowc() sets errno. */
2354297a3b0SGarrett D'Amore 				return ((size_t)-1);
236*0ac311baSRobert Mustacchi 			} else if (nb == 0 || nb == (size_t)-2) {
2374297a3b0SGarrett D'Amore 				return (nchr);
238*0ac311baSRobert Mustacchi 			}
2394297a3b0SGarrett D'Amore 			s += nb;
2404297a3b0SGarrett D'Amore 			nms -= nb;
2414297a3b0SGarrett D'Amore 			nchr++;
2424297a3b0SGarrett D'Amore 		}
2434297a3b0SGarrett D'Amore 		/*NOTREACHED*/
2444297a3b0SGarrett D'Amore 	}
2454297a3b0SGarrett D'Amore 
2464297a3b0SGarrett D'Amore 	/*
2474297a3b0SGarrett D'Amore 	 * The fast path in the loop below is not safe if an ASCII
2484297a3b0SGarrett D'Amore 	 * character appears as anything but the first byte of a
2494297a3b0SGarrett D'Amore 	 * multibyte sequence. Check now to avoid doing it in the loop.
2504297a3b0SGarrett D'Amore 	 */
2514297a3b0SGarrett D'Amore 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
2524297a3b0SGarrett D'Amore 		errno = EILSEQ;
2534297a3b0SGarrett D'Amore 		return ((size_t)-1);
2544297a3b0SGarrett D'Amore 	}
2554297a3b0SGarrett D'Amore 	while (len-- > 0) {
2564297a3b0SGarrett D'Amore 		if (nms > 0 && (signed char)*s > 0) {
2574297a3b0SGarrett D'Amore 			/*
2584297a3b0SGarrett D'Amore 			 * Fast path for plain ASCII characters
2594297a3b0SGarrett D'Amore 			 * excluding NUL.
2604297a3b0SGarrett D'Amore 			 */
2614297a3b0SGarrett D'Amore 			*dst = (wchar_t)*s;
2624297a3b0SGarrett D'Amore 			nb = 1;
263*0ac311baSRobert Mustacchi 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, B_FALSE)) ==
2644297a3b0SGarrett D'Amore 		    (size_t)-1) {
2654297a3b0SGarrett D'Amore 			*src = s;
2664297a3b0SGarrett D'Amore 			return ((size_t)-1);
2674297a3b0SGarrett D'Amore 		} else if (nb == (size_t)-2) {
2684297a3b0SGarrett D'Amore 			*src = s + nms;
2694297a3b0SGarrett D'Amore 			return (nchr);
2704297a3b0SGarrett D'Amore 		} else if (nb == 0) {
2714297a3b0SGarrett D'Amore 			*src = NULL;
2724297a3b0SGarrett D'Amore 			return (nchr);
2734297a3b0SGarrett D'Amore 		}
2744297a3b0SGarrett D'Amore 		s += nb;
2754297a3b0SGarrett D'Amore 		nms -= nb;
2764297a3b0SGarrett D'Amore 		nchr++;
2774297a3b0SGarrett D'Amore 		dst++;
2784297a3b0SGarrett D'Amore 	}
2794297a3b0SGarrett D'Amore 	*src = s;
2804297a3b0SGarrett D'Amore 	return (nchr);
2814297a3b0SGarrett D'Amore }
2824297a3b0SGarrett D'Amore 
2834297a3b0SGarrett D'Amore static size_t
_UTF8_wcrtomb(char * _RESTRICT_KYWD s,wchar_t wc,mbstate_t * _RESTRICT_KYWD ps)2844297a3b0SGarrett D'Amore _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
2854297a3b0SGarrett D'Amore {
2864297a3b0SGarrett D'Amore 	_UTF8State *us;
2874297a3b0SGarrett D'Amore 	unsigned char lead;
2884297a3b0SGarrett D'Amore 	int i, len;
2894297a3b0SGarrett D'Amore 
2904297a3b0SGarrett D'Amore 	us = (_UTF8State *)ps;
2914297a3b0SGarrett D'Amore 
2924297a3b0SGarrett D'Amore 	if (us->want != 0) {
2934297a3b0SGarrett D'Amore 		errno = EINVAL;
2944297a3b0SGarrett D'Amore 		return ((size_t)-1);
2954297a3b0SGarrett D'Amore 	}
2964297a3b0SGarrett D'Amore 
2974297a3b0SGarrett D'Amore 	if (s == NULL)
2984297a3b0SGarrett D'Amore 		/* Reset to initial shift state (no-op) */
2994297a3b0SGarrett D'Amore 		return (1);
3004297a3b0SGarrett D'Amore 
3014297a3b0SGarrett D'Amore 	/*
3024297a3b0SGarrett D'Amore 	 * Determine the number of octets needed to represent this character.
3034297a3b0SGarrett D'Amore 	 * We always output the shortest sequence possible. Also specify the
3044297a3b0SGarrett D'Amore 	 * first few bits of the first octet, which contains the information
3054297a3b0SGarrett D'Amore 	 * about the sequence length.
3064297a3b0SGarrett D'Amore 	 */
3074297a3b0SGarrett D'Amore 	if ((wc & ~0x7f) == 0) {
308475b496bSGarrett D'Amore 		/* Fast path for plain ASCII characters. */
309475b496bSGarrett D'Amore 		*s = (char)wc;
310475b496bSGarrett D'Amore 		return (1);
3114297a3b0SGarrett D'Amore 	} else if ((wc & ~0x7ff) == 0) {
3124297a3b0SGarrett D'Amore 		lead = 0xc0;
3134297a3b0SGarrett D'Amore 		len = 2;
3144297a3b0SGarrett D'Amore 	} else if ((wc & ~0xffff) == 0) {
3154297a3b0SGarrett D'Amore 		lead = 0xe0;
3164297a3b0SGarrett D'Amore 		len = 3;
3174297a3b0SGarrett D'Amore 	} else if ((wc & ~0x1fffff) == 0) {
3184297a3b0SGarrett D'Amore 		lead = 0xf0;
3194297a3b0SGarrett D'Amore 		len = 4;
3204297a3b0SGarrett D'Amore #if 0
3214297a3b0SGarrett D'Amore 	/* Again, 5 and 6 byte encodings are simply not permitted */
3224297a3b0SGarrett D'Amore 	} else if ((wc & ~0x3ffffff) == 0) {
3234297a3b0SGarrett D'Amore 		lead = 0xf8;
3244297a3b0SGarrett D'Amore 		len = 5;
3254297a3b0SGarrett D'Amore 	} else if ((wc & ~0x7fffffff) == 0) {
3264297a3b0SGarrett D'Amore 		lead = 0xfc;
3274297a3b0SGarrett D'Amore 		len = 6;
3284297a3b0SGarrett D'Amore #endif
3294297a3b0SGarrett D'Amore 	} else {
3304297a3b0SGarrett D'Amore 		errno = EILSEQ;
3314297a3b0SGarrett D'Amore 		return ((size_t)-1);
3324297a3b0SGarrett D'Amore 	}
3334297a3b0SGarrett D'Amore 
3344297a3b0SGarrett D'Amore 	/*
3354297a3b0SGarrett D'Amore 	 * Output the octets representing the character in chunks
3364297a3b0SGarrett D'Amore 	 * of 6 bits, least significant last. The first octet is
3374297a3b0SGarrett D'Amore 	 * a special case because it contains the sequence length
3384297a3b0SGarrett D'Amore 	 * information.
3394297a3b0SGarrett D'Amore 	 */
3404297a3b0SGarrett D'Amore 	for (i = len - 1; i > 0; i--) {
3414297a3b0SGarrett D'Amore 		s[i] = (wc & 0x3f) | 0x80;
3424297a3b0SGarrett D'Amore 		wc >>= 6;
3434297a3b0SGarrett D'Amore 	}
3444297a3b0SGarrett D'Amore 	*s = (wc & 0xff) | lead;
3454297a3b0SGarrett D'Amore 
3464297a3b0SGarrett D'Amore 	return (len);
3474297a3b0SGarrett D'Amore }
3484297a3b0SGarrett D'Amore 
3494297a3b0SGarrett D'Amore static size_t
_UTF8_wcsnrtombs(char * _RESTRICT_KYWD dst,const wchar_t ** _RESTRICT_KYWD src,size_t nwc,size_t len,mbstate_t * _RESTRICT_KYWD ps)3504297a3b0SGarrett D'Amore _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
3514297a3b0SGarrett D'Amore     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
3524297a3b0SGarrett D'Amore {
3534297a3b0SGarrett D'Amore 	_UTF8State *us;
3544297a3b0SGarrett D'Amore 	char buf[MB_LEN_MAX];
3554297a3b0SGarrett D'Amore 	const wchar_t *s;
3564297a3b0SGarrett D'Amore 	size_t nbytes;
3574297a3b0SGarrett D'Amore 	size_t nb;
3584297a3b0SGarrett D'Amore 
3594297a3b0SGarrett D'Amore 	us = (_UTF8State *)ps;
3604297a3b0SGarrett D'Amore 
3614297a3b0SGarrett D'Amore 	if (us->want != 0) {
3624297a3b0SGarrett D'Amore 		errno = EINVAL;
3634297a3b0SGarrett D'Amore 		return ((size_t)-1);
3644297a3b0SGarrett D'Amore 	}
3654297a3b0SGarrett D'Amore 
3664297a3b0SGarrett D'Amore 	s = *src;
3674297a3b0SGarrett D'Amore 	nbytes = 0;
3684297a3b0SGarrett D'Amore 
3694297a3b0SGarrett D'Amore 	if (dst == NULL) {
3704297a3b0SGarrett D'Amore 		while (nwc-- > 0) {
3714297a3b0SGarrett D'Amore 			if (0 <= *s && *s < 0x80)
3724297a3b0SGarrett D'Amore 				/* Fast path for plain ASCII characters. */
3734297a3b0SGarrett D'Amore 				nb = 1;
3744297a3b0SGarrett D'Amore 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
3754297a3b0SGarrett D'Amore 			    (size_t)-1)
3764297a3b0SGarrett D'Amore 				/* Invalid character - wcrtomb() sets errno. */
3774297a3b0SGarrett D'Amore 				return ((size_t)-1);
3784297a3b0SGarrett D'Amore 			if (*s == L'\0')
3794297a3b0SGarrett D'Amore 				return (nbytes + nb - 1);
3804297a3b0SGarrett D'Amore 			s++;
3814297a3b0SGarrett D'Amore 			nbytes += nb;
3824297a3b0SGarrett D'Amore 		}
3834297a3b0SGarrett D'Amore 		return (nbytes);
3844297a3b0SGarrett D'Amore 	}
3854297a3b0SGarrett D'Amore 
3864297a3b0SGarrett D'Amore 	while (len > 0 && nwc-- > 0) {
3874297a3b0SGarrett D'Amore 		if (0 <= *s && *s < 0x80) {
3884297a3b0SGarrett D'Amore 			/* Fast path for plain ASCII characters. */
3894297a3b0SGarrett D'Amore 			nb = 1;
3904297a3b0SGarrett D'Amore 			*dst = *s;
3914297a3b0SGarrett D'Amore 		} else if (len > (size_t)MB_CUR_MAX) {
3924297a3b0SGarrett D'Amore 			/* Enough space to translate in-place. */
3934297a3b0SGarrett D'Amore 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
3944297a3b0SGarrett D'Amore 				*src = s;
3954297a3b0SGarrett D'Amore 				return ((size_t)-1);
3964297a3b0SGarrett D'Amore 			}
3974297a3b0SGarrett D'Amore 		} else {
3984297a3b0SGarrett D'Amore 			/*
3994297a3b0SGarrett D'Amore 			 * May not be enough space; use temp. buffer.
4004297a3b0SGarrett D'Amore 			 */
4014297a3b0SGarrett D'Amore 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
4024297a3b0SGarrett D'Amore 				*src = s;
4034297a3b0SGarrett D'Amore 				return ((size_t)-1);
4044297a3b0SGarrett D'Amore 			}
4054297a3b0SGarrett D'Amore 			if (nb > (int)len)
4064297a3b0SGarrett D'Amore 				/* MB sequence for character won't fit. */
4074297a3b0SGarrett D'Amore 				break;
4084297a3b0SGarrett D'Amore 			(void) memcpy(dst, buf, nb);
4094297a3b0SGarrett D'Amore 		}
4104297a3b0SGarrett D'Amore 		if (*s == L'\0') {
4114297a3b0SGarrett D'Amore 			*src = NULL;
4124297a3b0SGarrett D'Amore 			return (nbytes + nb - 1);
4134297a3b0SGarrett D'Amore 		}
4144297a3b0SGarrett D'Amore 		s++;
4154297a3b0SGarrett D'Amore 		dst += nb;
4164297a3b0SGarrett D'Amore 		len -= nb;
4174297a3b0SGarrett D'Amore 		nbytes += nb;
4184297a3b0SGarrett D'Amore 	}
4194297a3b0SGarrett D'Amore 	*src = s;
4204297a3b0SGarrett D'Amore 	return (nbytes);
4214297a3b0SGarrett D'Amore }
422