From 02f4f60ad5d433a663771c65ce3f99a8f82e95ae Mon Sep 17 00:00:00 2001
From: "Tim J. Robbins" <tjr@FreeBSD.org>
Date: Sun, 2 Nov 2003 10:09:33 +0000
Subject: Convert the Big5, EUC, MSKanji and UTF-8 encoding methods to
 implement mbrtowc() and wcrtomb() directly. GB18030, GBK and UTF2 are left
 unconverted; GB18030 will be done eventually, but GBK and UTF2 may just be
 removed, as they are subsets of GB18030 and UTF-8 respectively.

---
 lib/libc/locale/big5.c    | 103 +++++++++++++++----------------
 lib/libc/locale/euc.c     | 152 +++++++++++++++++++++-------------------------
 lib/libc/locale/mskanji.c |  92 ++++++++++++++--------------
 lib/libc/locale/utf8.c    | 140 +++++++++++++++++++++---------------------
 4 files changed, 233 insertions(+), 254 deletions(-)

(limited to 'lib/libc/locale')

diff --git a/lib/libc/locale/big5.c b/lib/libc/locale/big5.c
index 12cc312d29aa..7c0c98179b99 100644
--- a/lib/libc/locale/big5.c
+++ b/lib/libc/locale/big5.c
@@ -1,4 +1,5 @@
 /*-
+ * Copyright (c) 2002, 2003 Tim J. Robbins. All rights reserved.
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
  *
@@ -40,80 +41,76 @@ static char sccsid[] = "@(#)big5.c	8.1 (Berkeley) 6/4/93";
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <rune.h>
+#include <sys/types.h>
+#include <runetype.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <sys/types.h>
+#include <wchar.h>
+
+extern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict,
+    size_t, mbstate_t * __restrict);
+extern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict);
 
-rune_t	_BIG5_sgetrune(const char *, size_t, char const **);
-int	_BIG5_sputrune(rune_t, char *, size_t, char **);
+int	_BIG5_init(_RuneLocale *);
+size_t	_BIG5_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
+	    mbstate_t * __restrict);
+size_t	_BIG5_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
 
 int
-_BIG5_init(rl)
-	_RuneLocale *rl;
+_BIG5_init(_RuneLocale *rl)
 {
-	rl->sgetrune = _BIG5_sgetrune;
-	rl->sputrune = _BIG5_sputrune;
+
+	__mbrtowc = _BIG5_mbrtowc;
+	__wcrtomb = _BIG5_wcrtomb;
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = 2;
 	return (0);
 }
 
-static inline int
-_big5_check(c)
-	u_int c;
+static __inline int
+_big5_check(u_int c)
 {
+
 	c &= 0xff;
 	return ((c >= 0xa1 && c <= 0xfe) ? 2 : 1);
 }
 
-rune_t
-_BIG5_sgetrune(string, n, result)
-	const char *string;
-	size_t n;
-	char const **result;
+size_t
+_BIG5_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
+    mbstate_t * __restrict ps __unused)
 {
-	rune_t rune = 0;
-	int len;
+	wchar_t wc;
+	int i, len;
 
-	if (n < 1 || (len = _big5_check(*string)) > n) {
-		if (result)
-			*result = string;
-		return (_INVALID_RUNE);
-	}
-	while (--len >= 0)
-		rune = (rune << 8) | ((u_int)(*string++) & 0xff);
-	if (result)
-		*result = string;
-	return rune;
+	if (s == NULL)
+		/* Reset to initial shift state (no-op) */
+		return (0);
+	if (n == 0 || (size_t)(len = _big5_check(*s)) > n)
+		/* Incomplete multibyte sequence */
+		return ((size_t)-2);
+	wc = 0;
+	i = len;
+	while (i-- > 0)
+		wc = (wc << 8) | (unsigned char)*s++;
+	if (pwc != NULL)
+		*pwc = wc;
+	return (wc == L'\0' ? 0 : len);
 }
 
-int
-_BIG5_sputrune(c, string, n, result)
-	rune_t c;
-	char *string, **result;
-	size_t n;
+size_t
+_BIG5_wcrtomb(char * __restrict s, wchar_t wc,
+    mbstate_t * __restrict ps __unused)
 {
-	if (c & 0x8000) {
-		if (n >= 2) {
-			string[0] = (c >> 8) & 0xff;
-			string[1] = c & 0xff;
-			if (result)
-				*result = string + 2;
-			return (2);
-		}
-	}
-	else {
-		if (n >= 1) {
-			*string = c & 0xff;
-			if (result)
-				*result = string + 1;
-			return (1);
-		}
+
+	if (s == NULL)
+		/* Reset to initial shift state (no-op) */
+		return (1);
+	if (wc & 0x8000) {
+		*s++ = (wc >> 8) & 0xff;
+		*s = wc & 0xff;
+		return (2);
 	}
-	if (result)
-		*result = string;
-	return (0);
-	
+	*s = wc & 0xff;
+	return (1);
 }
diff --git a/lib/libc/locale/euc.c b/lib/libc/locale/euc.c
index 596d107a88ca..355e7a5731fa 100644
--- a/lib/libc/locale/euc.c
+++ b/lib/libc/locale/euc.c
@@ -1,4 +1,5 @@
 /*-
+ * Copyright (c) 2002, 2003 Tim J. Robbins. All rights reserved.
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
  *
@@ -43,32 +44,35 @@ __FBSDID("$FreeBSD$");
 #include <sys/types.h>
 
 #include <errno.h>
-#include <rune.h>
+#include <runetype.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <wchar.h>
 
-rune_t	_EUC_sgetrune(const char *, size_t, char const **);
-int	_EUC_sputrune(rune_t, char *, size_t, char **);
+extern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict,
+    size_t, mbstate_t * __restrict);
+extern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict);
+
+int	_EUC_init(_RuneLocale *);
+size_t	_EUC_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
+	    mbstate_t * __restrict);
+size_t	_EUC_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
 
 typedef struct {
 	int	count[4];
-	rune_t	bits[4];
-	rune_t	mask;
+	wchar_t	bits[4];
+	wchar_t	mask;
 } _EucInfo;
 
 int
-_EUC_init(rl)
-	_RuneLocale *rl;
+_EUC_init(_RuneLocale *rl)
 {
 	_EucInfo *ei;
 	int x, new__mb_cur_max;
 	char *v, *e;
 
-	rl->sgetrune = _EUC_sgetrune;
-	rl->sputrune = _EUC_sputrune;
-
 	if (rl->variable == NULL)
 		return (EFTYPE);
 
@@ -108,6 +112,8 @@ _EUC_init(rl)
 	rl->variable_len = sizeof(_EucInfo);
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = new__mb_cur_max;
+	__mbrtowc = _EUC_mbrtowc;
+	__wcrtomb = _EUC_wcrtomb;
 	return (0);
 }
 
@@ -118,105 +124,85 @@ _EUC_init(rl)
 
 #define	GR_BITS	0x80808080 /* XXX: to be fixed */
 
-static inline int
-_euc_set(c)
-	u_int c;
+static __inline int
+_euc_set(u_int c)
 {
 	c &= 0xff;
-
 	return ((c & 0x80) ? c == _SS3 ? 3 : c == _SS2 ? 2 : 1 : 0);
 }
-rune_t
-_EUC_sgetrune(string, n, result)
-	const char *string;
-	size_t n;
-	char const **result;
-{
-	rune_t rune = 0;
-	int len, set;
 
-	if (n < 1 || (len = CEI->count[set = _euc_set(*string)]) > n) {
-		if (result)
-			*result = string;
-		return (_INVALID_RUNE);
-	}
+size_t
+_EUC_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
+    mbstate_t * __restrict ps __unused)
+{
+	int len, remain, set;
+	wchar_t wc;
+
+	if (s == NULL)
+		/* Reset to initial shift state (no-op) */
+		return (0);
+	if (n == 0 || (size_t)(len = CEI->count[set = _euc_set(*s)]) > n)
+		/* Incomplete multibyte sequence */
+		return ((size_t)-2);
+	wc = 0;
+	remain = len;
 	switch (set) {
 	case 3:
 	case 2:
-		--len;
-		++string;
+		--remain;
+		++s;
 		/* FALLTHROUGH */
 	case 1:
 	case 0:
-		while (len-- > 0)
-			rune = (rune << 8) | ((u_int)(*string++) & 0xff);
+		while (remain-- > 0)
+			wc = (wc << 8) | (unsigned char)*s++;
 		break;
 	}
-	if (result)
-		*result = string;
-	return ((rune & ~CEI->mask) | CEI->bits[set]);
+	wc = (wc & ~CEI->mask) | CEI->bits[set];
+	if (pwc != NULL)
+		*pwc = wc;
+	return (wc == L'\0' ? 0 : len);
 }
 
-int
-_EUC_sputrune(c, string, n, result)
-	rune_t c;
-	char *string, **result;
-	size_t n;
+size_t
+_EUC_wcrtomb(char * __restrict s, wchar_t wc,
+    mbstate_t * __restrict ps __unused)
 {
-	rune_t m = c & CEI->mask;
-	rune_t nm = c & ~m;
+	wchar_t m, nm;
 	int i, len;
 
+	if (s == NULL)
+		/* Reset to initial shift state (no-op) */
+		return (1);
+
+	m = wc & CEI->mask;
+	nm = wc & ~m;
+
 	if (m == CEI->bits[1]) {
 CodeSet1:
 		/* Codeset 1: The first byte must have 0x80 in it. */
 		i = len = CEI->count[1];
-		if (n >= len) {
-			if (result)
-				*result = string + len;
-			while (i-- > 0)
-				*string++ = (nm >> (i << 3)) | 0x80;
-		} else
-			if (result)
-				*result = (char *) 0;
+		while (i-- > 0)
+			*s++ = (nm >> (i << 3)) | 0x80;
 	} else {
-		if (m == CEI->bits[0]) {
+		if (m == CEI->bits[0])
 			i = len = CEI->count[0];
-			if (n < len) {
-				if (result)
-					*result = NULL;
-				return (len);
-			}
+		else if (m == CEI->bits[2]) {
+			i = len = CEI->count[2];
+			*s++ = _SS2;
+			--i;
+			/* SS2 designates G2 into GR */
+			nm |= GR_BITS;
+		} else if (m == CEI->bits[3]) {
+			i = len = CEI->count[3];
+			*s++ = _SS3;
+			--i;
+			/* SS3 designates G3 into GR */
+			nm |= GR_BITS;
 		} else
-			if (m == CEI->bits[2]) {
-				i = len = CEI->count[2];
-				if (n < len) {
-					if (result)
-						*result = NULL;
-					return (len);
-				}
-				*string++ = _SS2;
-				--i;
-				/* SS2 designates G2 into GR */
-				nm |= GR_BITS;
-			} else
-				if (m == CEI->bits[3]) {
-					i = len = CEI->count[3];
-					if (n < len) {
-						if (result)
-							*result = NULL;
-						return (len);
-					}
-					*string++ = _SS3;
-					--i;
-					/* SS3 designates G3 into GR */
-					nm |= GR_BITS;
-				} else
-					goto CodeSet1;	/* Bletch */
+			goto CodeSet1;	/* Bletch */
 		while (i-- > 0)
-			*string++ = (nm >> (i << 3)) & 0xff;
-		if (result)
-			*result = string;
+			*s++ = (nm >> (i << 3)) & 0xff;
 	}
 	return (len);
 }
diff --git a/lib/libc/locale/mskanji.c b/lib/libc/locale/mskanji.c
index 482e5b5727bb..f4efcca9c530 100644
--- a/lib/libc/locale/mskanji.c
+++ b/lib/libc/locale/mskanji.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2002, 2003 Tim J. Robbins. All rights reserved.
  *    ja_JP.SJIS locale table for BSD4.4/rune
  *    version 1.0
  *    (C) Sin'ichiro MIYATANI / Phase One, Inc
@@ -38,74 +39,71 @@ static char sccsid[] = "@(#)mskanji.c	1.0 (Phase One) 5/5/95";
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
-
-#include <rune.h>
+#include <runetype.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <wchar.h>
+
+extern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict,
+    size_t, mbstate_t * __restrict);
+extern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict);
 
-rune_t	_MSKanji_sgetrune(const char *, size_t, char const **);
-int	_MSKanji_sputrune(rune_t, char *, size_t, char **);
+int	_MSKanji_init(_RuneLocale *);
+size_t  _MSKanji_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
+	    mbstate_t * __restrict);
+size_t  _MSKanji_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
 
 int
-_MSKanji_init(rl)
-	_RuneLocale *rl;
+_MSKanji_init(_RuneLocale *rl)
 {
-	rl->sgetrune = _MSKanji_sgetrune;
-	rl->sputrune = _MSKanji_sputrune;
 
+	__mbrtowc = _MSKanji_mbrtowc;
+	__wcrtomb = _MSKanji_wcrtomb;
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = 2;
 	return (0);
 }
 
-rune_t
-_MSKanji_sgetrune(string, n, result)
-	const char *string;
-	size_t n;
-	char const **result;
+size_t
+_MSKanji_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
+    mbstate_t * __restrict ps __unused)
 {
-	rune_t rune = 0;
-
-	if (n < 1) {
-		if (result != NULL)
-			*result = string;
-		return (_INVALID_RUNE);
-	}
+	wchar_t wc;
+	int len;
 
-	rune = *string++ & 0xff;
-	if ((rune > 0x80 && rune < 0xa0) ||
-	    (rune >= 0xe0 && rune < 0xfd)) {
-		if (n < 2) {
-			rune = _INVALID_RUNE;
-			--string;
-		} else
-			rune = (rune << 8) | (*string++ & 0xff);
+	if (s == NULL)
+		/* Reset to initial shift state (no-op) */
+		return (0);
+	if (n == 0)
+		/* Incomplete multibyte sequence */
+		return ((size_t)-2);
+	len = 1;
+	wc = *s++ & 0xff;
+	if ((wc > 0x80 && wc < 0xa0) || (wc >= 0xe0 && wc < 0xfd)) {
+		if (n < 2)
+			/* Incomplete multibyte sequence */
+			return ((size_t)-2);
+		wc = (wc << 8) | (*s++ & 0xff);
+		len = 2;
 	}
-	if (result != NULL)
-		*result = string;
-
-	return (rune);
+	if (pwc != NULL)
+		*pwc = wc;
+	return (wc == L'\0' ? 0 : len);
 }
 
-int
-_MSKanji_sputrune(c, string, n, result)
-	rune_t c;
-	char *string, **result;
-	size_t n;
+size_t
+_MSKanji_wcrtomb(char * __restrict s, wchar_t wc,
+    mbstate_t * __restrict ps __unused)
 {
 	int len, i;
 
-	len = (c > 0x100) ? 2 : 1;
-	if (n < len) {
-		if (result != NULL)
-			*result = NULL;
-	} else {
-		if (result != NULL)
-			*result = string + len;
-		for (i = len; i-- > 0; )
-			*string++ = c >> (i << 3);
-	}
+	if (s == NULL)
+		/* Reset to initial shift state (no-op) */
+		return (1);
 
+	len = (wc > 0x100) ? 2 : 1;
+	for (i = len; i-- > 0; )
+		*s++ = wc >> (i << 3);
 	return (len);
 }
diff --git a/lib/libc/locale/utf8.c b/lib/libc/locale/utf8.c
index c22d3d6750de..10f937b9bb46 100644
--- a/lib/libc/locale/utf8.c
+++ b/lib/libc/locale/utf8.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2002 Tim J. Robbins
+ * Copyright (c) 2002, 2003 Tim J. Robbins
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -27,37 +27,46 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <rune.h>
+#include <errno.h>
+#include <runetype.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <wchar.h>
 
-rune_t	_UTF8_sgetrune(const char *, size_t, char const **);
-int	_UTF8_sputrune(rune_t, char *, size_t, char **);
+extern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict,
+    size_t, mbstate_t * __restrict);
+extern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict);
+
+size_t  _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
+	    mbstate_t * __restrict);
+size_t  _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
 
 int
 _UTF8_init(_RuneLocale *rl)
 {
 
-	rl->sgetrune = _UTF8_sgetrune;
-	rl->sputrune = _UTF8_sputrune;
+	__mbrtowc = _UTF8_mbrtowc;
+	__wcrtomb = _UTF8_wcrtomb;
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = 6;
 
 	return (0);
 }
 
-rune_t
-_UTF8_sgetrune(const char *string, size_t n, const char **result)
+size_t
+_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
+    mbstate_t * __restrict ps __unused)
 {
-	int ch, len, mask;
-	rune_t lbound, wch;
+	int ch, i, len, mask;
+	wchar_t lbound, wch;
 
-	if (n < 1) {
-		if (result != NULL)
-			*result = string;
-		return (_INVALID_RUNE);
-	}
+	if (s == NULL)
+		/* Reset to initial shift state (no-op) */
+		return (0);
+	if (n == 0)
+		/* Incomplete multibyte sequence */
+		return ((size_t)-2);
 
 	/*
 	 * Determine the number of octets that make up this character from
@@ -70,7 +79,7 @@ _UTF8_sgetrune(const char *string, size_t n, const char **result)
 	 * character. This enforces a 1-to-1 mapping between character
 	 * codes and their multibyte representations.
 	 */
-	ch = (unsigned char)*string;
+	ch = (unsigned char)*s;
 	if ((ch & 0x80) == 0) {
 		mask = 0x7f;
 		len = 1;
@@ -99,106 +108,95 @@ _UTF8_sgetrune(const char *string, size_t n, const char **result)
 		/*
 		 * Malformed input; input is not UTF-8.
 		 */
-		if (result != NULL)
-			*result = string + 1;
-		return (_INVALID_RUNE);
+		errno = EILSEQ;
+		return ((size_t)-1);
 	}
 
-	if (n < len) {
-		/*
-		 * Truncated or partial input.
-		 */
-		if (result != NULL)
-			*result = string;
-		return (_INVALID_RUNE);
-	}
+	if (n < (size_t)len)
+		/* Incomplete multibyte sequence */
+		return ((size_t)-2);
 
 	/*
 	 * Decode the octet sequence representing the character in chunks
 	 * of 6 bits, most significant first.
 	 */
-	wch = (unsigned char)*string++ & mask;
-	while (--len != 0) {
-		if ((*string & 0xc0) != 0x80) {
+	wch = (unsigned char)*s++ & mask;
+	i = len;
+	while (--i != 0) {
+		if ((*s & 0xc0) != 0x80) {
 			/*
 			 * Malformed input; bad characters in the middle
 			 * of a character.
 			 */
-			wch = _INVALID_RUNE;
-			if (result != NULL)
-				*result = string + 1;
-			return (_INVALID_RUNE);
+			errno = EILSEQ;
+			return ((size_t)-1);
 		}
 		wch <<= 6;
-		wch |= *string++ & 0x3f;
+		wch |= *s++ & 0x3f;
 	}
-	if (wch != _INVALID_RUNE && wch < lbound)
+	if (wch < lbound) {
 		/*
 		 * Malformed input; redundant encoding.
 		 */
-		wch = _INVALID_RUNE;
-	if (result != NULL)
-		*result = string;
-	return (wch);
+		errno = EILSEQ;
+		return ((size_t)-1);
+	}
+	if (pwc != NULL)
+		*pwc = wch;
+	return (wch == L'\0' ? 0 : len);	/* len, not i: i is 0 here */
 }
 
-int
-_UTF8_sputrune(rune_t c, char *string, size_t n, char **result)
+size_t
+_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
+    mbstate_t * __restrict ps __unused)
 {
 	unsigned char lead;
 	int i, len;
 
+	if (s == NULL)
+		/* Reset to initial shift state (no-op) */
+		return (1);
+
 	/*
 	 * Determine the number of octets needed to represent this character.
 	 * We always output the shortest sequence possible. Also specify the
 	 * first few bits of the first octet, which contains the information
 	 * about the sequence length.
 	 */
-	if ((c & ~0x7f) == 0) {
+	if ((wc & ~0x7f) == 0) {
 		lead = 0;
 		len = 1;
-	} else if ((c & ~0x7ff) == 0) {
+	} else if ((wc & ~0x7ff) == 0) {
 		lead = 0xc0;
 		len = 2;
-	} else if ((c & ~0xffff) == 0) {
+	} else if ((wc & ~0xffff) == 0) {
 		lead = 0xe0;
 		len = 3;
-	} else if ((c & ~0x1fffff) == 0) {
+	} else if ((wc & ~0x1fffff) == 0) {
 		lead = 0xf0;
 		len = 4;
-	} else if ((c & ~0x3ffffff) == 0) {
+	} else if ((wc & ~0x3ffffff) == 0) {
 		lead = 0xf8;
 		len = 5;
-	} else if ((c & ~0x7fffffff) == 0) {
+	} else if ((wc & ~0x7fffffff) == 0) {
 		lead = 0xfc;
 		len = 6;
 	} else {
-		/*
-		 * Wide character code is out of range.
-		 */
-		if (result != NULL)
-			*result = NULL;
-		return (0);
+		errno = EILSEQ;
+		return ((size_t)-1);
 	}
 
-	if (n < len) {
-		if (result != NULL)
-			*result = NULL;
-	} else {
-		/*
-		 * Output the octets representing the character in chunks
-		 * of 6 bits, least significant last. The first octet is
-		 * a special case because it contains the sequence length
-		 * information.
-		 */
-		for (i = len - 1; i > 0; i--) {
-			string[i] = (c & 0x3f) | 0x80;
-			c >>= 6;
-		}
-		*string = (c & 0xff) | lead;
-		if (result != NULL)
-			*result = string + len;
+	/*
+	 * Output the octets representing the character in chunks
+	 * of 6 bits, least significant last. The first octet is
+	 * a special case because it contains the sequence length
+	 * information.
+	 */
+	for (i = len - 1; i > 0; i--) {
+		s[i] = (wc & 0x3f) | 0x80;
+		wc >>= 6;
 	}
+	*s = (wc & 0xff) | lead;
 
 	return (len);
 }
-- 
cgit v1.2.3