From 02f4f60ad5d433a663771c65ce3f99a8f82e95ae Mon Sep 17 00:00:00 2001 From: "Tim J. Robbins" Date: Sun, 2 Nov 2003 10:09:33 +0000 Subject: Convert the Big5, EUC, MSKanji and UTF-8 encoding methods to implement mbrtowc() and wcrtomb() directly. GB18030, GBK and UTF2 are left unconverted; GB18030 will be done eventually, but GBK and UTF2 may just be removed, as they are subsets of GB18030 and UTF-8 respectively. --- lib/libc/locale/big5.c | 103 +++++++++++++++---------------- lib/libc/locale/euc.c | 152 +++++++++++++++++++++------------------------- lib/libc/locale/mskanji.c | 92 ++++++++++++++-------------- lib/libc/locale/utf8.c | 140 +++++++++++++++++++++--------------------- 4 files changed, 233 insertions(+), 254 deletions(-) (limited to 'lib/libc/locale') diff --git a/lib/libc/locale/big5.c b/lib/libc/locale/big5.c index 12cc312d29aa..7c0c98179b99 100644 --- a/lib/libc/locale/big5.c +++ b/lib/libc/locale/big5.c @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2002, 2003 Tim J. Robbins. All rights reserved. * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. 
* @@ -40,80 +41,76 @@ static char sccsid[] = "@(#)big5.c 8.1 (Berkeley) 6/4/93"; #include __FBSDID("$FreeBSD$"); -#include +#include +#include #include #include #include -#include +#include + +extern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, + size_t, mbstate_t * __restrict); +extern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict); -rune_t _BIG5_sgetrune(const char *, size_t, char const **); -int _BIG5_sputrune(rune_t, char *, size_t, char **); +int _BIG5_init(_RuneLocale *); +size_t _BIG5_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, + mbstate_t * __restrict); +size_t _BIG5_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); int -_BIG5_init(rl) - _RuneLocale *rl; +_BIG5_init(_RuneLocale *rl) { - rl->sgetrune = _BIG5_sgetrune; - rl->sputrune = _BIG5_sputrune; + + __mbrtowc = _BIG5_mbrtowc; + __wcrtomb = _BIG5_wcrtomb; _CurrentRuneLocale = rl; __mb_cur_max = 2; return (0); } -static inline int -_big5_check(c) - u_int c; +static __inline int +_big5_check(u_int c) { + c &= 0xff; return ((c >= 0xa1 && c <= 0xfe) ? 2 : 1); } -rune_t -_BIG5_sgetrune(string, n, result) - const char *string; - size_t n; - char const **result; +size_t +_BIG5_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, + mbstate_t * __restrict ps __unused) { - rune_t rune = 0; - int len; + wchar_t wc; + int i, len; - if (n < 1 || (len = _big5_check(*string)) > n) { - if (result) - *result = string; - return (_INVALID_RUNE); - } - while (--len >= 0) - rune = (rune << 8) | ((u_int)(*string++) & 0xff); - if (result) - *result = string; - return rune; + if (s == NULL) + /* Reset to initial shift state (no-op) */ + return (0); + if (n == 0 || (size_t)(len = _big5_check(*s)) > n) + /* Incomplete multibyte sequence */ + return ((size_t)-2); + wc = 0; + i = len; + while (i-- > 0) + wc = (wc << 8) | (unsigned char)*s++; + if (pwc != NULL) + *pwc = wc; + return (wc == L'\0' ? 
0 : len); } -int -_BIG5_sputrune(c, string, n, result) - rune_t c; - char *string, **result; - size_t n; +size_t +_BIG5_wcrtomb(char * __restrict s, wchar_t wc, + mbstate_t * __restrict ps __unused) { - if (c & 0x8000) { - if (n >= 2) { - string[0] = (c >> 8) & 0xff; - string[1] = c & 0xff; - if (result) - *result = string + 2; - return (2); - } - } - else { - if (n >= 1) { - *string = c & 0xff; - if (result) - *result = string + 1; - return (1); - } + + if (s == NULL) + /* Reset to initial shift state (no-op) */ + return (1); + if (wc & 0x8000) { + *s++ = (wc >> 8) & 0xff; + *s = wc & 0xff; + return (2); } - if (result) - *result = string; - return (0); - + *s = wc & 0xff; + return (1); } diff --git a/lib/libc/locale/euc.c b/lib/libc/locale/euc.c index 596d107a88ca..355e7a5731fa 100644 --- a/lib/libc/locale/euc.c +++ b/lib/libc/locale/euc.c @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2002, 2003 Tim J. Robbins. All rights reserved. * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. 
* @@ -43,32 +44,35 @@ __FBSDID("$FreeBSD$"); #include #include -#include +#include #include #include #include #include +#include -rune_t _EUC_sgetrune(const char *, size_t, char const **); -int _EUC_sputrune(rune_t, char *, size_t, char **); +extern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, + size_t, mbstate_t * __restrict); +extern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict); + +int _EUC_init(_RuneLocale *); +size_t _EUC_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, + mbstate_t * __restrict); +size_t _EUC_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); typedef struct { int count[4]; - rune_t bits[4]; - rune_t mask; + wchar_t bits[4]; + wchar_t mask; } _EucInfo; int -_EUC_init(rl) - _RuneLocale *rl; +_EUC_init(_RuneLocale *rl) { _EucInfo *ei; int x, new__mb_cur_max; char *v, *e; - rl->sgetrune = _EUC_sgetrune; - rl->sputrune = _EUC_sputrune; - if (rl->variable == NULL) return (EFTYPE); @@ -108,6 +112,8 @@ _EUC_init(rl) rl->variable_len = sizeof(_EucInfo); _CurrentRuneLocale = rl; __mb_cur_max = new__mb_cur_max; + __mbrtowc = _EUC_mbrtowc; + __wcrtomb = _EUC_wcrtomb; return (0); } @@ -118,105 +124,85 @@ _EUC_init(rl) #define GR_BITS 0x80808080 /* XXX: to be fixed */ -static inline int -_euc_set(c) - u_int c; +static __inline int +_euc_set(u_int c) { c &= 0xff; - return ((c & 0x80) ? c == _SS3 ? 3 : c == _SS2 ? 
2 : 1 : 0); } -rune_t -_EUC_sgetrune(string, n, result) - const char *string; - size_t n; - char const **result; -{ - rune_t rune = 0; - int len, set; - if (n < 1 || (len = CEI->count[set = _euc_set(*string)]) > n) { - if (result) - *result = string; - return (_INVALID_RUNE); - } +size_t +_EUC_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, + mbstate_t * __restrict ps __unused) +{ + int len, remain, set; + wchar_t wc; + + if (s == NULL) + /* Reset to initial shift state (no-op) */ + return (0); + if (n == 0 || (size_t)(len = CEI->count[set = _euc_set(*s)]) > n) + /* Incomplete multibyte sequence */ + return ((size_t)-2); + wc = 0; + remain = len; switch (set) { case 3: case 2: - --len; - ++string; + --remain; + ++s; /* FALLTHROUGH */ case 1: case 0: - while (len-- > 0) - rune = (rune << 8) | ((u_int)(*string++) & 0xff); + while (remain-- > 0) + wc = (wc << 8) | (unsigned char)*s++; break; } - if (result) - *result = string; - return ((rune & ~CEI->mask) | CEI->bits[set]); + wc = (wc & ~CEI->mask) | CEI->bits[set]; + if (pwc != NULL) + *pwc = wc; + return (wc == L'\0' ? 0 : len); } -int -_EUC_sputrune(c, string, n, result) - rune_t c; - char *string, **result; - size_t n; +size_t +_EUC_wcrtomb(char * __restrict s, wchar_t wc, + mbstate_t * __restrict ps __unused) { - rune_t m = c & CEI->mask; - rune_t nm = c & ~m; + wchar_t m, nm; int i, len; + if (s == NULL) + /* Reset to initial shift state (no-op) */ + return (1); + + m = wc & CEI->mask; + nm = wc & ~m; + if (m == CEI->bits[1]) { CodeSet1: /* Codeset 1: The first byte must have 0x80 in it. 
*/ i = len = CEI->count[1]; - if (n >= len) { - if (result) - *result = string + len; - while (i-- > 0) - *string++ = (nm >> (i << 3)) | 0x80; - } else - if (result) - *result = (char *) 0; + while (i-- > 0) + *s++ = (nm >> (i << 3)) | 0x80; } else { - if (m == CEI->bits[0]) { + if (m == CEI->bits[0]) i = len = CEI->count[0]; - if (n < len) { - if (result) - *result = NULL; - return (len); - } + else if (m == CEI->bits[2]) { + i = len = CEI->count[2]; + *s++ = _SS2; + --i; + /* SS2 designates G2 into GR */ + nm |= GR_BITS; + } else if (m == CEI->bits[3]) { + i = len = CEI->count[3]; + *s++ = _SS3; + --i; + /* SS3 designates G3 into GR */ + nm |= GR_BITS; } else - if (m == CEI->bits[2]) { - i = len = CEI->count[2]; - if (n < len) { - if (result) - *result = NULL; - return (len); - } - *string++ = _SS2; - --i; - /* SS2 designates G2 into GR */ - nm |= GR_BITS; - } else - if (m == CEI->bits[3]) { - i = len = CEI->count[3]; - if (n < len) { - if (result) - *result = NULL; - return (len); - } - *string++ = _SS3; - --i; - /* SS3 designates G3 into GR */ - nm |= GR_BITS; - } else - goto CodeSet1; /* Bletch */ + goto CodeSet1; /* Bletch */ while (i-- > 0) - *string++ = (nm >> (i << 3)) & 0xff; - if (result) - *result = string; + *s++ = (nm >> (i << 3)) & 0xff; } return (len); } diff --git a/lib/libc/locale/mskanji.c b/lib/libc/locale/mskanji.c index 482e5b5727bb..f4efcca9c530 100644 --- a/lib/libc/locale/mskanji.c +++ b/lib/libc/locale/mskanji.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2002, 2003 Tim J. Robbins. All rights reserved. 
* ja_JP.SJIS locale table for BSD4.4/rune * version 1.0 * (C) Sin'ichiro MIYATANI / Phase One, Inc @@ -38,74 +39,71 @@ static char sccsid[] = "@(#)mskanji.c 1.0 (Phase One) 5/5/95"; __FBSDID("$FreeBSD$"); #include - -#include +#include #include #include #include +#include + +extern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, + size_t, mbstate_t * __restrict); +extern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict); -rune_t _MSKanji_sgetrune(const char *, size_t, char const **); -int _MSKanji_sputrune(rune_t, char *, size_t, char **); +int _MSKanji_init(_RuneLocale *); +size_t _MSKanji_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, + mbstate_t * __restrict); +size_t _MSKanji_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); int -_MSKanji_init(rl) - _RuneLocale *rl; +_MSKanji_init(_RuneLocale *rl) { - rl->sgetrune = _MSKanji_sgetrune; - rl->sputrune = _MSKanji_sputrune; + __mbrtowc = _MSKanji_mbrtowc; + __wcrtomb = _MSKanji_wcrtomb; _CurrentRuneLocale = rl; __mb_cur_max = 2; return (0); } -rune_t -_MSKanji_sgetrune(string, n, result) - const char *string; - size_t n; - char const **result; +size_t +_MSKanji_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, + mbstate_t * __restrict ps __unused) { - rune_t rune = 0; - - if (n < 1) { - if (result != NULL) - *result = string; - return (_INVALID_RUNE); - } + wchar_t wc; + int len; - rune = *string++ & 0xff; - if ((rune > 0x80 && rune < 0xa0) || - (rune >= 0xe0 && rune < 0xfd)) { - if (n < 2) { - rune = _INVALID_RUNE; - --string; - } else - rune = (rune << 8) | (*string++ & 0xff); + if (s == NULL) + /* Reset to initial shift state (no-op) */ + return (0); + if (n == 0) + /* Incomplete multibyte sequence */ + return ((size_t)-2); + len = 1; + wc = *s++ & 0xff; + if ((wc > 0x80 && wc < 0xa0) || (wc >= 0xe0 && wc < 0xfd)) { + if (n < 2) + /* Incomplete multibyte sequence */ + return ((size_t)-2); + wc = (wc << 8) | (*s++ & 0xff); + 
len = 2; } - if (result != NULL) - *result = string; - - return (rune); + if (pwc != NULL) + *pwc = wc; + return (wc == L'\0' ? 0 : len); } -int -_MSKanji_sputrune(c, string, n, result) - rune_t c; - char *string, **result; - size_t n; +size_t +_MSKanji_wcrtomb(char * __restrict s, wchar_t wc, + mbstate_t * __restrict ps __unused) { int len, i; - len = (c > 0x100) ? 2 : 1; - if (n < len) { - if (result != NULL) - *result = NULL; - } else { - if (result != NULL) - *result = string + len; - for (i = len; i-- > 0; ) - *string++ = c >> (i << 3); - } + if (s == NULL) + /* Reset to initial shift state (no-op) */ + return (1); + len = (wc > 0x100) ? 2 : 1; + for (i = len; i-- > 0; ) + *s++ = wc >> (i << 3); return (len); } diff --git a/lib/libc/locale/utf8.c b/lib/libc/locale/utf8.c index c22d3d6750de..10f937b9bb46 100644 --- a/lib/libc/locale/utf8.c +++ b/lib/libc/locale/utf8.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2002 Tim J. Robbins + * Copyright (c) 2002, 2003 Tim J. Robbins * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -27,37 +27,46 @@ #include __FBSDID("$FreeBSD$"); -#include +#include +#include #include #include #include +#include -rune_t _UTF8_sgetrune(const char *, size_t, char const **); -int _UTF8_sputrune(rune_t, char *, size_t, char **); +extern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, + size_t, mbstate_t * __restrict); +extern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict); + +size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, + mbstate_t * __restrict); +size_t _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); int _UTF8_init(_RuneLocale *rl) { - rl->sgetrune = _UTF8_sgetrune; - rl->sputrune = _UTF8_sputrune; + __mbrtowc = _UTF8_mbrtowc; + __wcrtomb = _UTF8_wcrtomb; _CurrentRuneLocale = rl; __mb_cur_max = 6; return (0); } -rune_t -_UTF8_sgetrune(const char *string, size_t n, const char **result) +size_t +_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, + mbstate_t * __restrict ps __unused) { - int ch, len, mask; - rune_t lbound, wch; + int ch, i, len, mask; + wchar_t lbound, wch; - if (n < 1) { - if (result != NULL) - *result = string; - return (_INVALID_RUNE); - } + if (s == NULL) + /* Reset to initial shift state (no-op) */ + return (0); + if (n == 0) + /* Incomplete multibyte sequence */ + return ((size_t)-2); /* * Determine the number of octets that make up this character from @@ -70,7 +79,7 @@ _UTF8_sgetrune(const char *string, size_t n, const char **result) * character. This enforces a 1-to-1 mapping between character * codes and their multibyte representations. */ - ch = (unsigned char)*string; + ch = (unsigned char)*s; if ((ch & 0x80) == 0) { mask = 0x7f; len = 1; @@ -99,106 +108,95 @@ _UTF8_sgetrune(const char *string, size_t n, const char **result) /* * Malformed input; input is not UTF-8. 
*/ - if (result != NULL) - *result = string + 1; - return (_INVALID_RUNE); + errno = EILSEQ; + return ((size_t)-1); } - if (n < len) { - /* - * Truncated or partial input. - */ - if (result != NULL) - *result = string; - return (_INVALID_RUNE); - } + if (n < (size_t)len) + /* Incomplete multibyte sequence */ + return ((size_t)-2); /* * Decode the octet sequence representing the character in chunks * of 6 bits, most significant first. */ - wch = (unsigned char)*string++ & mask; - while (--len != 0) { - if ((*string & 0xc0) != 0x80) { + wch = (unsigned char)*s++ & mask; + i = len; + while (--i != 0) { + if ((*s & 0xc0) != 0x80) { /* * Malformed input; bad characters in the middle * of a character. */ - wch = _INVALID_RUNE; - if (result != NULL) - *result = string + 1; - return (_INVALID_RUNE); + errno = EILSEQ; + return ((size_t)-1); } wch <<= 6; - wch |= *string++ & 0x3f; + wch |= *s++ & 0x3f; } - if (wch != _INVALID_RUNE && wch < lbound) + if (wch < lbound) { /* * Malformed input; redundant encoding. */ - wch = _INVALID_RUNE; - if (result != NULL) - *result = string; - return (wch); + errno = EILSEQ; + return ((size_t)-1); + } + if (pwc != NULL) + *pwc = wch; + return (wch == L'\0' ? 0 : len); } -int -_UTF8_sputrune(rune_t c, char *string, size_t n, char **result) +size_t +_UTF8_wcrtomb(char * __restrict s, wchar_t wc, + mbstate_t * __restrict ps __unused) { unsigned char lead; int i, len; + if (s == NULL) + /* Reset to initial shift state (no-op) */ + return (1); + /* * Determine the number of octets needed to represent this character. * We always output the shortest sequence possible. Also specify the * first few bits of the first octet, which contains the information * about the sequence length. 
*/ - if ((c & ~0x7f) == 0) { + if ((wc & ~0x7f) == 0) { lead = 0; len = 1; - } else if ((c & ~0x7ff) == 0) { + } else if ((wc & ~0x7ff) == 0) { lead = 0xc0; len = 2; - } else if ((c & ~0xffff) == 0) { + } else if ((wc & ~0xffff) == 0) { lead = 0xe0; len = 3; - } else if ((c & ~0x1fffff) == 0) { + } else if ((wc & ~0x1fffff) == 0) { lead = 0xf0; len = 4; - } else if ((c & ~0x3ffffff) == 0) { + } else if ((wc & ~0x3ffffff) == 0) { lead = 0xf8; len = 5; - } else if ((c & ~0x7fffffff) == 0) { + } else if ((wc & ~0x7fffffff) == 0) { lead = 0xfc; len = 6; } else { - /* - * Wide character code is out of range. - */ - if (result != NULL) - *result = NULL; - return (0); + errno = EILSEQ; + return ((size_t)-1); } - if (n < len) { - if (result != NULL) - *result = NULL; - } else { - /* - * Output the octets representing the character in chunks - * of 6 bits, least significant last. The first octet is - * a special case because it contains the sequence length - * information. - */ - for (i = len - 1; i > 0; i--) { - string[i] = (c & 0x3f) | 0x80; - c >>= 6; - } - *string = (c & 0xff) | lead; - if (result != NULL) - *result = string + len; + /* + * Output the octets representing the character in chunks + * of 6 bits, least significant last. The first octet is + * a special case because it contains the sequence length + * information. + */ + for (i = len - 1; i > 0; i--) { + s[i] = (wc & 0x3f) | 0x80; + wc >>= 6; } + *s = (wc & 0xff) | lead; return (len); } -- cgit v1.2.3