diff options
author | Tim J. Robbins <tjr@FreeBSD.org> | 2002-10-10 22:56:18 +0000 |
---|---|---|
committer | Tim J. Robbins <tjr@FreeBSD.org> | 2002-10-10 22:56:18 +0000 |
commit | 972baa3747e149e6345e27925bf2441a99cc1898 (patch) | |
tree | 818571da6c27befa8ace080e6465f3a6668ac52e /lib | |
parent | 9b30d71989b1191abe1a5212fa0bf89bbe2524f1 (diff) | |
download | src-972baa3747e149e6345e27925bf2441a99cc1898.tar.gz src-972baa3747e149e6345e27925bf2441a99cc1898.zip |
Add a UTF-8 encoding method, which will eventually replace the antique
"UTF2" method. Although UTF-8 and the old UTF2 encoding are compatible
for 16-bit characters, the new UTF-8 implementation is much more strict
about rejecting malformed input and also handles the full 31 bit range
of characters.
Notes
Notes:
svn path=/head/; revision=104828
Diffstat (limited to 'lib')
-rw-r--r-- | lib/libc/locale/Makefile.inc | 4 | ||||
-rw-r--r-- | lib/libc/locale/mbrune.3 | 3 | ||||
-rw-r--r-- | lib/libc/locale/multibyte.3 | 3 | ||||
-rw-r--r-- | lib/libc/locale/rune.3 | 3 | ||||
-rw-r--r-- | lib/libc/locale/setlocale.3 | 3 | ||||
-rw-r--r-- | lib/libc/locale/setrunelocale.c | 3 | ||||
-rw-r--r-- | lib/libc/locale/utf2.4 | 10 | ||||
-rw-r--r-- | lib/libc/locale/utf2.5 | 10 | ||||
-rw-r--r-- | lib/libc/locale/utf8.5 | 115 | ||||
-rw-r--r-- | lib/libc/locale/utf8.c | 204 |
10 files changed, 349 insertions, 9 deletions
diff --git a/lib/libc/locale/Makefile.inc b/lib/libc/locale/Makefile.inc index 66ea2758e07b..08f94ceb4e16 100644 --- a/lib/libc/locale/Makefile.inc +++ b/lib/libc/locale/Makefile.inc @@ -11,7 +11,8 @@ SRCS+= big5.c btowc.c collate.c collcmp.c euc.c fix_grouping.c frune.c \ mbrtowc.c mbrune.c mbsinit.c mbsrtowcs.c mbtowc.c mbstowcs.c \ mskanji.c nl_langinfo.c nomacros.c none.c rune.c \ runetype.c setinvalidrune.c setlocale.c setrunelocale.c table.c \ - tolower.c toupper.c utf2.c wcrtomb.c wcsrtombs.c wcsftime.c wcstod.c \ + tolower.c toupper.c utf2.c utf8.c wcrtomb.c wcsrtombs.c wcsftime.c \ + wcstod.c \ wcstoimax.c wcstol.c wcstoll.c \ wcstombs.c \ wcstoul.c wcstoull.c wcstoumax.c wctob.c wctomb.c wctrans.c wctype.c \ @@ -31,6 +32,7 @@ MAN+= btowc.3 \ wcsrtombs.3 wcstod.3 wcstol.3 \ wctrans.3 wctype.3 wcwidth.3 MAN+= euc.4 utf2.4 +MAN+= utf8.5 MLINKS+=btowc.3 wctob.3 MLINKS+=isdigit.3 isnumber.3 diff --git a/lib/libc/locale/mbrune.3 b/lib/libc/locale/mbrune.3 index 8e0aa08c7f60..f2598e7a4182 100644 --- a/lib/libc/locale/mbrune.3 +++ b/lib/libc/locale/mbrune.3 @@ -161,7 +161,8 @@ does not appear in the string. .Xr rune 3 , .Xr setlocale 3 , .Xr euc 4 , -.Xr utf2 4 +.Xr utf2 4 , +.Xr utf8 5 .Sh HISTORY The .Fn mbrune , diff --git a/lib/libc/locale/multibyte.3 b/lib/libc/locale/multibyte.3 index 4d4a7d7dc563..8988eb6103df 100644 --- a/lib/libc/locale/multibyte.3 +++ b/lib/libc/locale/multibyte.3 @@ -232,7 +232,8 @@ both functions return \-1. .Xr wcrtomb 3 , .Xr wcsrtombs 3 , .Xr euc 4 , -.Xr utf2 4 +.Xr utf2 4 , +.Xr utf8 5 .Sh STANDARDS The .Fn mblen , diff --git a/lib/libc/locale/rune.3 b/lib/libc/locale/rune.3 index c11dc8990288..755ac9675cde 100644 --- a/lib/libc/locale/rune.3 +++ b/lib/libc/locale/rune.3 @@ -265,7 +265,8 @@ binary LC_CTYPE file for the locale .Xr mbrune 3 , .Xr setlocale 3 , .Xr euc 4 , -.Xr utf2 4 +.Xr utf2 4 , +.Xr utf8 5 .Sh HISTORY These functions first appeared in .Bx 4.4 . 
diff --git a/lib/libc/locale/setlocale.3 b/lib/libc/locale/setlocale.3 index 75808672a823..95996b7bd5f1 100644 --- a/lib/libc/locale/setlocale.3 +++ b/lib/libc/locale/setlocale.3 @@ -334,7 +334,8 @@ and the category .Xr strcoll 3 , .Xr strxfrm 3 , .Xr euc 4 , -.Xr utf2 4 +.Xr utf2 4 , +.Xr utf8 5 .Sh STANDARDS The .Fn setlocale diff --git a/lib/libc/locale/setrunelocale.c b/lib/libc/locale/setrunelocale.c index 042a06c2bce7..9cfe8d641414 100644 --- a/lib/libc/locale/setrunelocale.c +++ b/lib/libc/locale/setrunelocale.c @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); extern int _none_init(_RuneLocale *); extern int _UTF2_init(_RuneLocale *); +extern int _UTF8_init(_RuneLocale *); extern int _EUC_init(_RuneLocale *); extern int _BIG5_init(_RuneLocale *); extern int _MSKanji_init(_RuneLocale *); @@ -130,6 +131,8 @@ setrunelocale(char *encoding) ret = _none_init(rl); else if (strcmp(rl->encoding, "UTF2") == 0) ret = _UTF2_init(rl); + else if (strcmp(rl->encoding, "UTF-8") == 0) + ret = _UTF8_init(rl); else if (strcmp(rl->encoding, "EUC") == 0) ret = _EUC_init(rl); else if (strcmp(rl->encoding, "BIG5") == 0) diff --git a/lib/libc/locale/utf2.4 b/lib/libc/locale/utf2.4 index 35eb3ad64e1d..f41c0f922dd9 100644 --- a/lib/libc/locale/utf2.4 +++ b/lib/libc/locale/utf2.4 @@ -35,7 +35,7 @@ .\" @(#)utf2.4 8.1 (Berkeley) 6/4/93 .\" $FreeBSD$ .\" -.Dd June 4, 1993 +.Dd October 11, 2002 .Dt UTF2 4 .Os .Sh NAME @@ -45,6 +45,11 @@ .Nm ENCODING .Qq UTF2 .Sh DESCRIPTION +.Bf Em +The UTF2 encoding has been deprecated in favour of UTF-8. +.Ef +New applications should not use UTF2. +.Pp The .Nm UTF2 encoding is based on a proposed X-Open multibyte @@ -85,4 +90,5 @@ which provides for the entire proposed ISO-10646 31 bit standard are currently not implemented. 
.Sh "SEE ALSO" .Xr mklocale 1 , -.Xr setlocale 3 +.Xr setlocale 3 , +.Xr utf8 5 diff --git a/lib/libc/locale/utf2.5 b/lib/libc/locale/utf2.5 index 35eb3ad64e1d..f41c0f922dd9 100644 --- a/lib/libc/locale/utf2.5 +++ b/lib/libc/locale/utf2.5 @@ -35,7 +35,7 @@ .\" @(#)utf2.4 8.1 (Berkeley) 6/4/93 .\" $FreeBSD$ .\" -.Dd June 4, 1993 +.Dd October 11, 2002 .Dt UTF2 4 .Os .Sh NAME @@ -45,6 +45,11 @@ .Nm ENCODING .Qq UTF2 .Sh DESCRIPTION +.Bf Em +The UTF2 encoding has been deprecated in favour of UTF-8. +.Ef +New applications should not use UTF2. +.Pp The .Nm UTF2 encoding is based on a proposed X-Open multibyte @@ -85,4 +90,5 @@ which provides for the entire proposed ISO-10646 31 bit standard are currently not implemented. .Sh "SEE ALSO" .Xr mklocale 1 , -.Xr setlocale 3 +.Xr setlocale 3 , +.Xr utf8 5 diff --git a/lib/libc/locale/utf8.5 b/lib/libc/locale/utf8.5 new file mode 100644 index 000000000000..079e9eabe8e1 --- /dev/null +++ b/lib/libc/locale/utf8.5 @@ -0,0 +1,115 @@ +.\" Copyright (c) 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" This code is derived from software contributed to Berkeley by +.\" Paul Borman at Krystal Technologies. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. 
Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)utf2.4 8.1 (Berkeley) 6/4/93 +.\" $FreeBSD$ +.\" +.Dd October 10, 2002 +.Dt UTF8 5 +.Os +.Sh NAME +.Nm utf8 +.Nd "UTF-8, a transformation format of ISO 10646" +.Sh SYNOPSIS +.Nm ENCODING +.Qq UTF-8 +.Sh DESCRIPTION +The +.Nm UTF-8 +encoding represents UCS-4 characters as a sequence of octets, using +between 1 and 6 for each character. +It is backwards compatible with +.Tn ASCII , +so 0x00-0x7f refer to the +.Tn ASCII +character set. +The multibyte encoding of non- +.Tn ASCII +characters +consists entirely of bytes whose high order bit is set. 
+The actual +encoding is represented by the following table: +.Bd -literal +[0x00000000 - 0x0000007f] [00000000.0bbbbbbb] -> 0bbbbbbb +[0x00000080 - 0x000007ff] [00000bbb.bbbbbbbb] -> 110bbbbb, 10bbbbbb +[0x00000800 - 0x0000ffff] [bbbbbbbb.bbbbbbbb] -> + 1110bbbb, 10bbbbbb, 10bbbbbb +[0x00010000 - 0x001fffff] [00000000.000bbbbb.bbbbbbbb.bbbbbbbb] -> + 11110bbb, 10bbbbbb, 10bbbbbb, 10bbbbbb +[0x00200000 - 0x03ffffff] [000000bb.bbbbbbbb.bbbbbbbb.bbbbbbbb] -> + 111110bb, 10bbbbbb, 10bbbbbb, 10bbbbbb, 10bbbbbb +[0x04000000 - 0x7fffffff] [0bbbbbbb.bbbbbbbb.bbbbbbbb.bbbbbbbb] -> + 1111110b, 10bbbbbb, 10bbbbbb, 10bbbbbb, 10bbbbbb, 10bbbbbb +.Ed +.Pp +If more than a single representation of a value exists (for example, +0x00; 0xC0 0x80; 0xE0 0x80 0x80) the shortest representation is always +used. +Longer ones are detected as an error as they pose a potential +security risk, and destroy the 1:1 character:octet sequence mapping. +.Sh COMPATIBILITY +The +.Nm +encoding supersedes the +.Xr utf2 4 +encoding. +The only differences between the two are that +.Nm +handles the full 31-bit character set of +.Tn ISO +10646 +whereas +.Xr utf2 4 +is limited to a 16-bit character set, +and that +.Xr utf2 4 +accepts redundant, non-"shortest form" representations of characters. +.Sh SEE ALSO +.Xr euc 4 , +.Xr utf2 4 +.Rs +.%A "F. Yergeau" +.%T "UTF-8, a transformation format of ISO 10646" +.%O "RFC 2279" +.%D "January 1998" +.Re +.Sh STANDARDS +The +.Nm +encoding is compatible with RFC 2279. +.Sh BUGS +Byte order marker (BOM) characters are neither added nor removed +from UTF-8-encoded wide character +.Xr stdio 3 +streams. diff --git a/lib/libc/locale/utf8.c b/lib/libc/locale/utf8.c new file mode 100644 index 000000000000..590bd4334c01 --- /dev/null +++ b/lib/libc/locale/utf8.c @@ -0,0 +1,204 @@ +/*- + * Copyright (c) 2002 Tim J. Robbins + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rune.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> + +rune_t _UTF8_sgetrune(const char *, size_t, char const **); +int _UTF8_sputrune(rune_t, char *, size_t, char **); + +int +_UTF8_init(_RuneLocale *rl) +{ + + rl->sgetrune = _UTF8_sgetrune; + rl->sputrune = _UTF8_sputrune; + _CurrentRuneLocale = rl; + __mb_cur_max = 6; + + return (0); +} + +rune_t +_UTF8_sgetrune(const char *string, size_t n, const char **result) +{ + int ch, len, mask, siglen; + rune_t lbound, wch; + + if (n < 1) { + if (result != NULL) + *result = string; + return (_INVALID_RUNE); + } + + /* + * Determine the number of octets that make up this character from + * the first octet, and a mask that extracts the interesting bits of + * the first octet. + * + * We also specify a lower bound for the character code to detect + * redundant, non-"shortest form" encodings. For example, the + * sequence C0 80 is _not_ a legal representation of the null + * character. This enforces a 1-to-1 mapping between character + * codes and their multibyte representations. + */ + ch = (unsigned char)*string; + if ((ch & 0x80) == 0) { + mask = 0x7f; + len = 1; + lbound = 0; + } else if ((ch & 0xe0) == 0xc0) { + mask = 0x1f; + len = 2; + lbound = 0x80; + } else if ((ch & 0xf0) == 0xe0) { + mask = 0x0f; + len = 3; + lbound = 0x800; + } else if ((ch & 0xf8) == 0xf0) { + mask = 0x07; + len = 4; + lbound = 0x10000; + } else if ((ch & 0xfc) == 0xf8) { + mask = 0x03; + len = 5; + lbound = 0x200000; + } else if ((ch & 0xfc) == 0xfc) { + mask = 0x01; + len = 6; + lbound = 0x4000000; + } else { + /* + * Malformed input; input is not UTF-8. + */ + if (result != NULL) + *result = string + 1; + return (_INVALID_RUNE); + } + + if (n < len) { + /* + * Truncated or partial input. 
+ */ + if (result != NULL) + *result = string; + return (_INVALID_RUNE); + } + + /* + * Decode the octet sequence representing the character in chunks + * of 6 bits, most significant first. + */ + wch = (unsigned char)*string++ & mask; + while (--len != 0) { + if ((*string & 0xc0) != 0x80) { + /* + * Malformed input; bad characters in the middle + * of a character. + */ + wch = _INVALID_RUNE; + if (result != NULL) + *result = string + 1; + return (_INVALID_RUNE); + } + wch <<= 6; + wch |= *string++ & 0x3f; + } + if (wch != _INVALID_RUNE && wch < lbound) + /* + * Malformed input; redundant encoding. + */ + wch = _INVALID_RUNE; + if (result != NULL) + *result = string; + return (wch); +} + +int +_UTF8_sputrune(rune_t c, char *string, size_t n, char **result) +{ + unsigned char lead; + int i, len; + + /* + * Determine the number of octets needed to represent this character. + * We always output the shortest sequence possible. Also specify the + * first few bits of the first octet, which contains the information + * about the sequence length. + */ + if ((c & ~0x7f) == 0) { + lead = 0; + len = 1; + } else if ((c & ~0x7ff) == 0) { + lead = 0xc0; + len = 2; + } else if ((c & ~0xffff) == 0) { + lead = 0xe0; + len = 3; + } else if ((c & ~0x1fffff) == 0) { + lead = 0xf0; + len = 4; + } else if ((c & ~0x3ffffff) == 0) { + lead = 0xf8; + len = 5; + } else if ((c & ~0x7fffffff) == 0) { + lead = 0xfc; + len = 6; + } else { + /* + * Wide character code is out of range. + */ + if (result != NULL) + *result = NULL; + return (0); + } + + if (n < len) { + if (result != NULL) + *result = NULL; + } else { + /* + * Output the octets representing the character in chunks + * of 6 bits, least significant last. The first octet is + * a special case because it contains the sequence length + * information. 
+ */ + for (i = len - 1; i > 0; i--) { + string[i] = (c & 0x3f) | 0x80; + c >>= 6; + } + *string = (c & 0xff) | lead; + if (result != NULL) + *result = string + len; + } + + return (len); +} |