src - FreeBSD source tree

diff options


context:
space:
mode:

author	Tim J. Robbins <tjr@FreeBSD.org>	2004-05-17 12:32:40 +0000
committer	Tim J. Robbins <tjr@FreeBSD.org>	2004-05-17 12:32:40 +0000
commit	5e44d7ebe189e5b0c59038bd80d0030d473d768e (patch)
tree	314729e4e801b26971a9b1a60a738b91641469f0 /lib
parent	61074767591998c6f8a3a19ee27ad556e754b94c (diff)
download	src-5e44d7ebe189e5b0c59038bd80d0030d473d768e.tar.gz src-5e44d7ebe189e5b0c59038bd80d0030d473d768e.zip

Use conversion state objects to store the accumulated wide character, lower bound, and the number of bytes remaining instead of storing the raw byte sequence and deriving them every time mbrtowc() is called. This is much faster -- about twice as fast in some crude benchmarks.

Notes: svn path=/head/; revision=129336

Diffstat (limited to 'lib')

-rw-r--r--  lib/libc/locale/utf8.c  130

1 file changed, 67 insertions, 63 deletions

diff --git a/lib/libc/locale/utf8.c b/lib/libc/locale/utf8.c
index d754a19a4e29..c0fbcf47d22b 100644
--- a/lib/libc/locale/utf8.c
+++ b/lib/libc/locale/utf8.c

@@ -40,8 +40,9 @@ int _UTF8_mbsinit(const mbstate_t *);

size_t _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);

typedef struct {

- int count;

- u_char bytes[6];

+ wchar_t ch;

+ int want;

+ wchar_t lbound;

} _UTF8State;

int

@@ -61,7 +62,7 @@ int

_UTF8_mbsinit(const mbstate_t *ps)

{

- return (ps == NULL || ((const _UTF8State *)ps)->count == 0);

+ return (ps == NULL || ((const _UTF8State *)ps)->want == 0);

}

size_t

@@ -69,13 +70,12 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,

mbstate_t * __restrict ps)

{

_UTF8State *us;

- int ch, i, len, mask, ocount;

+ int ch, i, mask, want;

wchar_t lbound, wch;

- size_t ncopy;

us = (_UTF8State *)ps;

- if (us->count < 0 || us->count > sizeof(us->bytes)) {

+ if (us->want < 0 || us->want > 6) {

errno = EINVAL;

return ((size_t)-1);

}

@@ -86,72 +86,69 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,

pwc = NULL;

}

- ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof(us->bytes) - us->count);

- memcpy(us->bytes + us->count, s, ncopy);

- ocount = us->count;

- us->count += ncopy;

- s = (char *)us->bytes;

- n = us->count;

if (n == 0)

/* Incomplete multibyte sequence */

return ((size_t)-2);

- /*

- * Determine the number of octets that make up this character from

- * the first octet, and a mask that extracts the interesting bits of

- * the first octet.

- *

- * We also specify a lower bound for the character code to detect

- * redundant, non-"shortest form" encodings. For example, the

- * sequence C0 80 is _not_ a legal representation of the null

- * character. This enforces a 1-to-1 mapping between character

- * codes and their multibyte representations.

- */

- ch = (unsigned char)*s;

- if ((ch & 0x80) == 0) {

- mask = 0x7f;

- len = 1;

- lbound = 0;

- } else if ((ch & 0xe0) == 0xc0) {

- mask = 0x1f;

- len = 2;

- lbound = 0x80;

- } else if ((ch & 0xf0) == 0xe0) {

- mask = 0x0f;

- len = 3;

- lbound = 0x800;

- } else if ((ch & 0xf8) == 0xf0) {

- mask = 0x07;

- len = 4;

- lbound = 0x10000;

- } else if ((ch & 0xfc) == 0xf8) {

- mask = 0x03;

- len = 5;

- lbound = 0x200000;

- } else if ((ch & 0xfc) == 0xfc) {

- mask = 0x01;

- len = 6;

- lbound = 0x4000000;

- } else {

+ if (us->want == 0) {

- * Malformed input; input is not UTF-8.

+ * Determine the number of octets that make up this character

+ * from the first octet, and a mask that extracts the

+ * interesting bits of the first octet. We already know

+ * the character is at least two bytes long.

+ *

+ * We also specify a lower bound for the character code to

+ * detect redundant, non-"shortest form" encodings. For

+ * example, the sequence C0 80 is _not_ a legal representation

+ * of the null character. This enforces a 1-to-1 mapping

+ * between character codes and their multibyte representations.

- errno = EILSEQ;

- return ((size_t)-1);

+ ch = (unsigned char)*s;

+ if ((ch & 0x80) == 0) {

+ mask = 0x7f;

+ want = 1;

+ lbound = 0;

+ } else if ((ch & 0xe0) == 0xc0) {

+ mask = 0x1f;

+ want = 2;

+ lbound = 0x80;

+ } else if ((ch & 0xf0) == 0xe0) {

+ mask = 0x0f;

+ want = 3;

+ lbound = 0x800;

+ } else if ((ch & 0xf8) == 0xf0) {

+ mask = 0x07;

+ want = 4;

+ lbound = 0x10000;

+ } else if ((ch & 0xfc) == 0xf8) {

+ mask = 0x03;

+ want = 5;

+ lbound = 0x200000;

+ } else if ((ch & 0xfc) == 0xfc) {

+ mask = 0x01;

+ want = 6;

+ lbound = 0x4000000;

+ } else {

+ /*

+ * Malformed input; input is not UTF-8.

+ */

+ errno = EILSEQ;

+ return ((size_t)-1);

+ }

+ } else {

+ want = us->want;

+ lbound = us->lbound;

}

- if (n < (size_t)len)

- /* Incomplete multibyte sequence */

- return ((size_t)-2);

* Decode the octet sequence representing the character in chunks

* of 6 bits, most significant first.

- wch = (unsigned char)*s++ & mask;

- i = len;

- while (--i != 0) {

+ if (us->want == 0)

+ wch = (unsigned char)*s++ & mask;

+ else

+ wch = us->ch;

+ for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {

if ((*s & 0xc0) != 0x80) {

* Malformed input; bad characters in the middle

@@ -163,6 +160,13 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,

wch <<= 6;

wch |= *s++ & 0x3f;

}

+ if (i < want) {

+ /* Incomplete multibyte sequence. */

+ us->want = want - i;

+ us->lbound = lbound;

+ us->ch = wch;

+ return ((size_t)-2);

+ }

if (wch < lbound) {

* Malformed input; redundant encoding.

@@ -172,8 +176,8 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,

}

if (pwc != NULL)

*pwc = wch;

- us->count = 0;

- return (wch == L'\0' ? 0 : len - ocount);

+ us->want = 0;

+ return (wch == L'\0' ? 0 : want);

}

size_t

@@ -185,7 +189,7 @@ _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)

us = (_UTF8State *)ps;

- if (us->count < 0 || us->count > sizeof(us->bytes)) {

+ if (us->want != 0) {

errno = EINVAL;

return ((size_t)-1);

}