From 0cadb4301d18724e7513d7489cb5bebd262c82f1 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Fri, 11 Sep 2009 17:03:06 -0400 Subject: convert to 4-byte UTF-8 and 32-bit Rune http://codereview.appspot.com/116075 --- src/lib9/fmt/dofmt.c | 11 ++++---- src/lib9/utf/rune.c | 78 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 65 insertions(+), 24 deletions(-) (limited to 'src/lib9') diff --git a/src/lib9/fmt/dofmt.c b/src/lib9/fmt/dofmt.c index 214b71db..353c76e3 100644 --- a/src/lib9/fmt/dofmt.c +++ b/src/lib9/fmt/dofmt.c @@ -605,12 +605,13 @@ __flagfmt(Fmt *f) int __badfmt(Fmt *f) { - char x[3]; + char x[2+UTFmax]; + int n; x[0] = '%'; - x[1] = f->r; - x[2] = '%'; - f->prec = 3; - __fmtcpy(f, (const void*)x, 3, 3); + n = 1 + runetochar(x+1, &f->r); + x[n++] = '%'; + f->prec = n; + __fmtcpy(f, (const void*)x, n, n); return 0; } diff --git a/src/lib9/utf/rune.c b/src/lib9/utf/rune.c index 3d6831b0..f5944806 100644 --- a/src/lib9/utf/rune.c +++ b/src/lib9/utf/rune.c @@ -23,16 +23,19 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -113,7 +135,7 @@ runetochar(char *str, Rune *rune) /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -123,12 +145,26 @@ runetochar(char *str, Rune *rune) /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -155,7 +191,10 @@ runenlen(Rune *r, int nrune) if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -165,13 +204,14 @@ fullrune(char *str, int n) { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } -- cgit v1.2.3