UTF-8 Encode problems with \u \U
Configuration Information [Automatically generated, do not change]: Machine: x86_64 OS: linux-gnu Compiler: gcc Compilation CFLAGS: -DPROGRAM='bash' -DCONF_HOSTTYPE='x86_64' -DCONF_OSTYPE='linux-gnu' -DCONF_MACHTYPE='x86_64-pc-linux-gnu' -DCONF_VENDOR='pc' -DLOCALEDIR='/usr/share/locale' -DPACKAGE='bash' -DSHELL -DHAVE_CONFIG_H -I. -I../bash -I../bash/include -I../bash/lib -g -O2 -Wall uname output: Linux DETH00 3.0.0-15-generic #26-Ubuntu SMP Fri Jan 20 17:23:00 UTC 2012 x86_64 x86_64 x86_64 GNU/Linux Machine Type: x86_64-pc-linux-gnu Bash Version: 4.2 Patch Level: 10 Release Status: release Description: \u and \U incorrectly encode values between \u80 and \uff Repeat-By: printf '%q\n' "$(printf '\uff')" printf '%q\n' $'\uff' # outputs $'\377' instead of $'\303\277' Fix: diff --git a/builtins/printf.def b/builtins/printf.def index 9eca215..b155160 100644 --- a/builtins/printf.def +++ b/builtins/printf.def @@ -859,7 +859,7 @@ tescape (estart, cp, lenp, sawc) *cp = '\\'; return 0; } -if (uvalue <= UCHAR_MAX) +if (uvalue <= CHAR_MAX) *cp = uvalue; else { diff --git a/lib/sh/strtrans.c b/lib/sh/strtrans.c index 2265782..2e6e37b 100644 --- a/lib/sh/strtrans.c +++ b/lib/sh/strtrans.c @@ -144,7 +144,7 @@ ansicstr (string, len, flags, sawc, rlen) *r++ = '\\';/* c remains unchanged */ break; } - else if (v <= UCHAR_MAX) + else if (v <= CHAR_MAX) { c = v; break;
Re: UTF-8 Encode problems with \u \U
John Kearney writes: > Fix: > iff --git a/builtins/printf.def b/builtins/printf.def > index 9eca215..b155160 100644 > --- a/builtins/printf.def > +++ b/builtins/printf.def > @@ -859,7 +859,7 @@ tescape (estart, cp, lenp, sawc) > *cp = '\\'; > return 0; >} > -if (uvalue <= UCHAR_MAX) > +if (uvalue <= CHAR_MAX) CHAR_MAX has nothing at all to do with UTF-8. Andreas. -- Andreas Schwab, sch...@linux-m68k.org GPG Key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5 "And now for something completely different."
Re: UTF-8 Encode problems with \u \U
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 I know. To be honest, I get a bad feeling with that code; I'm guessing it was done for performance reasons. Personally I'd just remove the special handling of any values and always call the encoding function, but I was trying for a minimalist solution. I mean you could do something like #define MAX_SINGLE_BYTE_UTF8 0x7F if (uvalue <= MAX_SINGLE_BYTE_UTF8) I'm guessing the code was done originally for UTF-2 encoding. What I suggest will fix the UTF-8 case and not affect the UTF-2 case. On 02/18/2012 11:11 AM, Andreas Schwab wrote: > John Kearney writes: > >> Fix: iff --git a/builtins/printf.def b/builtins/printf.def index >> 9eca215..b155160 100644 --- a/builtins/printf.def +++ >> b/builtins/printf.def @@ -859,7 +859,7 @@ tescape (estart, cp, >> lenp, sawc) *cp = '\\'; return 0; } -if (uvalue <= UCHAR_MAX) >> +if (uvalue <= CHAR_MAX) > > CHAR_MAX has nothing at all to do with UTF-8. > > Andreas. > -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.11 (GNU/Linux) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/ iQEcBAEBAgAGBQJPP3u8AAoJEKUDtR0WmS056GIH/1TO/A8RmRCfTU3llNG1tMJy MJiby2gdvz2v/Q+Y83llCU01fcQ1tGpp2iOO7rbfYmfdqiJ8iMfNc1pK302Tb77u HcZSSVQKnBwNpL6eeAhwLVzrpfdcKWY/diQknsiXLtrm0AcPhsrf5Bu/OgHjeu7m 3uyqlcQAvYVKj5Z4eV75Hn1+lrCp26fkjZSOZPN9AH8yv1chQXrYPB+/Wj82Cp/S sSgupvpmAv3b4HaZhXsA2DPxEEb2ESj/ZaHMC4/AxyABJoub++erxm/k8r3iUDjc rud6jWoVJcwt+UkVyqi8V8qIJ/urVG01FVoVXTYIiqA73ZdJ3fkLw0PCmliZMtA= =pZin -END PGP SIGNATURE-
Re: UTF-8 Encode problems with \u \U
John Kearney writes: > what I suggest will fix the UTF-8 case No, it won't. > and not affect the UTF-2 case. That is impossible. Andreas. -- Andreas Schwab, sch...@linux-m68k.org GPG Key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5 "And now for something completely different."
Re: UTF-8 Encode problems with \u \U
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 02/18/2012 11:29 AM, Andreas Schwab wrote: > John Kearney writes: > >> what I suggest will fix the UTF-8 case > > No, it won't. > >> and not affect the UTF-2 case. > > That is impossible. > > Andreas. > Current code if (uvalue <= UCHAR_MAX) *cp = uvalue; else { temp = u32cconv (uvalue, cp); cp[temp] = '\0'; if (lenp) *lenp = temp; } Robust Code temp = u32cconv (uvalue, cp); cp[temp] = '\0'; if (lenp) *lenp = temp; Compromise solution if (uvalue <= 0x7f) *cp = uvalue; else { temp = u32cconv (uvalue, cp); cp[temp] = '\0'; if (lenp) *lenp = temp; } How can doing a direct assignment, in less cases break anything, if it does u32cconv is broken. And it does work for me, so impossible seems to be overstating it. -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.11 (GNU/Linux) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/ iQEcBAEBAgAGBQJPP39rAAoJEKUDtR0WmS052JIH/09at08oGR16hvj2blL4YxWJ V1Slbkh9O8pJ4DV9NOwEweIpjAxYUzRFzOEVV0tiYzeqISJ36uKnttewiP5VcRSv heS6QwOl5R3wnx0ecNkpLMo2nT054Fqd+OHSHFOgkBeAM28PVwjT+GmfFyCp1f4K hPevpejPLyxHYWaXJwy4+1XN0Wp/YatzEXr21pHgU7CPyMGYLbju4su0kNpYledj 5Zo3tT/cvoBGVysJo5AbQ8D07cG85eoARxz6erJatjKDKCUPl1kKdcikG3nGvnQc 66HdR/lJRShDh344uss6/4sw2R9LFut0QP+ChhJowQ9ZBI1uZo7/fn0gQv7gOdo= =fXLm -END PGP SIGNATURE-
Fix u32toutf8 so it encodes values > 0xFFFF correctly.
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Configuration Information [Automatically generated, do not change]: Machine: x86_64 OS: linux-gnu Compiler: gcc Compilation CFLAGS: -DPROGRAM='bash' -DCONF_HOSTTYPE='x86_64' - -DCONF_OSTYPE='linux-gnu' -DCONF_MACHTYPE='x86_64-pc-linux-gnu' - -DCONF_VENDOR='pc' -DLOCALEDIR='/usr/share/locale' -DPACKAGE='bash' - -DSHELL -DHAVE_CONFIG_H -I. -I../bash -I../bash/include - -I../bash/lib -g -O2 -Wall uname output: Linux DETH00 3.0.0-15-generic #26-Ubuntu SMP Fri Jan 20 17:23:00 UTC 2012 x86_64 x86_64 x86_64 GNU/Linux Machine Type: x86_64-pc-linux-gnu Bash Version: 4.2 Patch Level: 10 Release Status: release Description: Current u32toutf8 only encodes values below 0xFFFF correctly. wchar_t can be of ambiguous size; it is better, in my opinion, to use unsigned long, or uint32_t, or something clearer. Repeat-By: ---' Fix: diff --git a/lib/sh/unicode.c b/lib/sh/unicode.c index d34fa08..3f7d378 100644 - --- a/lib/sh/unicode.c +++ b/lib/sh/unicode.c @@ -54,7 +54,7 @@ extern const char *locale_charset __P((void)); extern char *get_locale_var __P((char *)); #endif - -static int u32init = 0; +static int u32init = 0; static int utf8locale = 0; #if defined (HAVE_ICONV) static iconv_t localconv; @@ -115,26 +115,61 @@ u32tochar (wc, s) } int - -u32toutf8 (wc, s) - - wchar_t wc; +u32toutf8 (c, s) + unsigned long c; char *s; { int l; - - l = (wc < 0x0080) ? 1 : ((wc < 0x0800) ? 
2 : 3); - - - - if (wc < 0x0080) - -s[0] = (unsigned char)wc; - - else if (wc < 0x0800) + if (c <= 0x7F) +{ + s[0] = (char)c; + l = 1; +} + else if (c <= 0x7FF) +{ + s[0] = (c >> 6)| 0xc0; /* 110xxxxx */ + s[1] = (c& 0x3f) | 0x80; /* 10xxxxxx */ + l = 2; +} + else if (c <= 0xFFFF) +{ + s[0] = (c >> 12) | 0xe0; /* 1110xxxx */ + s[1] = ((c >> 6) & 0x3f) | 0x80; /* 10xxxxxx */ + s[2] = (c& 0x3f) | 0x80; /* 10xxxxxx */ + l = 3; +} + else if (c <= 0x1FFFFF) { - - s[0] = (wc >> 6) | 0xc0; - - s[1] = (wc & 0x3f) | 0x80; + s[0] = (c >> 18) | 0xf0; /* 11110xxx */ + s[1] = ((c >> 12) & 0x3f) | 0x80; /* 10xxxxxx */ + s[2] = ((c >> 6) & 0x3f) | 0x80; /* 10xxxxxx */ + s[3] = ( c& 0x3f) | 0x80; /* 10xxxxxx */ + l = 4; +} + else if (c <= 0x3FFFFFF) +{ + s[0] = (c >> 24) | 0xf8; /* 111110xx */ + s[1] = ((c >> 18) & 0x3f) | 0x80; /* 10xxxxxx */ + s[2] = ((c >> 12) & 0x3f) | 0x80; /* 10xxxxxx */ + s[3] = ((c >> 6) & 0x3f) | 0x80; /* 10xxxxxx */ + s[4] = ( c& 0x3f) | 0x80; /* 10xxxxxx */ + l = 5; +} + else if (c <= 0x7FFFFFFF) +{ + s[0] = (c >> 30) | 0xfc; /* 1111110x */ + s[1] = ((c >> 24) & 0x3f) | 0x80; /* 10xxxxxx */ + s[2] = ((c >> 18) & 0x3f) | 0x80; /* 10xxxxxx */ + s[3] = ((c >> 12) & 0x3f) | 0x80; /* 10xxxxxx */ + s[4] = ((c >> 6) & 0x3f) | 0x80; /* 10xxxxxx */ + s[5] = ( c& 0x3f) | 0x80; /* 10xxxxxx */ + l = 6; } else { - - s[0] = (wc >> 12) | 0xe0; - - s[1] = ((wc >> 6) & 0x3f) | 0x80; - - s[2] = (wc & 0x3f) | 0x80; + /* Error Invalid UTF-8 */ + l = 0; } s[l] = '\0'; return l; @@ -150,7 +185,7 @@ u32cconv (c, s) -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.11 (GNU/Linux) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/ iQEcBAEBAgAGBQJPP3/tAAoJEKUDtR0WmS059CcH/iIyBOGhf0IgSmnIFyw0YLpA 3ZWSaXWoEZodrDr1fX67hj2424icXm9fTZw70G+rS1YjtCfm86O/Qou4VNROylAv TbjPUWkHRWVci7IqcDGb1tNWRrulxUvNFA/Uc1xBtKckAO6HHHRTYFa+sCkd5Fnx dm7e0iMTqMMmL/dUwB+di+hSkGD+ZXS1vY76wizdwG7CteUxAVunse+ffP7TRYbn K86Whc7p7llG12hruCPGArc9iS7YiBaC/XNIKXmN7fn93dhQTcdzzk/UTGmaZgDk cQk4R7/NBljP4LtQtKwX4JYAi5XJM5TeSLykL97UFxW/5OGM+SmSVJbKLlHU/mQ= =EJUb -END PGP SIGNATURE-
Re: UTF-8 Encode problems with \u \U
John Kearney writes: > How can doing a direct assignment, in less cases break anything, Where did I say that? > And it does work for me, so impossible seems to be overstating it. How is it possible to affect UTF-8 while not affecting UTF-8? Andreas. -- Andreas Schwab, sch...@linux-m68k.org GPG Key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5 "And now for something completely different."
Questionable code behavior in u32cconv?
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Configuration Information [Automatically generated, do not change]: Machine: x86_64 OS: linux-gnu Compiler: gcc Compilation CFLAGS: -DPROGRAM='bash' -DCONF_HOSTTYPE='x86_64' - -DCONF_OSTYPE='linux-gnu' -DCONF_MACHTYPE='x86_64-pc-linux-gnu' - -DCONF_VENDOR='pc' -DLOCALEDIR='/usr/share/locale' -DPACKAGE='bash' - -DSHELL -DHAVE_CONFIG_H -I. -I../bash -I../bash/include - -I../bash/lib -g -O2 -Wall uname output: Linux DETH00 3.0.0-15-generic #26-Ubuntu SMP Fri Jan 20 17:23:00 UTC 2012 x86_64 x86_64 x86_64 GNU/Linux Machine Type: x86_64-pc-linux-gnu Bash Version: 4.2 Patch Level: 10 Release Status: release Description: Now I may be misreading the code, but it looks like the code relating to iconv only checks the destination charset the first time it is executed, thereby breaking the following functionality: LC_CTYPE=C printf '\uff' LC_CTYPE=C.UTF-8 printf '\uff' Repeat-By: haven't seen the problem. Fix: Not so much a fix as a modification that should hopefully clarify my concern. 
diff --git a/lib/sh/unicode.c b/lib/sh/unicode.c index d34fa08..3f7d378 100644 - --- a/lib/sh/unicode.c +++ b/lib/sh/unicode.c @@ -54,7 +54,7 @@ extern const char *locale_charset __P((void)); extern char *get_locale_var __P((char *)); #endif - -static int u32init = 0; +const char *charset; static int utf8locale = 0; #if defined (HAVE_ICONV) static iconv_t localconv; @@ -115,26 +115,61 @@ u32tochar (wc, s) } @@ -150,7 +185,7 @@ u32cconv (c, s) wchar_t wc; int n; #if HAVE_ICONV - - const char *charset; + const char *ncharset; char obuf[25], *optr; size_t obytesleft; const char *iptr; @@ -171,20 +206,22 @@ u32cconv (c, s) codeset = nl_langinfo (CODESET); if (STREQ (codeset, "UTF-8")) { n = u32toutf8 (wc, s); return n; } #endif #if HAVE_ICONV - - /* this is mostly from coreutils-8.5/lib/unicodeio.c */ - - if (u32init == 0) - -{ # if HAVE_LOCALE_CHARSET - - charset = locale_charset (); /* XXX - fix later */ + ncharset = locale_charset ();/* XXX - fix later */ # else - - charset = stub_charset (); + ncharset = stub_charset (); # endif + /* this is mostly from coreutils-8.5/lib/unicodeio.c */ + if (STREQ (charset, ncharset)) +{ + /* Free Old charset str ? */ + charset=ncharset; if (STREQ (charset, "UTF-8")) utf8locale = 1; else -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.11 (GNU/Linux) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/ iQEcBAEBAgAGBQJPP5SCAAoJEKUDtR0WmS05L8QH/RUz/X8QZk7HXDIFUTCd0Eah MkfWpCtib9Jt5jUBcb+/UZKiwTSxYGm7D9X08Tpho+i7c+3kknWUGTkivqg7eVo4 TlRA+N4k3x8PdpbYPFNGxgy9LRSViQjqbbzNfYaX+Pbi2YIbZRuaPBipEdbvBqDG bN7KaUM/97vZicZn5SOrhcDiq1RfJosdTkr7egEON4P4BBIXIVk4vRcCF/xXCw6M w2BmvpavV3ra1TXhYN2C678qMyncq5kr8e0tvIl4EY6oCurMlvXhoNkOcz14fOMa XrYJUu1dDNKXmTsJFjDGZhyzvTejLVezjn91/so2OINinqHW++2IMFim5ED9w28= =rW+v -END PGP SIGNATURE-