UTF-8 Encode problems with \u \U

2012-02-18 Thread John Kearney
Configuration Information [Automatically generated, do not change]:
Machine: x86_64
OS: linux-gnu
Compiler: gcc
Compilation CFLAGS:  -DPROGRAM='bash' -DCONF_HOSTTYPE='x86_64'
-DCONF_OSTYPE='linux-gnu' -DCONF_MACHTYPE='x86_64-pc-linux-gnu'
-DCONF_VENDOR='pc' -DLOCALEDIR='/usr/share/locale' -DPACKAGE='bash'
-DSHELL -DHAVE_CONFIG_H   -I.  -I../bash -I../bash/include
-I../bash/lib   -g -O2 -Wall
uname output: Linux DETH00 3.0.0-15-generic #26-Ubuntu SMP Fri Jan 20
17:23:00 UTC 2012 x86_64 x86_64 x86_64 GNU/Linux
Machine Type: x86_64-pc-linux-gnu

Bash Version: 4.2
Patch Level: 10
Release Status: release

Description:
\u and \U incorrectly encode values between \u80 and \uff

Repeat-By:
  printf '%q\n' "$(printf '\uff')"
  printf '%q\n' $'\uff'
  # outputs $'\377' instead of $'\303\277'

Fix:
diff --git a/builtins/printf.def b/builtins/printf.def
index 9eca215..b155160 100644
--- a/builtins/printf.def
+++ b/builtins/printf.def
@@ -859,7 +859,7 @@ tescape (estart, cp, lenp, sawc)
 *cp = '\\';
 return 0;
   }
-if (uvalue <= UCHAR_MAX)
+if (uvalue <= CHAR_MAX)
   *cp = uvalue;
 else
   {
diff --git a/lib/sh/strtrans.c b/lib/sh/strtrans.c
index 2265782..2e6e37b 100644
--- a/lib/sh/strtrans.c
+++ b/lib/sh/strtrans.c
@@ -144,7 +144,7 @@ ansicstr (string, len, flags, sawc, rlen)
   *r++ = '\\';/* c remains unchanged */
   break;
 }
-  else if (v <= UCHAR_MAX)
+  else if (v <= CHAR_MAX)
 {
   c = v;
   break;




Re: UTF-8 Encode problems with \u \U

2012-02-18 Thread Andreas Schwab
John Kearney  writes:

> Fix:
> diff --git a/builtins/printf.def b/builtins/printf.def
> index 9eca215..b155160 100644
> --- a/builtins/printf.def
> +++ b/builtins/printf.def
> @@ -859,7 +859,7 @@ tescape (estart, cp, lenp, sawc)
>  *cp = '\\';
>  return 0;
>}
> -if (uvalue <= UCHAR_MAX)
> +if (uvalue <= CHAR_MAX)

CHAR_MAX has nothing at all to do with UTF-8.

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."



Re: UTF-8 Encode problems with \u \U

2012-02-18 Thread John Kearney
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

I know
To be honest, I get a bad feeling about that code. I'm guessing it was
done for performance reasons. Personally, I'd just remove the special
handling of any values and always call the encoding function, but I was
trying for a minimalist solution.
I mean you could do something like

#define MAX_SINGLE_BYTE_UTF8 0x7F
if (uvalue <= MAX_SINGLE_BYTE_UTF8)

I'm guessing the code was done originally for UTF-2 encoding.

what I suggest will fix the UTF-8 case and not affect the UTF-2 case.


On 02/18/2012 11:11 AM, Andreas Schwab wrote:
> John Kearney  writes:
> 
>> Fix: iff --git a/builtins/printf.def b/builtins/printf.def index 
>> 9eca215..b155160 100644 --- a/builtins/printf.def +++ 
>> b/builtins/printf.def @@ -859,7 +859,7 @@ tescape (estart, cp, 
>> lenp, sawc) *cp = '\\'; return 0; } -if (uvalue <= UCHAR_MAX)
>> +if (uvalue <= CHAR_MAX)
> 
> CHAR_MAX has nothing at all to do with UTF-8.
> 
> Andreas.
> 

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.11 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/

iQEcBAEBAgAGBQJPP3u8AAoJEKUDtR0WmS056GIH/1TO/A8RmRCfTU3llNG1tMJy
MJiby2gdvz2v/Q+Y83llCU01fcQ1tGpp2iOO7rbfYmfdqiJ8iMfNc1pK302Tb77u
HcZSSVQKnBwNpL6eeAhwLVzrpfdcKWY/diQknsiXLtrm0AcPhsrf5Bu/OgHjeu7m
3uyqlcQAvYVKj5Z4eV75Hn1+lrCp26fkjZSOZPN9AH8yv1chQXrYPB+/Wj82Cp/S
sSgupvpmAv3b4HaZhXsA2DPxEEb2ESj/ZaHMC4/AxyABJoub++erxm/k8r3iUDjc
rud6jWoVJcwt+UkVyqi8V8qIJ/urVG01FVoVXTYIiqA73ZdJ3fkLw0PCmliZMtA=
=pZin
-END PGP SIGNATURE-



Re: UTF-8 Encode problems with \u \U

2012-02-18 Thread Andreas Schwab
John Kearney  writes:

> what I suggest will fix the UTF-8 case

No, it won't.

> and not affect the UTF-2 case.

That is impossible.

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."



Re: UTF-8 Encode problems with \u \U

2012-02-18 Thread John Kearney
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

On 02/18/2012 11:29 AM, Andreas Schwab wrote:
> John Kearney  writes:
> 
>> what I suggest will fix the UTF-8 case
> 
> No, it won't.
> 
>> and not affect the UTF-2 case.
> 
> That is impossible.
> 
> Andreas.
> 

Current code
if (uvalue <= UCHAR_MAX)
  *cp = uvalue;
else
  {
temp = u32cconv (uvalue, cp);
cp[temp] = '\0';
if (lenp)
  *lenp = temp;
  }

Robust Code
temp = u32cconv (uvalue, cp);
cp[temp] = '\0';
if (lenp)
  *lenp = temp;

Compromise solution
if (uvalue <= 0x7f)
  *cp = uvalue;
else
  {
temp = u32cconv (uvalue, cp);
cp[temp] = '\0';
if (lenp)
  *lenp = temp;
  }

How can doing a direct assignment, in less cases break anything, if it
does u32cconv is broken.

And it does work for me, so impossible seems to be overstating it.

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.11 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/

iQEcBAEBAgAGBQJPP39rAAoJEKUDtR0WmS052JIH/09at08oGR16hvj2blL4YxWJ
V1Slbkh9O8pJ4DV9NOwEweIpjAxYUzRFzOEVV0tiYzeqISJ36uKnttewiP5VcRSv
heS6QwOl5R3wnx0ecNkpLMo2nT054Fqd+OHSHFOgkBeAM28PVwjT+GmfFyCp1f4K
hPevpejPLyxHYWaXJwy4+1XN0Wp/YatzEXr21pHgU7CPyMGYLbju4su0kNpYledj
5Zo3tT/cvoBGVysJo5AbQ8D07cG85eoARxz6erJatjKDKCUPl1kKdcikG3nGvnQc
66HdR/lJRShDh344uss6/4sw2R9LFut0QP+ChhJowQ9ZBI1uZo7/fn0gQv7gOdo=
=fXLm
-END PGP SIGNATURE-



Fix u32toutf8 so it encodes values > 0xFFFF correctly.

2012-02-18 Thread John Kearney
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Configuration Information [Automatically generated, do not change]:
Machine: x86_64
OS: linux-gnu
Compiler: gcc
Compilation CFLAGS:  -DPROGRAM='bash' -DCONF_HOSTTYPE='x86_64'
- -DCONF_OSTYPE='linux-gnu' -DCONF_MACHTYPE='x86_64-pc-linux-gnu'
- -DCONF_VENDOR='pc' -DLOCALEDIR='/usr/share/locale' -DPACKAGE='bash'
- -DSHELL -DHAVE_CONFIG_H   -I.  -I../bash -I../bash/include
- -I../bash/lib   -g -O2 -Wall
uname output: Linux DETH00 3.0.0-15-generic #26-Ubuntu SMP Fri Jan 20
17:23:00 UTC 2012 x86_64 x86_64 x86_64 GNU/Linux
Machine Type: x86_64-pc-linux-gnu

Bash Version: 4.2
Patch Level: 10
Release Status: release

Description:
Current u32toutf8 only encodes values up to 0xFFFF correctly.
wchar_t can be of ambiguous size; in my opinion it is better to use
unsigned long, or uint32_t, or something clearer.
Repeat-By:
  ---'

Fix:
diff --git a/lib/sh/unicode.c b/lib/sh/unicode.c
index d34fa08..3f7d378 100644
- --- a/lib/sh/unicode.c
+++ b/lib/sh/unicode.c
@@ -54,7 +54,7 @@ extern const char *locale_charset __P((void));
 extern char *get_locale_var __P((char *));
 #endif

- -static int u32init = 0;
+static int u32init = 0;
 static int utf8locale = 0;
 #if defined (HAVE_ICONV)
 static iconv_t localconv;
@@ -115,26 +115,61 @@ u32tochar (wc, s)
 }

 int
- -u32toutf8 (wc, s)
- - wchar_t wc;
+u32toutf8 (c, s)
+ unsigned long c;
  char *s;
 {
   int l;

- -  l = (wc < 0x0080) ? 1 : ((wc < 0x0800) ? 2 : 3);
- -
- -  if (wc < 0x0080)
- -s[0] = (unsigned char)wc;
- -  else if (wc < 0x0800)
+  if (c <= 0x7F)
+{
+  s[0] = (char)c;
+  l = 1;
+}
+  else if (c <= 0x7FF)
+{
+  s[0] = (c >>   6)| 0xc0; /* 110x xxxx */
+  s[1] = (c& 0x3f) | 0x80; /* 10xx xxxx */
+  l = 2;
+}
+  else if (c <= 0xFFFF)
+{
+  s[0] =  (c >> 12) | 0xe0; /* 1110 xxxx */
+  s[1] = ((c >>  6) & 0x3f) | 0x80; /* 10xx xxxx */
+  s[2] =  (c& 0x3f) | 0x80; /* 10xx xxxx */
+  l = 3;
+}
+  else if (c <= 0x1FFFFF)
 {
- -  s[0] = (wc >> 6) | 0xc0;
- -  s[1] = (wc & 0x3f) | 0x80;
+  s[0] =  (c >> 18) | 0xf0; /* 1111 0xxx */
+  s[1] = ((c >> 12) & 0x3f) | 0x80; /* 10xx xxxx */
+  s[2] = ((c >>  6) & 0x3f) | 0x80; /* 10xx xxxx */
+  s[3] = ( c& 0x3f) | 0x80; /* 10xx xxxx */
+  l = 4;
+}
+  else if (c <= 0x3FFFFFF)
+{
+  s[0] =  (c >> 24) | 0xf8; /* 1111 10xx */
+  s[1] = ((c >> 18) & 0x3f) | 0x80; /* 10xx xxxx */
+  s[2] = ((c >> 12) & 0x3f) | 0x80; /* 10xx xxxx */
+  s[3] = ((c >>  6) & 0x3f) | 0x80; /* 10xx xxxx */
+  s[4] = ( c& 0x3f) | 0x80; /* 10xx xxxx */
+  l = 5;
+}
+  else if (c <= 0x7FFFFFFF)
+{
+  s[0] =  (c >> 30) | 0xfc; /* 1111 110x */
+  s[1] = ((c >> 24) & 0x3f) | 0x80; /* 10xx xxxx */
+  s[2] = ((c >> 18) & 0x3f) | 0x80; /* 10xx xxxx */
+  s[3] = ((c >> 12) & 0x3f) | 0x80; /* 10xx xxxx */
+  s[4] = ((c >>  6) & 0x3f) | 0x80; /* 10xx xxxx */
+  s[5] = ( c& 0x3f) | 0x80; /* 10xx xxxx */
+  l = 6;
 }
   else
 {
- -  s[0] = (wc >> 12) | 0xe0;
- -  s[1] = ((wc >> 6) & 0x3f) | 0x80;
- -  s[2] = (wc & 0x3f) | 0x80;
+  /* Error Invalid UTF-8 */
+  l = 0;
 }
   s[l] = '\0';
   return l;
@@ -150,7 +185,7 @@ u32cconv (c, s)
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.11 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/

iQEcBAEBAgAGBQJPP3/tAAoJEKUDtR0WmS059CcH/iIyBOGhf0IgSmnIFyw0YLpA
3ZWSaXWoEZodrDr1fX67hj2424icXm9fTZw70G+rS1YjtCfm86O/Qou4VNROylAv
TbjPUWkHRWVci7IqcDGb1tNWRrulxUvNFA/Uc1xBtKckAO6HHHRTYFa+sCkd5Fnx
dm7e0iMTqMMmL/dUwB+di+hSkGD+ZXS1vY76wizdwG7CteUxAVunse+ffP7TRYbn
K86Whc7p7llG12hruCPGArc9iS7YiBaC/XNIKXmN7fn93dhQTcdzzk/UTGmaZgDk
cQk4R7/NBljP4LtQtKwX4JYAi5XJM5TeSLykL97UFxW/5OGM+SmSVJbKLlHU/mQ=
=EJUb
-END PGP SIGNATURE-



Re: UTF-8 Encode problems with \u \U

2012-02-18 Thread Andreas Schwab
John Kearney  writes:

> How can doing a direct assignment, in less cases break anything,

Where did I say that?

> And it does work for me, so impossible seems to be overstating it.

How is it possible to affect UTF-8 while not affecting UTF-8?

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."



Questionable code behavior in u32cconv?

2012-02-18 Thread John Kearney
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Configuration Information [Automatically generated, do not change]:
Machine: x86_64
OS: linux-gnu
Compiler: gcc
Compilation CFLAGS:  -DPROGRAM='bash' -DCONF_HOSTTYPE='x86_64'
- -DCONF_OSTYPE='linux-gnu' -DCONF_MACHTYPE='x86_64-pc-linux-gnu'
- -DCONF_VENDOR='pc' -DLOCALEDIR='/usr/share/locale' -DPACKAGE='bash'
- -DSHELL -DHAVE_CONFIG_H   -I.  -I../bash -I../bash/include
- -I../bash/lib   -g -O2 -Wall
uname output: Linux DETH00 3.0.0-15-generic #26-Ubuntu SMP Fri Jan 20
17:23:00 UTC 2012 x86_64 x86_64 x86_64 GNU/Linux
Machine Type: x86_64-pc-linux-gnu

Bash Version: 4.2
Patch Level: 10
Release Status: release

Description:
Now, I may be misreading the code, but it looks like the code relating
to iconv only checks the destination charset the first time the code
is executed, thereby breaking the following functionality:
LC_CTYPE=C printf '\uff'
LC_CTYPE=C.UTF-8 printf '\uff'

Repeat-By:
haven't seen the problem.

Fix:
  Not so much a fix as a modification that should hopefully clarify my
concern.



diff --git a/lib/sh/unicode.c b/lib/sh/unicode.c
index d34fa08..3f7d378 100644
- --- a/lib/sh/unicode.c
+++ b/lib/sh/unicode.c
@@ -54,7 +54,7 @@ extern const char *locale_charset __P((void));
 extern char *get_locale_var __P((char *));
 #endif

- -static int u32init = 0;
+const char *charset;
 static int utf8locale = 0;
 #if defined (HAVE_ICONV)
 static iconv_t localconv;
@@ -115,26 +115,61 @@ u32tochar (wc, s)
 }

@@ -150,7 +185,7 @@ u32cconv (c, s)
   wchar_t wc;
   int n;
 #if HAVE_ICONV
- -  const char *charset;
+  const char *ncharset;
   char obuf[25], *optr;
   size_t obytesleft;
   const char *iptr;
@@ -171,20 +206,22 @@ u32cconv (c, s)
   codeset = nl_langinfo (CODESET);
   if (STREQ (codeset, "UTF-8"))
 {
   n = u32toutf8 (wc, s);
   return n;
 }
 #endif

 #if HAVE_ICONV
- -  /* this is mostly from coreutils-8.5/lib/unicodeio.c */
- -  if (u32init == 0)
- -{
 #  if HAVE_LOCALE_CHARSET
- -  charset = locale_charset ();   /* XXX - fix later */
+  ncharset = locale_charset ();/* XXX - fix later */
 #  else
- -  charset = stub_charset ();
+  ncharset = stub_charset ();
 #  endif
+  /* this is mostly from coreutils-8.5/lib/unicodeio.c */
+  if (STREQ (charset, ncharset))
+{
+  /* Free Old charset str ? */
+  charset=ncharset;
   if (STREQ (charset, "UTF-8"))
utf8locale = 1;
   else
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.11 (GNU/Linux)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/

iQEcBAEBAgAGBQJPP5SCAAoJEKUDtR0WmS05L8QH/RUz/X8QZk7HXDIFUTCd0Eah
MkfWpCtib9Jt5jUBcb+/UZKiwTSxYGm7D9X08Tpho+i7c+3kknWUGTkivqg7eVo4
TlRA+N4k3x8PdpbYPFNGxgy9LRSViQjqbbzNfYaX+Pbi2YIbZRuaPBipEdbvBqDG
bN7KaUM/97vZicZn5SOrhcDiq1RfJosdTkr7egEON4P4BBIXIVk4vRcCF/xXCw6M
w2BmvpavV3ra1TXhYN2C678qMyncq5kr8e0tvIl4EY6oCurMlvXhoNkOcz14fOMa
XrYJUu1dDNKXmTsJFjDGZhyzvTejLVezjn91/so2OINinqHW++2IMFim5ED9w28=
=rW+v
-END PGP SIGNATURE-