In Cygwin 1.7.2, the wctob() function clobbers the %ebx register, which belongs to the caller. The effects are random behaviour and crashes in the caller.
How to reproduce: ================= Compile this program, consisting of 2 parts, without optimization. It works fine. ================================= bugpart1.c ================================= #include <wchar.h> #include <locale.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <limits.h> #define ASSERT(expr) \ do \ { \ if (!(expr)) \ { \ fprintf (stderr, "%s:%d: assertion failed\n", \ __FILE__, __LINE__); \ fflush (stderr); \ abort (); \ } \ } \ while (0) #define BUFSIZE 10 void dumpbuf(wchar_t buf[BUFSIZE]) { int i; printf ("buf ="); for (i = 0; i < BUFSIZE; i++) printf(" %04X", buf[i]); printf ("\n"); fflush (stdout); } void dumpstate(const char *prefix, mbstate_t *statep) { int i; printf ("%s = ", prefix); for (i = 0; i < sizeof (mbstate_t); i++) printf("%02X", ((unsigned char *)statep)[i]); printf ("\n"); fflush (stdout); } void step1 (wchar_t buf[BUFSIZE]) { size_t i; for (i = 0; i < BUFSIZE; i++) buf[i] = (wchar_t) 0xBADFACE; } void step2 (mbstate_t *statep) { memset (statep, '\0', sizeof (mbstate_t)); } void step3 (wchar_t buf[BUFSIZE], mbstate_t *statep, char *input) { wchar_t wc; size_t ret; wc = (wchar_t) 0xBADFACE; ret = mbrtowc (&wc, input + 1, 1, statep); ASSERT (ret == (size_t)(-2)); ASSERT (wc == (wchar_t) 0xBADFACE); ASSERT (!mbsinit (statep)); input[1] = '\0'; dumpbuf(buf); dumpstate("state",statep); } void step4 (wchar_t buf[BUFSIZE], mbstate_t *statep, mbstate_t *temp_statep, char *input) { const char *src; size_t ret; src = input + 2; *temp_statep = *statep; ret = mbsrtowcs (NULL, &src, 2, temp_statep); ASSERT (ret == 4); ASSERT (src == input + 2); ASSERT (!mbsinit (statep)); dumpbuf(buf); dumpstate("state",statep); dumpstate("temps",temp_statep); } extern void step5 (wchar_t buf[BUFSIZE], mbstate_t *statep, char *input); int main (int argc, char *argv[]) { if (setlocale (LC_ALL, "fr_FR.UTF-8") == NULL) return 1; { wchar_t buf[BUFSIZE]; mbstate_t state; mbstate_t temp_state; step1 (buf); /* Locale 
encoding is UTF-8. */ { char input[] = "B\303\274\303\237er"; step2 (&state); dumpbuf(buf); dumpstate("state",&state); step3 (buf, &state, input); #if 1 step4 (buf, &state, &temp_state, input); #else { const char *src; size_t ret; src = input + 2; temp_state = state; ret = mbsrtowcs (NULL, &src, 2, &temp_state); ASSERT (ret == 4); ASSERT (src == input + 2); ASSERT (!mbsinit (&state)); dumpbuf(buf); dumpstate("state",&state); dumpstate("temps",&temp_state); } #endif #if 1 step5 (buf, &state, input); #else { const char *src; size_t ret; src = input + 2; ret = mbsrtowcs (buf, &src, 2, &state); ASSERT (ret == 2); ASSERT (src == input + 5); dumpbuf(buf); dumpstate("state",&state); ASSERT (wctob (buf[0]) == EOF); ASSERT (wctob (buf[1]) == EOF); ASSERT (buf[2] == (wchar_t) 0xBADFACE); ASSERT (mbsinit (&state)); } #endif } } return 0; } ================================= bugpart2.c ================================= #include <wchar.h> #include <locale.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <limits.h> #define ASSERT(expr) \ do \ { \ if (!(expr)) \ { \ fprintf (stderr, "%s:%d: assertion failed\n", \ __FILE__, __LINE__); \ fflush (stderr); \ abort (); \ } \ } \ while (0) #define BUFSIZE 10 extern void dumpbuf(wchar_t buf[BUFSIZE]); extern void dumpstate(const char *prefix, mbstate_t *statep); void step5 (wchar_t buf[BUFSIZE], mbstate_t *statep, char *input) { const char *src; size_t ret; src = input + 2; ret = mbsrtowcs (buf, &src, 2, statep); ASSERT (ret == 2); ASSERT (src == input + 5); dumpbuf(buf); dumpstate("state",statep); ASSERT (wctob (buf[0]) == EOF); ASSERT (wctob (buf[1]) == EOF); ASSERT (buf[2] == (wchar_t) 0xBADFACE); ASSERT (mbsinit (statep)); } ============================================================================== $ gcc -c bugpart1.c -Wall $ gcc -c bugpart2.c -Wall $ gcc bugpart1.o bugpart2.o $ ./a.exe buf = FACE FACE FACE FACE FACE FACE FACE FACE FACE FACE state = 0000000000000000 buf = FACE FACE FACE 
FACE FACE FACE FACE FACE FACE FACE state = 01000000C3000000 buf = FACE FACE FACE FACE FACE FACE FACE FACE FACE FACE state = 01000000C3000000 temps = 00000000C3000000 buf = 00FC 00DF FACE FACE FACE FACE FACE FACE FACE FACE state = 00000000C3000000 Then compile bugpart2 with optimization. The program crashes: $ gcc -c bugpart2.c -Wall -O $ gcc bugpart1.o bugpart2.o $ ./a.exe buf = FACE FACE FACE FACE FACE FACE FACE FACE FACE FACE state = 0000000000000000 buf = FACE FACE FACE FACE FACE FACE FACE FACE FACE FACE state = 01000000C3000000 buf = FACE FACE FACE FACE FACE FACE FACE FACE FACE FACE state = 01000000C3000000 temps = 00000000C3000000 buf = 00FC 00DF FACE FACE FACE FACE FACE FACE FACE FACE state = 00000000C3000000 bugpart2.c:38: assertion failed bash: [5528: 1] tcsetattr: Inappropriate ioctl for device Aborted (core dumped) Known facts: - When GCC optimizes, it allocates variables in registers. In this case, in bugpart2, the variable 'buf' gets tied to register %ebx. - %ebx is a callee-saved register; see the value of CALL_USED_REGISTERS in gcc-4.5.0/gcc/config/i386/i386.h. Then single-step through bugpart2 (with gdb's 'nexti' command), while looking at the values of the saved registers. The gcc generated code is correct. Here it is, with comments on the right-hand side: ------------------------------------------------------------------------------------- _step5: pushl %ebp movl %esp, %ebp pushl %edi pushl %esi pushl %ebx subl $44, %esp movl 8(%ebp), %ebx buf movl 12(%ebp), %edi statep movl 16(%ebp), %esi input leal 2(%esi), %eax input+2 movl %eax, -16(%ebp) src movl %edi, 12(%esp) movl $2, 8(%esp) leal -16(%ebp), %eax movl %eax, 4(%esp) movl %ebx, (%esp) %ebx=0x22cd10 %esi=0x22ccf8 %edi=0x22cd08 call _mbsrtowcs call mbsrtowcs %ebx=0x22cd10 %esi=0x22ccf8 %edi=0x22cd08 cmpl $2, %eax ret == 2 je L2 ... L2: leal 5(%esi), %eax input+5 cmpl %eax, -16(%ebp) == src je L3 ... 
L3: movl %ebx, (%esp) buf %ebx=0x22cd10 %esi=0x22ccf8 %edi=0x22cd08 call _dumpbuf %ebx=0x22cd10 %esi=0x22ccf8 %edi=0x22cd08 movl %edi, 4(%esp) statep movl $LC2, (%esp) %ebx=0x22cd10 %esi=0x22ccf8 %edi=0x22cd08 call _dumpstate %ebx=0x22cd10 %esi=0x22ccf8 %edi=0x22cd08 movzwl (%ebx), %eax buf[0] movl %eax, (%esp) %ebx=0x22cd10 %esi=0x22ccf8 %edi=0x22cd08 call _wctob %ebx=0x22cdbc %esi=0x22ccf8 %edi=0x22cd08 cmpl $-1, %eax je L4 ... L4: movzwl 2(%ebx), %eax buf[1] movl %eax, (%esp) %ebx=0x22cdbc %esi=0x22ccf8 %edi=0x22cd08 call _wctob %ebx=0x228084 %esi=0x22ccf8 %edi=0x22cd08 cmpl $-1, %eax je L5 ... L5: cmpw $-1330, 4(%ebx) buf[2] == 0xbadface je L6 .p2align 4,,6 ... L6: movl %edi, (%esp) statep call _mbsinit testl %eax, %eax jne L8 ... L8: addl $44, %esp popl %ebx popl %esi popl %edi popl %ebp ret ------------------------------------------------------------------------------------- You can see that across each call to wctob, %ebx is clobbered. Origin of the bug: ================== This is the code in wctob.c: int wctob (wint_t c) { mbstate_t mbs; int retval = 0; unsigned char pwc; /* Put mbs in initial state. 
*/ memset (&mbs, '\0', sizeof (mbs)); _REENT_CHECK_MISC(_REENT); retval = __wctomb (_REENT, &pwc, c, __locale_charset (), &mbs); if (c == EOF || retval != 1) return WEOF; else return (int)pwc; } And this is its disassembly: ------------------------------------------------------------------------------- 0x6110d510 <wctob>: push %ebp 0x6110d511 <wctob+1>: mov %esp,%ebp 0x6110d513 <wctob+3>: sub $0x38,%esp 0x6110d516 <wctob+6>: mov %ebx,-0xc(%ebp) save %ebx 0x6110d519 <wctob+9>: lea -0x18(%ebp),%ebx &mbs 0x6110d51c <wctob+12>: mov %esi,-0x8(%ebp) save %esi 0x6110d51f <wctob+15>: mov %edi,-0x4(%ebp) save %edi 0x6110d522 <wctob+18>: mov 0x8(%ebp),%edi c 0x6110d525 <wctob+21>: movl $0x8,0x8(%esp) 0x6110d52d <wctob+29>: movl $0x0,0x4(%esp) 0x6110d535 <wctob+37>: mov %ebx,(%esp) 0x6110d538 <wctob+40>: call 0x61107d30 <memset> call memset 0x6110d53d <wctob+45>: mov 0x6115da24,%esi 0x6110d543 <wctob+51>: call 0x61103a50 <__locale_charset> 0x6110d548 <wctob+56>: mov %ebx,0x10(%esp) 0x6110d54c <wctob+60>: mov %eax,0xc(%esp) 0x6110d550 <wctob+64>: movzwl %di,%eax 0x6110d553 <wctob+67>: mov %eax,0x8(%esp) 0x6110d557 <wctob+71>: lea -0xd(%ebp),%eax &pwc 0x6110d55a <wctob+74>: mov %eax,0x4(%esp) 0x6110d55e <wctob+78>: mov %fs:0x4,%eax 0x6110d564 <wctob+84>: sub $0x3000,%eax 0x6110d569 <wctob+89>: mov %eax,(%esp) _REENT 0x6110d56c <wctob+92>: call *%esi call __wctomb 0x6110d56e <wctob+94>: add $0x1,%edi 0x6110d571 <wctob+97>: je 0x6110d578 <wctob+104> 0x6110d573 <wctob+99>: sub $0x1,%eax 0x6110d576 <wctob+102>: je 0x6110d590 <wctob+128> 0x6110d578 <wctob+104>: mov $0xffffffff,%eax 0x6110d57d <wctob+109>: mov -0xc(%ebp),%ebx restore %ebx 0x6110d580 <wctob+112>: mov -0x8(%ebp),%esi restore %esi 0x6110d583 <wctob+115>: mov -0x4(%ebp),%edi restore %edi 0x6110d586 <wctob+118>: mov %ebp,%esp 0x6110d588 <wctob+120>: pop %ebp 0x6110d589 <wctob+121>: ret 0x6110d590 <wctob+128>: movzbl -0xd(%ebp),%eax 0x6110d594 <wctob+132>: jmp 0x6110d57d <wctob+109> 
------------------------------------------------------------------------------- You can see that the area where %ebx is saved is in the bytes %ebp-12..%ebp-9. And in %ebp-13 you have the 'pwc' variable. The bug is that you are passing a 1-byte buffer to a function which will write up to MB_CUR_MAX bytes into this buffer. Of course it will clobber the memory area next to the 1-byte buffer, and this is the %ebx save area! This code dates back to 2002. When Cygwin did not support multibyte encodings, MB_CUR_MAX was effectively 1 always. But now, for the UTF-8 encoding at least, MB_CUR_MAX is effectively 4. Bruno -- Problem reports: http://cygwin.com/problems.html FAQ: http://cygwin.com/faq/ Documentation: http://cygwin.com/docs.html Unsubscribe info: http://cygwin.com/ml/#unsubscribe-simple