I wrote: > Level 2: Behave correctly, except that a 2-Unicode-character sequence > may be split although it shouldn't. > This is what code that uses mbrtoc32() does, when it has the > lines > if (bytes == (size_t) -3) > bytes = 0; > Without these lines, the string pointer could be decremented > by 3, thus accessing invalid memory or running into an endless > loop. > This level is also what > printf (".*s", nbytes, string); > does: it truncates strings at a position where they should not > be truncated. So, it's not terribly uncommon. > > Level 3: Behave correctly. Don't split a 2-Unicode-character sequence. > This is what code that uses mbrtoc32() does, when it has the > lines > if (bytes == (size_t) -3) > bytes = 0; > and uses !mbsinit (&state) in the loop termination condition.
The patch below brings all mbrtoc32 uses in gnulib to level 3. There is nothing to do - in mbswidth.c and quotearg.c, because mbsinit (&state) is already part of the loop termination condition, - in mbuiter.h, because the termination condition is a NUL character. 2023-07-04 Bruno Haible <br...@clisp.org> mbiter, mbfile, mbmemcasecoll: Improve handling of mbrtoc32 result. * lib/mbiter.h (mbi_avail): If cur.ptr has reached the limit but in_shift is true, call mbiter_multi_next. (mbiter_multi_next): Set in_shift to false after an incomplete multibyte character. * lib/mbfile.h (mbfile_multi_getc): Pass the input bytes incrementally into mbrtoc32. When mbf->state is not in the initial state, call mbrtoc32 again. * lib/mbmemcasecoll.c (apply_c32tolower): When the state is not in the initial state, call mbrtoc32 again. (diff -w patches below) diff --git a/lib/mbiter.h b/lib/mbiter.h index e4963010f3..8bd83d7262 100644 --- a/lib/mbiter.h +++ b/lib/mbiter.h @@ -155,8 +155,10 @@ mbiter_multi_next (struct mbiter_multi *iter) /* An incomplete multibyte character at the end. */ iter->cur.bytes = iter->limit - iter->cur.ptr; iter->cur.wc_valid = false; - /* Whether to set iter->in_shift = false and reset iter->state - or not is not important; the string end is reached anyway. */ + /* Cause the next mbi_avail invocation to return false. */ + iter->in_shift = false; + /* Whether to reset iter->state or not is not important; the + string end is reached anyway. */ } else { @@ -208,7 +210,8 @@ typedef struct mbiter_multi mbi_iterator_t; (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \ (iter).next_done = false) #define mbi_avail(iter) \ - ((iter).cur.ptr < (iter).limit && (mbiter_multi_next (&(iter)), true)) + (((iter).cur.ptr < (iter).limit || (iter).in_shift) \ + && (mbiter_multi_next (&(iter)), true)) #define mbi_advance(iter) \ ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false) diff --git a/lib/mbfile.h b/lib/mbfile.h index 7670dabfbb..ee1dd5a4b9 100644 --- a/lib/mbfile.h +++ b/lib/mbfile.h @@ -77,6 +77,7 @@ struct mbfile_multi { MBFILE_INLINE void mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) { + unsigned int new_bufcount; size_t bytes; /* If EOF has already been seen, don't use getc. This matters if @@ -92,8 +93,14 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) return; } - /* Before using mbrtowc, we need at least one byte. */ - if (mbf->bufcount == 0) + new_bufcount = mbf->bufcount; + + /* If mbf->state is not in the initial state, some more 32-bit wide character + may be hiding in the state. We need to call mbrtoc32 again. */ + if (mbsinit (&mbf->state)) + { + /* Before using mbrtoc32, we need at least one byte. */ + if (new_bufcount == 0) { int c = getc (mbf->fp); if (c == EOF) @@ -102,11 +109,11 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) goto eof; } mbf->buf[0] = (unsigned char) c; - mbf->bufcount++; + new_bufcount++; } - /* Handle most ASCII characters quickly, without calling mbrtowc(). */ - if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0])) + /* Handle most ASCII characters quickly, without calling mbrtoc32(). */ + if (new_bufcount == 1 && is_basic (mbf->buf[0])) { /* These characters are part of the POSIX portable character set. For most of them, namely those in the ISO C basic character set, @@ -121,25 +128,16 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) mbf->bufcount = 0; return; } + } - /* Use mbrtowc on an increasing number of bytes. Read only as many bytes + /* Use mbrtoc32 on an increasing number of bytes. Read only as many bytes from mbf->fp as needed. This is needed to give reasonable interactive behaviour when mbf->fp is connected to an interactive tty. */ for (;;) { - /* We don't know whether the 'mbrtowc' function updates the state when - it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or - not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We - don't have an autoconf test for this, yet. - The new behaviour would allow us to feed the bytes one by one into - mbrtowc. But the old behaviour forces us to feed all bytes since - the end of the last character into mbrtowc. Since we want to retry - with more bytes when mbrtowc returns -2, we must backup the state - before calling mbrtowc, because implementations with the new - behaviour will clobber it. */ - mbstate_t backup_state = mbf->state; - - bytes = mbrtoc32 (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state); + /* Feed the bytes one by one into mbrtoc32. */ + bytes = mbrtoc32 (&mbc->wc, &mbf->buf[mbf->bufcount], new_bufcount - mbf->bufcount, &mbf->state); + mbf->bufcount = new_bufcount; if (bytes == (size_t) -1) { @@ -154,7 +152,6 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) else if (bytes == (size_t) -2) { /* An incomplete multibyte character. */ - mbf->state = backup_state; if (mbf->bufcount == MBCHAR_BUF_SIZE) { /* An overlong incomplete multibyte sequence was encountered. */ @@ -165,18 +162,18 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) } else { - /* Read one more byte and retry mbrtowc. */ + /* Read one more byte and retry mbrtoc32. */ int c = getc (mbf->fp); if (c == EOF) { /* An incomplete multibyte character at the end. */ mbf->eof_seen = true; - bytes = mbf->bufcount; + bytes = new_bufcount; mbc->wc_valid = false; break; } - mbf->buf[mbf->bufcount] = (unsigned char) c; - mbf->bufcount++; + mbf->buf[new_bufcount] = (unsigned char) c; + new_bufcount++; } } else diff --git a/lib/mbmemcasecoll.c b/lib/mbmemcasecoll.c index de7390f6d1..f79f262dc0 100644 --- a/lib/mbmemcasecoll.c +++ b/lib/mbmemcasecoll.c @@ -51,15 +51,36 @@ apply_c32tolower (const char *inbuf, size_t inbufsize, remaining = inbufsize; while (remaining > 0) { - char32_t wc1; - size_t n1; mbstate_t state; memset (&state, '\0', sizeof (mbstate_t)); + do + { + char32_t wc1; + size_t n1; + n1 = mbrtoc32 (&wc1, inbuf, remaining, &state); - if (n1 == (size_t)(-2)) + + if (n1 == (size_t)(-1)) + { + /* Invalid multibyte character on input. + Copy one byte without modification. */ + *outbuf++ = *inbuf++; + remaining -= 1; + break; + } + else if (n1 == (size_t)(-2)) + { + /* Incomplete multibyte sequence on input. + Pass it through unmodified. */ + while (remaining > 0) + { + *outbuf++ = *inbuf++; + remaining -= 1; + } break; - if (n1 != (size_t)(-1)) + } + else { wint_t wc2; @@ -71,39 +92,28 @@ apply_c32tolower (const char *inbuf, size_t inbufsize, wc2 = c32tolower (wc1); if (wc2 != wc1) { + mbstate_t state2; size_t n2; - memset (&state, '\0', sizeof (mbstate_t)); - n2 = c32rtomb (outbuf, wc2, &state); + memset (&state2, '\0', sizeof (mbstate_t)); + n2 = c32rtomb (outbuf, wc2, &state2); if (n2 != (size_t)(-1)) { /* Store the translated multibyte character. */ - inbuf += n1; - remaining -= n1; outbuf += n2; - continue; + goto done_storing; } } /* Nothing to translate. */ memcpy (outbuf, inbuf, n1); + outbuf += n1; + done_storing: inbuf += n1; remaining -= n1; - outbuf += n1; - continue; } - - /* Invalid multibyte character on input. - Copy one byte without modification. */ - *outbuf++ = *inbuf++; - remaining -= 1; } - /* Incomplete multibyte sequence on input. - Pass it through unmodified. */ - while (remaining > 0) - { - *outbuf++ = *inbuf++; - remaining -= 1; + while (! mbsinit (&state)); } /* Verify the output buffer was large enough. */