Some of the code that uses utf8clen checks the validity of the utf8
string before making the call.
However, there were some hairy areas where I felt that the new semantics
may cause issues (if not now, then in future changes).
I've attached two patches:
* new_semantics.diff keeps the new semantics and updates those
hairy areas above.
* old_semantics.diff maintains the old semantics (return 1 even for
continuation bytes).
I don't think the new semantics will cause issues, especially with the
updates, but we can err on the side of caution and keep the old
semantics. I feel that the new semantics provide a clearer interface
though (the function expects a start byte and should return an error if
a start byte is not supplied).
In either case, the utf8_table4 array has been removed.
Sahil
On 03/19/2017 05:38 AM, Duncan Murdoch wrote:
On 19/03/2017 2:31 AM, Sahil Kang wrote:
Given a char `c' which should be the start byte of a utf8 character,
the utf8clen function returns the byte length of the utf8 character.
Before this patch, the utf8clen function would return either:
* 1 if `c' was an ascii character or a utf8 continuation byte
* An int in the range [2, 6] indicating the byte length of the utf8
character
With this patch, the utf8clen function will now return either:
* -1 if `c' is not a valid utf8 start byte
* The byte length of the utf8 character (the number of leading 1's,
really)
I believe returning -1 for continuation bytes makes utf8clen less error
prone.
The utf8_table4 array is no longer needed and has been removed.
utf8clen is used internally by R in more than a dozen places, and is
likely used in packages as well. Have you checked that this change in
semantics won't break any of those uses?
Duncan Murdoch
Index: src/main/util.c
===================================================================
--- src/main/util.c (revision 72365)
+++ src/main/util.c (working copy)
@@ -1183,18 +1183,23 @@
return TRUE;
}
-/* Number of additional bytes */
-static const unsigned char utf8_table4[] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
-
+/*
+ * If `c' is not a valid utf8 start byte, return 1.
+ * Otherwise, return the number of bytes in the utf8 string with start byte `c'
+ */
int attribute_hidden utf8clen(char c)
{
- /* This allows through 8-bit chars 10xxxxxx, which are invalid */
- if ((c & 0xc0) != 0xc0) return 1;
- return 1 + utf8_table4[c & 0x3f];
+ int n = 0; /* number of leading 1's */
+ int m = 0x80; /* byte mask */
+
+ while (c & m) {
+ ++n;
+ m >>= 1;
+ }
+
+ if (n == 0) return 1; /* an ascii char of the form 0xxxxxxx */
+ else if (n == 1) return 1; /* invalid start byte of the form 10xxxxxx */
+ else return n;
}
/* These return the result in wchar_t, but does assume
Index: src/main/valid_utf8.h
===================================================================
--- src/main/valid_utf8.h (revision 72365)
+++ src/main/valid_utf8.h (working copy)
@@ -75,7 +75,7 @@
if (c < 0xc0) return 1; /* Isolated 10xx xxxx byte */
if (c >= 0xfe) return 1; /* Invalid 0xfe or 0xff bytes */
- ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
+ ab = utf8clen(c) - 1; /* Number of additional bytes */
if (length < ab) return 1;
length -= ab; /* Length remaining */
Index: src/main/character.c
===================================================================
--- src/main/character.c (revision 72365)
+++ src/main/character.c (working copy)
@@ -276,7 +276,9 @@
if (ienc == CE_UTF8) {
const char *end = str + strlen(str);
for (i = 0; i < so && str < end; i++) {
- int used = utf8clen(*str);
+ used = utf8clen(*str);
+ if (used < 0) used = 1; /* gobble up invalid utf8 start byte */
+
if (i < sa - 1) { str += used; continue; }
for (j = 0; j < used; j++) *buf++ = *str++;
}
@@ -459,10 +461,18 @@
int i, in = 0, out = 0;
if (ienc == CE_UTF8) {
- for (i = 1; i < sa; i++) buf += utf8clen(*buf);
+ int len;
+ for (i = 1; i < sa; i++) {
+ len = utf8clen(*buf);
+ buf += len < 0 ? 1 : len;
+ }
for (i = sa; i <= so && in < strlen(str); i++) {
- in += utf8clen(str[in]);
- out += utf8clen(buf[out]);
+ len = utf8clen(str[in]);
+ in += len < 0 ? 1 : len;
+
+ len = utf8clen(buf[out]);
+ out += len < 0 ? 1 : len;
+
if (!str[in]) break;
}
if (in != out) memmove(buf+in, buf+out, strlen(buf+out)+1);
Index: src/main/connections.c
===================================================================
--- src/main/connections.c (revision 72365)
+++ src/main/connections.c (working copy)
@@ -4408,6 +4408,7 @@
if (iread >= nbytes) break;
q = bytes + iread;
clen = utf8clen(*q);
+ if (clen < 0) clen = 1; /* gobble up invalid utf8 start byte */
if (iread + clen > nbytes)
error(_("invalid UTF-8 input in readChar()"));
memcpy(p, q, clen);
Index: src/main/gram.c
===================================================================
--- src/main/gram.c (revision 72365)
+++ src/main/gram.c (working copy)
@@ -282,6 +282,7 @@
}
if(utf8locale) {
clen = utf8clen((char) c);
+ if (clen < 0) clen = 1; /* gobble up invalid utf8 start byte */
for(i = 1; i < clen; i++) {
s[i] = (char) xxgetc();
if(s[i] == R_EOF) error(_("EOF whilst reading MBCS char at line %d"), ParseState.xxlineno);
@@ -4578,6 +4579,7 @@
}
if(utf8locale) {
clen = utf8clen(c);
+ if (clen < 0) clen = 1; /* gobble up invalid utf8 start byte */
for(i = 1; i < clen; i++) {
s[i] = xxgetc();
if(s[i] == R_EOF) error(_("EOF whilst reading MBCS char at line %d"), ParseState.xxlineno);
Index: src/main/gram.y
===================================================================
--- src/main/gram.y (revision 72365)
+++ src/main/gram.y (working copy)
@@ -217,6 +217,7 @@
}
if(utf8locale) {
clen = utf8clen((char) c);
+ if (clen < 0) clen = 1; /* gobble up invalid utf8 start byte */
for(i = 1; i < clen; i++) {
s[i] = (char) xxgetc();
if(s[i] == R_EOF) error(_("EOF whilst reading MBCS char at line %d"), ParseState.xxlineno);
@@ -2240,6 +2241,7 @@
}
if(utf8locale) {
clen = utf8clen(c);
+ if (clen < 0) clen = 1; /* gobble up invalid utf8 start byte */
for(i = 1; i < clen; i++) {
s[i] = xxgetc();
if(s[i] == R_EOF) error(_("EOF whilst reading MBCS char at line %d"), ParseState.xxlineno);
Index: src/main/grep.c
===================================================================
--- src/main/grep.c (revision 72365)
+++ src/main/grep.c (working copy)
@@ -374,12 +374,17 @@
ssize_t nt; /* need to check error on size_t */
if (use_UTF8) {
- for (ntok = 0; *p; p += used, ntok++)
- used = utf8clen(*p);
+ int len;
+ for (ntok = 0; *p; p += used, ntok++) {
+ len = utf8clen(*p);
+ used = len < 0 ? 1 : len;
+ }
p = buf;
PROTECT(t = allocVector(STRSXP, ntok));
for (j = 0; j < ntok; j++, p += used) {
- used = utf8clen(*p);
+ len = utf8clen(*p);
+ used = len < 0 ? 1 : len;
+
memcpy(bf, p, used); bf[used] = '\0';
SET_STRING_ELT(t, j, mkCharCE(bf, CE_UTF8));
}
@@ -837,6 +842,7 @@
return i;
}
used = utf8clen(target[ib]);
+ if (used < 0) used = 1; /* gobble up invalid utf8 start byte */
if (used <= 0) break;
ib += used;
}
@@ -884,6 +890,7 @@
for (ib = 0, i = 0; ib <= len-plen; i++) {
if (strncmp(pat, target+ib, plen) == 0) return ib;
used = utf8clen(target[ib]);
+ if (used < 0) used = 1; /* gobble up invalid utf8 start byte */
if (used <= 0) break;
ib += used;
}
@@ -1955,7 +1962,7 @@
if (use_UTF8) {
int used, pos = 0;
while( (used = utf8clen(s[pos])) ) {
- pos += used;
+ pos += used < 0 ? 1 : used;
if (pos > offset) {
for (j = offset; j < pos; j++) *u++ = s[j];
offset = pos;
Index: src/main/util.c
===================================================================
--- src/main/util.c (revision 72365)
+++ src/main/util.c (working copy)
@@ -1183,18 +1183,23 @@
return TRUE;
}
-/* Number of additional bytes */
-static const unsigned char utf8_table4[] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
-
+/*
+ * If `c' is not a valid utf8 start byte, return -1.
+ * Otherwise, return the number of bytes in the utf8 string with start byte `c'
+ */
int attribute_hidden utf8clen(char c)
{
- /* This allows through 8-bit chars 10xxxxxx, which are invalid */
- if ((c & 0xc0) != 0xc0) return 1;
- return 1 + utf8_table4[c & 0x3f];
+ int n = 0; /* number of leading 1's */
+ int m = 0x80; /* byte mask */
+
+ while (c & m) {
+ ++n;
+ m >>= 1;
+ }
+
+ if (n == 0) return 1; /* an ascii char of the form 0xxxxxxx */
+ else if (n == 1) return -1; /* invalid start byte of the form 10xxxxxx */
+ else return n;
}
/* These return the result in wchar_t, but does assume
Index: src/main/valid_utf8.h
===================================================================
--- src/main/valid_utf8.h (revision 72365)
+++ src/main/valid_utf8.h (working copy)
@@ -75,7 +75,7 @@
if (c < 0xc0) return 1; /* Isolated 10xx xxxx byte */
if (c >= 0xfe) return 1; /* Invalid 0xfe or 0xff bytes */
- ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
+ ab = utf8clen(c) - 1; /* Number of additional bytes */
if (length < ab) return 1;
length -= ab; /* Length remaining */
______________________________________________
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel