Some of the code that uses utf8clen checks the validity of the utf8 string before making the call. However, there were some hairy areas where I felt that the new semantics may cause issues (if not now, then in future changes).

I've attached two patches:
* new_semantics.diff keeps the new semantics and updates those hairy areas above. * old_semantics.diff maintains the old semantics (return 1 even for continuation bytes).

I don't think the new semantics will cause issues, especially with the updates, but we can err on the side of caution and keep the old semantics. I feel that the new semantics provide a clearer interface though (the function expects a start byte and should return an error if a start byte is not supplied).
In either case, the utf8_table4 array has been removed.

Sahil

On 03/19/2017 05:38 AM, Duncan Murdoch wrote:
On 19/03/2017 2:31 AM, Sahil Kang wrote:
Given a char `c' which should be the start byte of a utf8 character,
the utf8clen function returns the byte length of the utf8 character.

Before this patch, the utf8clen function would return either:
     * 1 if `c' was an ascii character or a utf8 continuation byte
     * An int in the range [2, 6] indicating the byte length of the utf8
character

With this patch, the utf8clen function will now return either:
     * -1 if `c' is not a valid utf8 start byte
     * The byte length of the utf8 character (the number of leading 1's,
really)

I believe returning -1 for continuation bytes makes utf8clen less error
prone.
The utf8_table4 array is no longer needed and has been removed.

utf8clen is used internally by R in more than a dozen places, and is likely used in packages as well. Have you checked that this change in semantics won't break any of those uses?

Duncan Murdoch


Index: src/main/util.c
===================================================================
--- src/main/util.c	(revision 72365)
+++ src/main/util.c	(working copy)
@@ -1183,18 +1183,23 @@
     return TRUE;
 }
 
-/* Number of additional bytes */
-static const unsigned char utf8_table4[] = {
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
-
+/*
+ * If `c' is not a valid utf8 start byte, return 1.
+ * Otherwise, return the number of bytes in the utf8 string with start byte `c'
+ */
 int attribute_hidden utf8clen(char c)
 {
-    /* This allows through 8-bit chars 10xxxxxx, which are invalid */
-    if ((c & 0xc0) != 0xc0) return 1;
-    return 1 + utf8_table4[c & 0x3f];
+    int n = 0; /* number of leading 1's */
+    int m = 0x80; /* byte mask */
+
+    while (c & m) {
+        ++n;
+        m >>= 1;
+    }
+
+    if (n == 0) return 1; /* an ascii char of the form 0xxxxxxx */
+    else if (n == 1) return 1; /* invalid start byte of the form 10xxxxxx */
+    else return n;
 }
 
 /* These return the result in wchar_t, but does assume
Index: src/main/valid_utf8.h
===================================================================
--- src/main/valid_utf8.h	(revision 72365)
+++ src/main/valid_utf8.h	(working copy)
@@ -75,7 +75,7 @@
 	if (c < 0xc0) return 1;               /* Isolated 10xx xxxx byte */
 	if (c >= 0xfe) return 1;             /* Invalid 0xfe or 0xff bytes */
 
-	ab = utf8_table4[c & 0x3f];     /* Number of additional bytes */
+	ab = utf8clen(c) - 1;     /* Number of additional bytes */
 	if (length < ab) return 1;
 	length -= ab;                         /* Length remaining */
 
Index: src/main/character.c
===================================================================
--- src/main/character.c	(revision 72365)
+++ src/main/character.c	(working copy)
@@ -276,7 +276,9 @@
     if (ienc == CE_UTF8) {
 	const char *end = str + strlen(str);
 	for (i = 0; i < so && str < end; i++) {
-	    int used = utf8clen(*str);
+	    used = utf8clen(*str);
+	    if (used < 0) used = 1; /* gobble up invalid utf8 start byte */
+
 	    if (i < sa - 1) { str += used; continue; }
 	    for (j = 0; j < used; j++) *buf++ = *str++;
 	}
@@ -459,10 +461,18 @@
     int i, in = 0, out = 0;
 
     if (ienc == CE_UTF8) {
-	for (i = 1; i < sa; i++) buf += utf8clen(*buf);
+	int len;
+	for (i = 1; i < sa; i++) {
+	    len = utf8clen(*buf);
+	    buf += len < 0 ? 1 : len;
+	}
 	for (i = sa; i <= so && in < strlen(str); i++) {
-	    in +=  utf8clen(str[in]);
-	    out += utf8clen(buf[out]);
+	    len = utf8clen(str[in]);
+	    in +=  len < 0 ? 1 : len;
+
+	    len = utf8clen(buf[out]);
+	    out += len < 0 ? 1 : len;
+
 	    if (!str[in]) break;
 	}
 	if (in != out) memmove(buf+in, buf+out, strlen(buf+out)+1);
Index: src/main/connections.c
===================================================================
--- src/main/connections.c	(revision 72365)
+++ src/main/connections.c	(working copy)
@@ -4408,6 +4408,7 @@
 	    if (iread >= nbytes) break;
 	    q = bytes + iread;
 	    clen = utf8clen(*q);
+	    if (clen < 0) clen = 1; /* gobble up invalid utf8 start byte */
 	    if (iread + clen > nbytes)
 		error(_("invalid UTF-8 input in readChar()"));
 	    memcpy(p, q, clen);
Index: src/main/gram.c
===================================================================
--- src/main/gram.c	(revision 72365)
+++ src/main/gram.c	(working copy)
@@ -282,6 +282,7 @@
     }
     if(utf8locale) {
 	clen = utf8clen((char) c);
+	if (clen < 0) clen = 1; /* gobble up invalid utf8 start byte */
 	for(i = 1; i < clen; i++) {
 	    s[i] = (char) xxgetc();
 	    if(s[i] == R_EOF) error(_("EOF whilst reading MBCS char at line %d"), ParseState.xxlineno);
@@ -4578,6 +4579,7 @@
     }
     if(utf8locale) {
 	clen = utf8clen(c);
+	if (clen < 0) clen = 1; /* gobble up invalid utf8 start byte */
 	for(i = 1; i < clen; i++) {
 	    s[i] = xxgetc();
 	    if(s[i] == R_EOF) error(_("EOF whilst reading MBCS char at line %d"), ParseState.xxlineno);
Index: src/main/gram.y
===================================================================
--- src/main/gram.y	(revision 72365)
+++ src/main/gram.y	(working copy)
@@ -217,6 +217,7 @@
     }
     if(utf8locale) {
 	clen = utf8clen((char) c);
+	if (clen < 0) clen = 1; /* gobble up invalid utf8 start byte */
 	for(i = 1; i < clen; i++) {
 	    s[i] = (char) xxgetc();
 	    if(s[i] == R_EOF) error(_("EOF whilst reading MBCS char at line %d"), ParseState.xxlineno);
@@ -2240,6 +2241,7 @@
     }
     if(utf8locale) {
 	clen = utf8clen(c);
+	if (clen < 0) clen = 1; /* gobble up invalid utf8 start byte */
 	for(i = 1; i < clen; i++) {
 	    s[i] = xxgetc();
 	    if(s[i] == R_EOF) error(_("EOF whilst reading MBCS char at line %d"), ParseState.xxlineno);
Index: src/main/grep.c
===================================================================
--- src/main/grep.c	(revision 72365)
+++ src/main/grep.c	(working copy)
@@ -374,12 +374,17 @@
 		    ssize_t nt;  /* need to check error on size_t */
 
 		    if (use_UTF8) {
-			for (ntok = 0; *p; p += used, ntok++)
-			    used = utf8clen(*p);
+		        int len;
+			for (ntok = 0; *p; p += used, ntok++) {
+			    len = utf8clen(*p);
+			    used = len < 0 ? 1 : len;
+			}
 			p = buf;
 			PROTECT(t = allocVector(STRSXP, ntok));
 			for (j = 0; j < ntok; j++, p += used) {
-			    used = utf8clen(*p);
+			    len = utf8clen(*p);
+			    used = len < 0 ? 1 : len;
+
 			    memcpy(bf, p, used); bf[used] = '\0';
 			    SET_STRING_ELT(t, j, mkCharCE(bf, CE_UTF8));
 			}
@@ -837,6 +842,7 @@
 		return i;
 	    }
 	    used = utf8clen(target[ib]);
+	    if (used < 0) used = 1; /* gobble up invalid utf8 start byte */
 	    if (used <= 0) break;
 	    ib += used;
 	}
@@ -884,6 +890,7 @@
 	for (ib = 0, i = 0; ib <= len-plen; i++) {
 	    if (strncmp(pat, target+ib, plen) == 0) return ib;
 	    used = utf8clen(target[ib]);
+	    if (used < 0) used = 1; /* gobble up invalid utf8 start byte */
 	    if (used <= 0) break;
 	    ib += used;
 	}
@@ -1955,7 +1962,7 @@
 		   if (use_UTF8) {
 		       int used, pos = 0;
 		       while( (used = utf8clen(s[pos])) ) {
-			   pos += used;
+			   pos += used < 0 ? 1 : used;
 			   if (pos > offset) {
 			       for (j = offset; j < pos; j++) *u++ = s[j];
 			       offset = pos;
Index: src/main/util.c
===================================================================
--- src/main/util.c	(revision 72365)
+++ src/main/util.c	(working copy)
@@ -1183,18 +1183,23 @@
     return TRUE;
 }
 
-/* Number of additional bytes */
-static const unsigned char utf8_table4[] = {
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
-
+/*
+ * If `c' is not a valid utf8 start byte, return -1.
+ * Otherwise, return the number of bytes in the utf8 string with start byte `c'
+ */
 int attribute_hidden utf8clen(char c)
 {
-    /* This allows through 8-bit chars 10xxxxxx, which are invalid */
-    if ((c & 0xc0) != 0xc0) return 1;
-    return 1 + utf8_table4[c & 0x3f];
+    int n = 0; /* number of leading 1's */
+    int m = 0x80; /* byte mask */
+
+    while (c & m) {
+        ++n;
+        m >>= 1;
+    }
+
+    if (n == 0) return 1; /* an ascii char of the form 0xxxxxxx */
+    else if (n == 1) return -1; /* invalid start byte of the form 10xxxxxx */
+    else return n;
 }
 
 /* These return the result in wchar_t, but does assume
Index: src/main/valid_utf8.h
===================================================================
--- src/main/valid_utf8.h	(revision 72365)
+++ src/main/valid_utf8.h	(working copy)
@@ -75,7 +75,7 @@
 	if (c < 0xc0) return 1;               /* Isolated 10xx xxxx byte */
 	if (c >= 0xfe) return 1;             /* Invalid 0xfe or 0xff bytes */
 
-	ab = utf8_table4[c & 0x3f];     /* Number of additional bytes */
+	ab = utf8clen(c) - 1;     /* Number of additional bytes */
 	if (length < ab) return 1;
 	length -= ab;                         /* Length remaining */
 
______________________________________________
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel

Reply via email to