Given a char `c' which should be the start byte of a utf8 character,
the utf8clen function returns the byte length of the utf8 character.
Before this patch, the utf8clen function would return either:
* 1 if `c' was an ascii character or a utf8 continuation byte
* An int in the range [2, 6] indicating the byte length of the utf8
character
With this patch, the utf8clen function will now return either:
* -1 if `c' is not a valid utf8 start byte (i.e. it is a continuation byte of the form 10xxxxxx)
* The byte length of the utf8 character: 1 for an ascii character,
otherwise the number of leading 1 bits in `c'
I believe returning -1 for continuation bytes makes utf8clen less
error-prone.
The utf8_table4 array is no longer needed and has been removed.
Sahil
Index: src/main/util.c
===================================================================
--- src/main/util.c (revision 72365)
+++ src/main/util.c (working copy)
@@ -1183,18 +1183,23 @@
return TRUE;
}
-/* Number of additional bytes */
-static const unsigned char utf8_table4[] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
-
+/*
+ * If `c' is not a valid utf8 start byte, return -1.
+ * Otherwise, return the number of bytes in the utf8 string with start byte `c'
+ */
int attribute_hidden utf8clen(char c)
{
- /* This allows through 8-bit chars 10xxxxxx, which are invalid */
- if ((c & 0xc0) != 0xc0) return 1;
- return 1 + utf8_table4[c & 0x3f];
+ int n = 0; /* number of leading 1's */
+ int m = 0x80; /* byte mask */
+
+ while (c & m) {
+ ++n;
+ m >>= 1;
+ }
+
+ if (n == 0) return 1; /* an ascii char of the form 0xxxxxxx */
+ else if (n == 1) return -1; /* invalid start byte of the form 10xxxxxx */
+ else return n;
}
/* These return the result in wchar_t, but does assume
Index: src/main/valid_utf8.h
===================================================================
--- src/main/valid_utf8.h (revision 72365)
+++ src/main/valid_utf8.h (working copy)
@@ -75,7 +75,7 @@
if (c < 0xc0) return 1; /* Isolated 10xx xxxx byte */
if (c >= 0xfe) return 1; /* Invalid 0xfe or 0xff bytes */
- ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
+ ab = utf8clen(c) - 1; /* Number of additional bytes */
if (length < ab) return 1;
length -= ab; /* Length remaining */
______________________________________________
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel