src/hb-buffer-private.hh             |    1
src/hb-buffer.cc                     |   79 ++++++++-
src/hb-buffer.h                      |   23 ++
src/hb-ot-layout-gsubgpos-private.hh |   25 --
src/hb-utf-private.hh                |  306 ++++++++++++++++++-----------------
test/api/test-buffer.c               |   65 +++++++
6 files changed, 328 insertions(+), 171 deletions(-)
New commits: commit 976c8f455221eb599d1c446eafd88d51d7d2aa65 Author: Behdad Esfahbod <[email protected]> Date: Wed Jul 16 15:34:20 2014 -0400 New API: hb_buffer_[sg]et_replacement_codepoint() With this change, we now by default replace broken UTF-8/16/32 bits with U+FFFD. This can be changed by calling new API on the buffer. Previously the replacement value used to be (hb_codepoint_t)-1. Note that hb_buffer_clear_contents() does NOT reset the replacement character. See discussion here: https://github.com/behdad/harfbuzz/commit/6f13b6d62daae4989e3cc2fe4b168e5c59650964 New API: hb_buffer_set_replacement_codepoint() hb_buffer_get_replacement_codepoint() diff --git a/src/hb-buffer-private.hh b/src/hb-buffer-private.hh index 3a2b9ab..5eccd3c 100644 --- a/src/hb-buffer-private.hh +++ b/src/hb-buffer-private.hh @@ -52,6 +52,7 @@ struct hb_buffer_t { hb_unicode_funcs_t *unicode; /* Unicode functions */ hb_segment_properties_t props; /* Script, language, direction */ hb_buffer_flags_t flags; /* BOT / EOT / etc. */ + hb_codepoint_t replacement; /* U+FFFD or something else. */ /* Buffer contents */ diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc index 242cded..2377ba4 100644 --- a/src/hb-buffer.cc +++ b/src/hb-buffer.cc @@ -178,6 +178,7 @@ hb_buffer_t::reset (void) hb_unicode_funcs_destroy (unicode); unicode = hb_unicode_funcs_get_default (); + replacement = HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT; clear (); } @@ -703,6 +704,7 @@ hb_buffer_get_empty (void) const_cast<hb_unicode_funcs_t *> (&_hb_unicode_funcs_nil), HB_SEGMENT_PROPERTIES_DEFAULT, HB_BUFFER_FLAG_DEFAULT, + HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT, HB_BUFFER_CONTENT_TYPE_INVALID, true, /* in_error */ @@ -1048,6 +1050,42 @@ hb_buffer_get_flags (hb_buffer_t *buffer) /** + * hb_buffer_set_replacement_codepoint: + * @buffer: a buffer. 
+ * @replacement: + * + * + * + * Since: 1.0 + **/ +void +hb_buffer_set_replacement_codepoint (hb_buffer_t *buffer, + hb_codepoint_t replacement) +{ + if (unlikely (hb_object_is_inert (buffer))) + return; + + buffer->replacement = replacement; +} + +/** + * hb_buffer_get_replacement_codepoint: + * @buffer: a buffer. + * + * + * + * Return value: + * + * Since: 1.0 + **/ +hb_codepoint_t +hb_buffer_get_replacement_codepoint (hb_buffer_t *buffer) +{ + return buffer->replacement; +} + + +/** * hb_buffer_reset: * @buffer: a buffer. * @@ -1299,6 +1337,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, int item_length) { typedef hb_utf_t<T, true> utf_t; + const hb_codepoint_t replacement = buffer->replacement; assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE || (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID)); @@ -1330,7 +1369,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, while (start < prev && buffer->context_len[0] < buffer->CONTEXT_LENGTH) { hb_codepoint_t u; - prev = utf_t::prev (prev, start, &u); + prev = utf_t::prev (prev, start, &u, replacement); buffer->context[0][buffer->context_len[0]++] = u; } } @@ -1341,7 +1380,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, { hb_codepoint_t u; const T *old_next = next; - next = utf_t::next (next, end, &u); + next = utf_t::next (next, end, &u, replacement); buffer->add (u, old_next - (const T *) text); } @@ -1351,7 +1390,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, while (next < end && buffer->context_len[1] < buffer->CONTEXT_LENGTH) { hb_codepoint_t u; - next = utf_t::next (next, end, &u); + next = utf_t::next (next, end, &u, replacement); buffer->context[1][buffer->context_len[1]++] = u; } diff --git a/src/hb-buffer.h b/src/hb-buffer.h index 777c3d9..7b0c920 100644 --- a/src/hb-buffer.h +++ b/src/hb-buffer.h @@ -186,12 +186,25 @@ hb_buffer_flags_t hb_buffer_get_flags (hb_buffer_t *buffer); + +#define HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT 0xFFFDu + +/* Sets codepoint used to replace invalid 
UTF-8/16/32 entries. + * Default is 0xFFFDu. */ +void +hb_buffer_set_replacement_codepoint (hb_buffer_t *buffer, + hb_codepoint_t replacement); + +hb_codepoint_t +hb_buffer_get_replacement_codepoint (hb_buffer_t *buffer); + + /* Resets the buffer. Afterwards it's as if it was just created, * except that it has a larger buffer allocated perhaps... */ void hb_buffer_reset (hb_buffer_t *buffer); -/* Like reset, but does NOT clear unicode_funcs. */ +/* Like reset, but does NOT clear unicode_funcs and replacement_codepoint. */ void hb_buffer_clear_contents (hb_buffer_t *buffer); diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh index cbacd67..68216c4 100644 --- a/src/hb-utf-private.hh +++ b/src/hb-utf-private.hh @@ -40,7 +40,8 @@ struct hb_utf_t<uint8_t, true> static inline const uint8_t * next (const uint8_t *text, const uint8_t *end, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { /* Written to only accept well-formed sequences. * Based on ideas from ICU's U8_NEXT. @@ -101,23 +102,24 @@ struct hb_utf_t<uint8_t, true> return text; error: - *unicode = -1; + *unicode = replacement; return text; } static inline const uint8_t * prev (const uint8_t *text, const uint8_t *start, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { const uint8_t *end = text--; while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) text--; - if (likely (next (text, end, unicode) == end)) + if (likely (next (text, end, unicode, replacement) == end)) return text; - *unicode = -1; + *unicode = replacement; return end - 1; } @@ -137,7 +139,8 @@ struct hb_utf_t<uint16_t, true> static inline const uint16_t * next (const uint16_t *text, const uint16_t *end, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { hb_codepoint_t c = *text++; @@ -161,14 +164,15 @@ struct hb_utf_t<uint16_t, true> } /* Lonely / out-of-order surrogate. 
*/ - *unicode = -1; + *unicode = replacement; return text; } static inline const uint16_t * prev (const uint16_t *text, const uint16_t *start, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { const uint16_t *end = text--; hb_codepoint_t c = *text; @@ -182,10 +186,10 @@ struct hb_utf_t<uint16_t, true> if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu))) text--; - if (likely (next (text, end, unicode) == end)) + if (likely (next (text, end, unicode, replacement) == end)) return text; - *unicode = -1; + *unicode = replacement; return end - 1; } @@ -208,7 +212,8 @@ struct hb_utf_t<uint32_t, validate> static inline const uint32_t * next (const uint32_t *text, const uint32_t *end HB_UNUSED, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { hb_codepoint_t c = *text++; if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu))) @@ -217,16 +222,17 @@ struct hb_utf_t<uint32_t, validate> return text; error: - *unicode = -1; + *unicode = replacement; return text; } static inline const uint32_t * prev (const uint32_t *text, const uint32_t *start HB_UNUSED, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { - next (text - 1, text, unicode); + next (text - 1, text, unicode, replacement); return text - 1; } diff --git a/test/api/test-buffer.c b/test/api/test-buffer.c index 1be6931..af73c3f 100644 --- a/test/api/test-buffer.c +++ b/test/api/test-buffer.c @@ -374,6 +374,7 @@ test_buffer_utf8_conversion (void) unsigned int bytes, chars, i, j, len; b = hb_buffer_create (); + hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1); for (i = 0; i < G_N_ELEMENTS (utf8_conversion_tests); i++) { @@ -388,7 +389,7 @@ test_buffer_utf8_conversion (void) for (chars = 0; test->codepoints[chars]; chars++) ; - hb_buffer_reset (b); + hb_buffer_clear_contents (b); hb_buffer_add_utf8 (b, test->utf8, bytes, 1, bytes - 2); glyphs = hb_buffer_get_glyph_infos (b, 
&len); @@ -660,6 +661,7 @@ test_buffer_utf8_validity (void) unsigned int i; b = hb_buffer_create (); + hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1); for (i = 0; i < G_N_ELEMENTS (utf8_validity_tests); i++) { @@ -678,7 +680,7 @@ test_buffer_utf8_validity (void) else segment_bytes = test->max_len; - hb_buffer_reset (b); + hb_buffer_clear_contents (b); hb_buffer_add_utf8 (b, test->utf8, text_bytes, 0, segment_bytes); glyphs = hb_buffer_get_glyph_infos (b, &len); @@ -718,6 +720,7 @@ test_buffer_utf16_conversion (void) unsigned int i; b = hb_buffer_create (); + hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1); for (i = 0; i < G_N_ELEMENTS (utf16_conversion_tests); i++) { @@ -732,7 +735,7 @@ test_buffer_utf16_conversion (void) for (chars = 0; test->codepoints[chars]; chars++) ; - hb_buffer_reset (b); + hb_buffer_clear_contents (b); hb_buffer_add_utf16 (b, test->utf16, u_len, 1, u_len - 2); glyphs = hb_buffer_get_glyph_infos (b, &len); @@ -752,15 +755,15 @@ typedef struct { /* note: we skip the first and last item from utf32 when adding to buffer */ static const utf32_conversion_test_t utf32_conversion_tests[] = { - {{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -1, -1}}, + {{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -3, -3}}, {{0x41, 0x004D, 0x0430, 0x4E8C, 0x10302, 0x61} , {0x004D, 0x0430, 0x4E8C, 0x10302}}, - {{0x41, 0xD800, 0xDF02, 0x61}, {-1, -1}}, - {{0x41, 0xD800, 0xDF02}, {-1}}, - {{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -1}}, - {{0x41, 0xD800, 0x61, 0xDF02}, {-1, 0x61}}, - {{0x41, 0xDF00, 0x61}, {-1}}, + {{0x41, 0xD800, 0xDF02, 0x61}, {-3, -3}}, + {{0x41, 0xD800, 0xDF02}, {-3}}, + {{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -3}}, + {{0x41, 0xD800, 0x61, 0xDF02}, {-3, 0x61}}, + {{0x41, 0xDF00, 0x61}, {-3}}, {{0x41, 0x10FFFF, 0x61}, {0x10FFFF}}, - {{0x41, 0x110000, 0x61}, {-1}}, + {{0x41, 0x110000, 0x61}, {-3}}, {{0x41, 0x61}, {0}} }; @@ -771,6 +774,7 @@ 
test_buffer_utf32_conversion (void) unsigned int i; b = hb_buffer_create (); + hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -3); for (i = 0; i < G_N_ELEMENTS (utf32_conversion_tests); i++) { @@ -785,7 +789,7 @@ test_buffer_utf32_conversion (void) for (chars = 0; test->codepoints[chars]; chars++) ; - hb_buffer_reset (b); + hb_buffer_clear_contents (b); hb_buffer_add_utf32 (b, test->utf32, u_len, 1, u_len - 2); glyphs = hb_buffer_get_glyph_infos (b, &len); commit bcba8b45024e1eca8be77ca2657de1dc44dbf8fb Author: Behdad Esfahbod <[email protected]> Date: Wed Jul 16 14:59:04 2014 -0400 New API hb_buffer_add_codepoints() Like hb_buffer_add_utf32, but doesn't do any Unicode validation. This is like what hb_buffer_add_utf32 used to be until a couple commits ago. diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc index d920552..242cded 100644 --- a/src/hb-buffer.cc +++ b/src/hb-buffer.cc @@ -1290,7 +1290,7 @@ hb_buffer_guess_segment_properties (hb_buffer_t *buffer) buffer->guess_segment_properties (); } -template <typename T> +template <bool validate, typename T> static inline void hb_buffer_add_utf (hb_buffer_t *buffer, const T *text, @@ -1298,7 +1298,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, unsigned int item_offset, int item_length) { - typedef hb_utf_t<T> utf_t; + typedef hb_utf_t<T, true> utf_t; assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE || (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID)); @@ -1377,7 +1377,7 @@ hb_buffer_add_utf8 (hb_buffer_t *buffer, unsigned int item_offset, int item_length) { - hb_buffer_add_utf (buffer, (const uint8_t *) text, text_length, item_offset, item_length); + hb_buffer_add_utf<true> (buffer, (const uint8_t *) text, text_length, item_offset, item_length); } /** @@ -1399,7 +1399,7 @@ hb_buffer_add_utf16 (hb_buffer_t *buffer, unsigned int item_offset, int item_length) { - hb_buffer_add_utf (buffer, text, text_length, item_offset, item_length); + hb_buffer_add_utf<true> (buffer, text, 
text_length, item_offset, item_length); } /** @@ -1421,7 +1421,29 @@ hb_buffer_add_utf32 (hb_buffer_t *buffer, unsigned int item_offset, int item_length) { - hb_buffer_add_utf (buffer, text, text_length, item_offset, item_length); + hb_buffer_add_utf<true> (buffer, text, text_length, item_offset, item_length); +} + +/** + * hb_buffer_add_codepoints: + * @buffer: a buffer. + * @text: (array length=text_length): + * @text_length: + * @item_offset: + * @item_length: + * + * + * + * Since: 1.0 + **/ +void +hb_buffer_add_codepoints (hb_buffer_t *buffer, + const hb_codepoint_t *text, + int text_length, + unsigned int item_offset, + int item_length) +{ + hb_buffer_add_utf<false> (buffer, text, text_length, item_offset, item_length); } diff --git a/src/hb-buffer.h b/src/hb-buffer.h index 3086851..777c3d9 100644 --- a/src/hb-buffer.h +++ b/src/hb-buffer.h @@ -240,6 +240,14 @@ hb_buffer_add_utf32 (hb_buffer_t *buffer, unsigned int item_offset, int item_length); +/* Like add_utf32 but does NOT check for invalid Unicode codepoints. 
*/ +void +hb_buffer_add_codepoints (hb_buffer_t *buffer, + const hb_codepoint_t *text, + int text_length, + unsigned int item_offset, + int item_length); + /* Clears any new items added at the end */ hb_bool_t commit 625dbf141a05f1ae81a7b8cbc529996370101284 Author: Behdad Esfahbod <[email protected]> Date: Wed Jul 16 14:49:55 2014 -0400 [buffer] Templatize UTF-* functions diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc index 76bb10c..d920552 100644 --- a/src/hb-buffer.cc +++ b/src/hb-buffer.cc @@ -1298,6 +1298,8 @@ hb_buffer_add_utf (hb_buffer_t *buffer, unsigned int item_offset, int item_length) { + typedef hb_utf_t<T> utf_t; + assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE || (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID)); @@ -1305,7 +1307,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, return; if (text_length == -1) - text_length = hb_utf_strlen (text); + text_length = utf_t::strlen (text); if (item_length == -1) item_length = text_length - item_offset; @@ -1328,7 +1330,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, while (start < prev && buffer->context_len[0] < buffer->CONTEXT_LENGTH) { hb_codepoint_t u; - prev = hb_utf_prev (prev, start, &u); + prev = utf_t::prev (prev, start, &u); buffer->context[0][buffer->context_len[0]++] = u; } } @@ -1339,7 +1341,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, { hb_codepoint_t u; const T *old_next = next; - next = hb_utf_next (next, end, &u); + next = utf_t::next (next, end, &u); buffer->add (u, old_next - (const T *) text); } @@ -1349,7 +1351,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, while (next < end && buffer->context_len[1] < buffer->CONTEXT_LENGTH) { hb_codepoint_t u; - next = hb_utf_next (next, end, &u); + next = utf_t::next (next, end, &u); buffer->context[1][buffer->context_len[1]++] = u; } diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh index 398f73c..cbacd67 100644 --- a/src/hb-utf-private.hh +++ b/src/hb-utf-private.hh @@ -29,202 +29,215 @@ #include 
"hb-private.hh" +template <typename T, bool validate=true> struct hb_utf_t; + /* UTF-8 */ -static inline const uint8_t * -hb_utf_next (const uint8_t *text, - const uint8_t *end, - hb_codepoint_t *unicode) +template <> +struct hb_utf_t<uint8_t, true> { - /* Written to only accept well-formed sequences. - * Based on ideas from ICU's U8_NEXT. - * Generates a -1 for each ill-formed byte. */ + static inline const uint8_t * + next (const uint8_t *text, + const uint8_t *end, + hb_codepoint_t *unicode) + { + /* Written to only accept well-formed sequences. + * Based on ideas from ICU's U8_NEXT. + * Generates a -1 for each ill-formed byte. */ - hb_codepoint_t c = *text++; + hb_codepoint_t c = *text++; - if (c > 0x7Fu) - { - if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */ + if (c > 0x7Fu) { - unsigned int t1; - if (likely (text < end && - (t1 = text[0] - 0x80u) <= 0x3Fu)) + if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */ { - c = ((c&0x1Fu)<<6) | t1; - text++; + unsigned int t1; + if (likely (text < end && + (t1 = text[0] - 0x80u) <= 0x3Fu)) + { + c = ((c&0x1Fu)<<6) | t1; + text++; + } + else + goto error; } - else - goto error; - } - else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */ - { - unsigned int t1, t2; - if (likely (1 < end - text && - (t1 = text[0] - 0x80u) <= 0x3Fu && - (t2 = text[1] - 0x80u) <= 0x3Fu)) + else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */ { - c = ((c&0xFu)<<12) | (t1<<6) | t2; - if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu))) + unsigned int t1, t2; + if (likely (1 < end - text && + (t1 = text[0] - 0x80u) <= 0x3Fu && + (t2 = text[1] - 0x80u) <= 0x3Fu)) + { + c = ((c&0xFu)<<12) | (t1<<6) | t2; + if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu))) + goto error; + text += 2; + } + else goto error; - text += 2; } - else - goto error; - } - else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */ - { - unsigned int t1, t2, t3; - if (likely (2 < end - text && - (t1 = text[0] - 0x80u) <= 0x3Fu && - (t2 = text[1] 
- 0x80u) <= 0x3Fu && - (t3 = text[2] - 0x80u) <= 0x3Fu)) + else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */ { - c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; - if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu))) + unsigned int t1, t2, t3; + if (likely (2 < end - text && + (t1 = text[0] - 0x80u) <= 0x3Fu && + (t2 = text[1] - 0x80u) <= 0x3Fu && + (t3 = text[2] - 0x80u) <= 0x3Fu)) + { + c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; + if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu))) + goto error; + text += 3; + } + else goto error; - text += 3; } else goto error; } - else - goto error; - } - - *unicode = c; - return text; - -error: - *unicode = -1; - return text; -} -static inline const uint8_t * -hb_utf_prev (const uint8_t *text, - const uint8_t *start, - hb_codepoint_t *unicode) -{ - const uint8_t *end = text--; - while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) - text--; + *unicode = c; + return text; - if (likely (hb_utf_next (text, end, unicode) == end)) + error: + *unicode = -1; return text; + } - *unicode = -1; - return end - 1; -} + static inline const uint8_t * + prev (const uint8_t *text, + const uint8_t *start, + hb_codepoint_t *unicode) + { + const uint8_t *end = text--; + while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) + text--; + if (likely (next (text, end, unicode) == end)) + return text; -static inline unsigned int -hb_utf_strlen (const uint8_t *text) -{ - return strlen ((const char *) text); -} + *unicode = -1; + return end - 1; + } + + static inline unsigned int + strlen (const uint8_t *text) + { + return ::strlen ((const char *) text); + } +}; /* UTF-16 */ -static inline const uint16_t * -hb_utf_next (const uint16_t *text, - const uint16_t *end, - hb_codepoint_t *unicode) +template <> +struct hb_utf_t<uint16_t, true> { - hb_codepoint_t c = *text++; - - if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) + static inline const uint16_t * + next (const uint16_t *text, + const uint16_t *end, + hb_codepoint_t 
*unicode) { - *unicode = c; + hb_codepoint_t c = *text++; + + if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) + { + *unicode = c; + return text; + } + + if (likely (hb_in_range (c, 0xD800u, 0xDBFFu))) + { + /* High-surrogate in c */ + hb_codepoint_t l; + if (text < end && ((l = *text), likely (hb_in_range (l, 0xDC00u, 0xDFFFu)))) + { + /* Low-surrogate in l */ + *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); + text++; + return text; + } + } + + /* Lonely / out-of-order surrogate. */ + *unicode = -1; return text; } - if (likely (hb_in_range (c, 0xD800u, 0xDBFFu))) + static inline const uint16_t * + prev (const uint16_t *text, + const uint16_t *start, + hb_codepoint_t *unicode) { - /* High-surrogate in c */ - hb_codepoint_t l; - if (text < end && ((l = *text), likely (hb_in_range (l, 0xDC00u, 0xDFFFu)))) + const uint16_t *end = text--; + hb_codepoint_t c = *text; + + if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) { - /* Low-surrogate in l */ - *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); - text++; - return text; + *unicode = c; + return text; } - } - /* Lonely / out-of-order surrogate. 
*/ - *unicode = -1; - return text; -} + if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu))) + text--; -static inline const uint16_t * -hb_utf_prev (const uint16_t *text, - const uint16_t *start, - hb_codepoint_t *unicode) -{ - const uint16_t *end = text--; - hb_codepoint_t c = *text; + if (likely (next (text, end, unicode) == end)) + return text; - if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) - { - *unicode = c; - return text; + *unicode = -1; + return end - 1; } - if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu))) - text--; - if (likely (hb_utf_next (text, end, unicode) == end)) - return text; + static inline unsigned int + strlen (const uint16_t *text) + { + unsigned int l = 0; + while (*text++) l++; + return l; + } +}; - *unicode = -1; - return end - 1; -} +/* UTF-32 */ -static inline unsigned int -hb_utf_strlen (const uint16_t *text) +template <bool validate> +struct hb_utf_t<uint32_t, validate> { - unsigned int l = 0; - while (*text++) l++; - return l; -} - + static inline const uint32_t * + next (const uint32_t *text, + const uint32_t *end HB_UNUSED, + hb_codepoint_t *unicode) + { + hb_codepoint_t c = *text++; + if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu))) + goto error; + *unicode = c; + return text; -/* UTF-32 */ + error: + *unicode = -1; + return text; + } -static inline const uint32_t * -hb_utf_next (const uint32_t *text, - const uint32_t *end HB_UNUSED, - hb_codepoint_t *unicode) -{ - hb_codepoint_t c = *text++; - if (unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu))) - goto error; - *unicode = c; - return text; - -error: - *unicode = -1; - return text; -} - -static inline const uint32_t * -hb_utf_prev (const uint32_t *text, - const uint32_t *start HB_UNUSED, - hb_codepoint_t *unicode) -{ - hb_utf_next (text - 1, text, unicode); - return text - 1; -} + static inline const uint32_t * + prev (const uint32_t *text, + const uint32_t *start HB_UNUSED, + hb_codepoint_t *unicode) + { + 
next (text - 1, text, unicode); + return text - 1; + } -static inline unsigned int -hb_utf_strlen (const uint32_t *text) -{ - unsigned int l = 0; - while (*text++) l++; - return l; -} + static inline unsigned int + strlen (const uint32_t *text) + { + unsigned int l = 0; + while (*text++) l++; + return l; + } +}; #endif /* HB_UTF_PRIVATE_HH */ commit e634fed4285ce440d277345727ed01757df6d779 Author: Behdad Esfahbod <[email protected]> Date: Wed Jul 16 14:17:26 2014 -0400 [buffer] Validate UTF-32 input Same as what we do for UTF-8 and UTF-16. diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh index a4c6236..398f73c 100644 --- a/src/hb-utf-private.hh +++ b/src/hb-utf-private.hh @@ -198,7 +198,14 @@ hb_utf_next (const uint32_t *text, const uint32_t *end HB_UNUSED, hb_codepoint_t *unicode) { - *unicode = *text++; + hb_codepoint_t c = *text++; + if (unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu))) + goto error; + *unicode = c; + return text; + +error: + *unicode = -1; return text; } @@ -207,8 +214,8 @@ hb_utf_prev (const uint32_t *text, const uint32_t *start HB_UNUSED, hb_codepoint_t *unicode) { - *unicode = *--text; - return text; + hb_utf_next (text - 1, text, unicode); + return text - 1; } static inline unsigned int diff --git a/test/api/test-buffer.c b/test/api/test-buffer.c index 1956c92..1be6931 100644 --- a/test/api/test-buffer.c +++ b/test/api/test-buffer.c @@ -744,6 +744,60 @@ test_buffer_utf16_conversion (void) hb_buffer_destroy (b); } + +typedef struct { + const uint32_t utf32[8]; + const uint32_t codepoints[8]; +} utf32_conversion_test_t; + +/* note: we skip the first and last item from utf32 when adding to buffer */ +static const utf32_conversion_test_t utf32_conversion_tests[] = { + {{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -1, -1}}, + {{0x41, 0x004D, 0x0430, 0x4E8C, 0x10302, 0x61} , {0x004D, 0x0430, 0x4E8C, 0x10302}}, + {{0x41, 0xD800, 0xDF02, 0x61}, {-1, -1}}, + {{0x41, 0xD800, 0xDF02}, {-1}}, + 
{{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -1}}, + {{0x41, 0xD800, 0x61, 0xDF02}, {-1, 0x61}}, + {{0x41, 0xDF00, 0x61}, {-1}}, + {{0x41, 0x10FFFF, 0x61}, {0x10FFFF}}, + {{0x41, 0x110000, 0x61}, {-1}}, + {{0x41, 0x61}, {0}} +}; + +static void +test_buffer_utf32_conversion (void) +{ + hb_buffer_t *b; + unsigned int i; + + b = hb_buffer_create (); + + for (i = 0; i < G_N_ELEMENTS (utf32_conversion_tests); i++) + { + const utf32_conversion_test_t *test = &utf32_conversion_tests[i]; + unsigned int u_len, chars, j, len; + hb_glyph_info_t *glyphs; + + g_test_message ("UTF-32 test #%d", i); + + for (u_len = 0; test->utf32[u_len]; u_len++) + ; + for (chars = 0; test->codepoints[chars]; chars++) + ; + + hb_buffer_reset (b); + hb_buffer_add_utf32 (b, test->utf32, u_len, 1, u_len - 2); + + glyphs = hb_buffer_get_glyph_infos (b, &len); + g_assert_cmpint (len, ==, chars); + for (j = 0; j < chars; j++) + g_assert_cmphex (glyphs[j].codepoint, ==, test->codepoints[j]); + } + + hb_buffer_destroy (b); +} + + static void test_empty (hb_buffer_t *b) { @@ -810,6 +864,7 @@ main (int argc, char **argv) hb_test_add (test_buffer_utf8_conversion); hb_test_add (test_buffer_utf8_validity); hb_test_add (test_buffer_utf16_conversion); + hb_test_add (test_buffer_utf32_conversion); hb_test_add (test_buffer_empty); return hb_test_run(); commit b98c5db32d15fcfb27ce2f6737203ce1ad124319 Author: Behdad Esfahbod <[email protected]> Date: Wed Jul 16 13:44:01 2014 -0400 Minor refactoring diff --git a/src/hb-ot-layout-gsubgpos-private.hh b/src/hb-ot-layout-gsubgpos-private.hh index 470c353..546ff4b 100644 --- a/src/hb-ot-layout-gsubgpos-private.hh +++ b/src/hb-ot-layout-gsubgpos-private.hh @@ -349,11 +349,7 @@ struct hb_apply_context_t may_skip (const hb_apply_context_t *c, const hb_glyph_info_t &info) const { - unsigned int property; - - property = _hb_glyph_info_get_glyph_props (&info); - - if (!c->match_properties (info.codepoint, property, lookup_props)) + if (!c->check_glyph_property (&info, lookup_props)) 
return SKIP_YES; if (unlikely (_hb_glyph_info_is_default_ignorable (&info) && @@ -537,10 +533,12 @@ struct hb_apply_context_t } inline bool - match_properties (hb_codepoint_t glyph, - unsigned int glyph_props, - unsigned int lookup_props) const + check_glyph_property (const hb_glyph_info_t *info, + unsigned int lookup_props) const { + hb_codepoint_t glyph = info->codepoint; + unsigned int glyph_props = _hb_glyph_info_get_glyph_props (info); + /* Not covered, if, for example, glyph class is ligature and * lookup_props includes LookupFlags::IgnoreLigatures */ @@ -553,17 +551,6 @@ struct hb_apply_context_t return true; } - inline bool - check_glyph_property (hb_glyph_info_t *info, - unsigned int lookup_props) const - { - unsigned int property; - - property = _hb_glyph_info_get_glyph_props (info); - - return match_properties (info->codepoint, property, lookup_props); - } - inline void _set_glyph_props (hb_codepoint_t glyph_index, unsigned int class_guess = 0, bool ligature = false, _______________________________________________ HarfBuzz mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/harfbuzz
