ping On 8/28/13, Shawn Landden <[email protected]> wrote: > This version counts all multibyte characters as 1 width, not taking into > account double width cjk characters and zerowidth characters > --- > TODO | 4 --- > src/shared/utf8.c | 56 +++++++++++++++++++++++++++++++ > src/shared/utf8.h | 98 > +++++++++++++++++++++++++++++++++++++++++++++++++++++++ > src/shared/util.c | 55 +++++++++++++++++++++++++++++++ > src/shared/util.h | 2 ++ > 5 files changed, 211 insertions(+), 4 deletions(-) > > diff --git a/TODO b/TODO > index fe305ec..a77ebe5 100644 > --- a/TODO > +++ b/TODO > @@ -19,10 +19,6 @@ Bugfixes: > > * properly handle .mount unit state tracking when two mount points are > stacked one on top of another on the exact same mount point. > > -* ellipsize_mem must take into account multi-byte unicode characters, and > - - make the resulting line the requested number of *characters*, not > *bytes*, > - - avoid truncuating multi-byte sequences in the middle. > - > * When we detect invalid UTF-8, we cant't use it in an error message: > log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue); > > diff --git a/src/shared/utf8.c b/src/shared/utf8.c > index 655cc77..8a37c3a 100644 > --- a/src/shared/utf8.c > +++ b/src/shared/utf8.c > @@ -22,6 +22,11 @@ > /* This file is based on the GLIB utf8 validation functions. The > * original license text follows. */ > > +/* gunicode.h - Unicode manipulation functions > + * > + * Copyright (C) 1999, 2000 Tom Tromey > + * Copyright 2000, 2005 Red Hat, Inc. > + */ > /* gutf8.c - Operations on UTF-8 strings. > * > * Copyright (C) 1999 Tom Tromey > @@ -317,3 +322,54 @@ char *utf16_to_utf8(const void *s, size_t length) { > > return r; > } > + > +/** > + * g_utf8_prev_char: > + * @p: a pointer to a position within a UTF-8 encoded string > + * > + * Finds the previous UTF-8 character in the string before @p. > + * > + * @p does not have to be at the beginning of a UTF-8 character. 
No check > + * is made to see if the character found is actually valid other than > + * it starts with an appropriate byte. If @p might be the first > + * character of the string, you must use g_utf8_find_prev_char() instead. > + * > + * Return value: a pointer to the found character. > + **/ > +char * > +utf8_prev_char (const char *p) > +{ > + while (1) > + { > + p--; > + if ((*p & 0xc0) != 0x80) > + return (char *)p; > + } > +} > + > +/** > + * g_utf8_get_char: > + * @p: a pointer to Unicode character encoded as UTF-8 > + * > + * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. > + * If @p does not point to a valid UTF-8 encoded character, results are > + * undefined. If you are not sure that the bytes are complete > + * valid Unicode characters, you should use g_utf8_get_char_validated() > + * instead. > + * > + * Return value: the resulting character > + **/ > +unichar > +utf8_get_char (const char *p) > +{ > + int i, mask = 0, len; > + unichar result; > + unsigned char c = (unsigned char) *p; > + > + UTF8_COMPUTE (c, mask, len); > + if (len == -1) > + return (unichar)-1; > + UTF8_GET (result, p, i, mask, len); > + > + return result; > +} > \ No newline at end of file > diff --git a/src/shared/utf8.h b/src/shared/utf8.h > index f805ea6..020bc27 100644 > --- a/src/shared/utf8.h > +++ b/src/shared/utf8.h > @@ -34,3 +34,101 @@ char *utf8_filter(const char *s); > char *ascii_filter(const char *s); > > char *utf16_to_utf8(const void *s, size_t length); > + > +#define unichar uint32_t > + > +char *utf8_prev_char (const char *p); > +unichar utf8_get_char (const char *p); > + > +#define UTF8_COMPUTE(Char, Mask, Len) > \ > + if (Char < 128) \ > + { > \ > + Len = 1; > \ > + Mask = 0x7f; \ > + } > \ > + else if ((Char & 0xe0) == 0xc0) \ > + { > \ > + Len = 2; > \ > + Mask = 0x1f; \ > + } > \ > + else if ((Char & 0xf0) == 0xe0) \ > + { > \ > + Len = 3; > \ > + Mask = 0x0f; \ > + } > \ > + else if ((Char & 0xf8) == 0xf0) \ > + { > \ > + Len = 4; > \ > + 
Mask = 0x07; \ > + } > \ > + else if ((Char & 0xfc) == 0xf8) \ > + { > \ > + Len = 5; > \ > + Mask = 0x03; \ > + } > \ > + else if ((Char & 0xfe) == 0xfc) \ > + { > \ > + Len = 6; > \ > + Mask = 0x01; \ > + } > \ > + else > \ > + Len = -1; > + > +#define UTF8_LENGTH(Char) \ > + ((Char) < 0x80 ? 1 : \ > + ((Char) < 0x800 ? 2 : \ > + ((Char) < 0x10000 ? 3 : \ > + ((Char) < 0x200000 ? 4 : \ > + ((Char) < 0x4000000 ? 5 : 6))))) > + > + > +#define UTF8_GET(Result, Chars, Count, Mask, Len) \ > + (Result) = (Chars)[0] & (Mask); \ > + for ((Count) = 1; (Count) < (Len); ++(Count)) > \ > + { > \ > + if (((Chars)[(Count)] & 0xc0) != 0x80) \ > + { \ > + (Result) = -1; \ > + break; \ > + } \ > + (Result) <<= 6; > \ > + (Result) |= ((Chars)[(Count)] & 0x3f); \ > + } > + > +/* > + * Check whether a Unicode (5.2) char is in a valid range. > + * > + * The first check comes from the Unicode guarantee to never encode > + * a point above 0x0010ffff, since UTF-16 couldn't represent it. > + * > + * The second check covers surrogate pairs (category Cs). > + * > + * @param Char the character > + */ > +#define UNICODE_VALID(Char) \ > + ((Char) < 0x110000 && \ > + (((Char) & 0xFFFFF800) != 0xD800)) > + > +static const char utf8_skip_data[256] = { > + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, > + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, > + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, > + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, > + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, > + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, > + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, > + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 > +}; > + > +/** > + * g_utf8_next_char: > + * @p: Pointer to the start of a valid UTF-8 character > + * > + * Skips to the next character in a UTF-8 string. 
The string must be > + * valid; this macro is as fast as possible, and has no error-checking. > + * You would use this macro to iterate over a string character by > + * character. The macro returns the start of the next UTF-8 character. > + * Before using this macro, use g_utf8_validate() to validate strings > + * that may contain invalid UTF-8. > + */ > +#define utf8_next_char(p) (char *)((p) + utf8_skip_data[*(const char > *)(p)]) > \ No newline at end of file > diff --git a/src/shared/util.c b/src/shared/util.c > index 9af99aa..58a1787 100644 > --- a/src/shared/util.c > +++ b/src/shared/util.c > @@ -73,6 +73,7 @@ > #include "hashmap.h" > #include "env-util.h" > #include "fileio.h" > +#include "utf8.h" > > int saved_argc = 0; > char **saved_argv = NULL; > @@ -3327,6 +3328,60 @@ char *ellipsize(const char *s, size_t length, > unsigned percent) { > return ellipsize_mem(s, strlen(s), length, percent); > } > > +char *wellipsize_mem(const char *s, size_t old_length, size_t new_length, > unsigned percent) { > + size_t x; > + char *e, *i; > + unichar c; > + unsigned j, k = 0; > + > + assert(s); > + assert(percent <= 100); > + assert(new_length >= 3); > + > + /* if no multibyte characters use ellipsize_mem for speed */ > + if (ascii_is_valid(s)) > + return ellipsize_mem(s, old_length, new_length, percent); > + > + if (old_length <= 3 || old_length <= new_length) > + return strndup(s, old_length); > + > + if (!utf8_is_valid(s)) > + return NULL; > + > + e = new0(char, new_length*4 < old_length ? new_length*4 : > old_length); > + if (!e) > + return NULL; > + > + x = (new_length * percent) / 100; > + > + if (x > new_length - 3) > + x = new_length - 3; > + > + for (i = (char *)s;k < x;i = utf8_next_char(i)) > + k++; > + > + j = i - s; > + memcpy(e, s, j); > + e[j] = '.'; /* TODO: use … tri-dot? 
*/ > + e[j+1] = '.'; /* 0xe2 0x80 0xa6 */ > + e[j+2] = '.'; > + > + k = 0; > + for (i = (char *)s + old_length; > + k < new_length - x - 3;) { > + i = utf8_prev_char(i); > + k++; > + } > + > + strcpy(e + j + 3, i); > + > + return e; > +} > + > +char *wellipsize(const char *s, size_t length, unsigned percent) { > + return wellipsize_mem(s, strlen(s), length, percent); > +} > + > int touch(const char *path) { > int fd; > > diff --git a/src/shared/util.h b/src/shared/util.h > index 63f4e3d..9b17db9 100644 > --- a/src/shared/util.h > +++ b/src/shared/util.h > @@ -404,6 +404,8 @@ int running_in_chroot(void); > > char *ellipsize(const char *s, size_t length, unsigned percent); > char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, > unsigned percent); > +char *wellipsize(const char *s, size_t length, unsigned percent); > +char *wellipsize_mem(const char *s, size_t old_length, size_t new_length, > unsigned percent); > > int touch(const char *path); > > -- > 1.8.4.rc3 > >
-- 
---
Shawn Landden
+1 360 389 3001 (SMS preferred)
_______________________________________________
systemd-devel mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/systemd-devel
