Hi

On Wed, Nov 12, 2014 at 11:49 AM, WaLyong Cho <[email protected]> wrote:
> ---
>  src/shared/utf8.c    | 39 +++++++++++++++++++++++++++++++++++++++
>  src/shared/utf8.h    |  1 +
>  src/test/test-utf8.c | 25 +++++++++++++++++++++++++
>  3 files changed, 65 insertions(+)
>
> diff --git a/src/shared/utf8.c b/src/shared/utf8.c
> index 8702ceb..0b6c38e 100644
> --- a/src/shared/utf8.c
> +++ b/src/shared/utf8.c
> @@ -212,6 +212,45 @@ char *utf8_escape_invalid(const char *str) {
>          return p;
>  }
>
> +char *utf8_escape_non_printable(const char *str) {
> +        char *p, *s;
> +
> +        assert(str);
> +
> +        p = s = malloc(strlen(str) * 4 + 1);
> +        if (!p)
> +                return NULL;
> +
> +        while (*str) {
> +                int len;
> +
> +                len = utf8_encoded_valid_unichar(str);
> +                if (len > 0) {
> +                        if (utf8_is_printable(str, len)) {
> +                                s = mempcpy(s, str, len);
> +                                str += len;
> +                        } else {
> +                                if ((*str < ' ') || (*str >= 127)) {
> +                                        *(s++) = '\\';
> +                                        *(s++) = 'x';
> +                                        *(s++) = hexchar((int) *str >> 4);
> +                                        *(s++) = hexchar((int) *str);
> +                                } else
> +                                        *(s++) = *str;
> +
> +                                str += 1;

This part is wrong. You cannot rely on ``*str'' to be the correct
Unicode value for the character. utf8_is_printable() returns false
also for multi-byte UTF8 characters. By taking it unmodified, it will
include the UTF8 management bits, which we really don't want here.

If you really want this, I'd prefer if you decode each UTF8 character,
and if it is non-printable you print "\uABCD" or "\UABCDWXYZ" (like
C++ does) as a 6-byte or 10-byte sequence. Other characters are just
printed normally.

Thanks
David

> +                        }
> +                } else {
> +                        s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, 
> strlen(UTF8_REPLACEMENT_CHARACTER));
> +                        str += 1;
> +                }
> +        }
> +
> +        *s = '\0';
> +
> +        return p;
> +}
> +
>  char *ascii_is_valid(const char *str) {
>          const char *p;
>
> diff --git a/src/shared/utf8.h b/src/shared/utf8.h
> index c087995..1fe1a35 100644
> --- a/src/shared/utf8.h
> +++ b/src/shared/utf8.h
> @@ -30,6 +30,7 @@
>  const char *utf8_is_valid(const char *s) _pure_;
>  char *ascii_is_valid(const char *s) _pure_;
>  char *utf8_escape_invalid(const char *s);
> +char *utf8_escape_non_printable(const char *str);
>
>  bool utf8_is_printable_newline(const char* str, size_t length, bool newline) 
> _pure_;
>  _pure_ static inline bool utf8_is_printable(const char* str, size_t length) {
> diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c
> index b7d988f..6dde63c 100644
> --- a/src/test/test-utf8.c
> +++ b/src/test/test-utf8.c
> @@ -66,12 +66,37 @@ static void test_utf8_escaping(void) {
>          assert_se(utf8_is_valid(p3));
>  }
>
> +static void test_utf8_escaping_printable(void) {
> +        _cleanup_free_ char *p1, *p2, *p3, *p4, *p5;
> +
> +        p1 = utf8_escape_non_printable("goo goo goo");
> +        puts(p1);
> +        assert_se(utf8_is_valid(p1));
> +
> +        p2 = utf8_escape_non_printable("\341\204\341\204");
> +        puts(p2);
> +        assert_se(utf8_is_valid(p2));
> +
> +        p3 = utf8_escape_non_printable("\341\204");
> +        puts(p3);
> +        assert_se(utf8_is_valid(p3));
> +
> +        p4 = 
> utf8_escape_non_printable("ąę\n가너도루\n1234\n\341\204\341\204\n\001 \019\20\a");
> +        puts(p4);
> +        assert_se(utf8_is_valid(p4));
> +
> +        p5 = utf8_escape_non_printable("\001 \019\20\a");
> +        puts(p5);
> +        assert_se(utf8_is_valid(p5));
> +}
> +
>  int main(int argc, char *argv[]) {
>          test_utf8_is_valid();
>          test_utf8_is_printable();
>          test_ascii_is_valid();
>          test_utf8_encoded_valid_unichar();
>          test_utf8_escaping();
> +        test_utf8_escaping_printable();
>
>          return 0;
>  }
> --
> 1.9.3
>
> _______________________________________________
> systemd-devel mailing list
> [email protected]
> http://lists.freedesktop.org/mailman/listinfo/systemd-devel
_______________________________________________
systemd-devel mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/systemd-devel

Reply via email to