On 13/03/17 19:35 +0000, Jonathan Wakely wrote:
This is a series of patches to fix various bugs in the Unicode
character conversion facets.
The first patch fixes a silly < versus <= bug that meant that 0xffff
got written as a surrogate pair instead of as simply 0xffff, and an
endianness bug for the internal representation of UTF-16 code units
stored in char32_t or wchar_t values. That's PR 79511.
The second patch fixes some incorrect bitwise operations (because I
confused & and |) and some incorrect limits (because I confused max
and min). That fixes determining the endianness of the external
representation bytes when they start with a Byte Order Mark, and
correctly reports errors on invalid UCS2. It also fixes
wstring_convert so that it reports the number of characters that were
converted prior to an error. That's PR 79980.
The third patch fixes the output of the encoding() and max_length()
member functions on the codecvt facets, because I wasn't correctly
accounting for a BOM or for the differences between UTF-16 and UCS2.
I plan to commit these for all branches, but I'll wait until after GCC
7.1 is released, and fix it for 7.2 instead. These bugs aren't
important enough to rush into trunk now.
One more patch for a problem found by the libc++ testsuite. Now we
pass all the libc++ tests, and we even pass a test that libc++ fails.
With this, I hope our <codecvt> is 100% conforming. Just in time to be
deprecated for C++17 :-)
commit 3118704bc37cd771b9fc5bf83230f38a16a7c5c3
Author: Jonathan Wakely <jwak...@redhat.com>
Date: Tue Mar 14 17:47:12 2017 +0000
PR libstdc++/80041 fix codecvt_utf16<wchar_t> to use UTF-16 not UTF-8
PR libstdc++/80041
* src/c++11/codecvt.cc (__codecvt_utf16_base<wchar_t>::do_out)
(__codecvt_utf16_base<wchar_t>::do_in): Convert char arguments to
char16_t to work with UTF-16 instead of UTF-8.
* testsuite/22_locale/codecvt/codecvt_utf16/80041.cc: New test.
diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index 9c91725..ef38267 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -1217,7 +1217,10 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
extern_type* __to, extern_type* __to_end,
extern_type*& __to_next) const
{
- range<char> to{ __to, __to_end };
+ range<char16_t> to{
+ reinterpret_cast<char16_t*>(__to),
+ reinterpret_cast<char16_t*>(__to_end)
+ };
#if __SIZEOF_WCHAR_T__ == 2
range<const char16_t> from{
reinterpret_cast<const char16_t*>(__from),
@@ -1234,7 +1237,7 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
return codecvt_base::error;
#endif
__from_next = reinterpret_cast<const wchar_t*>(from.next);
- __to_next = to.next;
+ __to_next = reinterpret_cast<char*>(to.next);
return res;
}
@@ -1254,7 +1257,10 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
intern_type* __to, intern_type* __to_end,
intern_type*& __to_next) const
{
- range<const char> from{ __from, __from_end };
+ range<const char16_t> from{
+ reinterpret_cast<const char16_t*>(__from),
+ reinterpret_cast<const char16_t*>(__from_end)
+ };
#if __SIZEOF_WCHAR_T__ == 2
range<char16_t> to{
reinterpret_cast<char16_t*>(__to),
@@ -1270,7 +1276,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
#else
return codecvt_base::error;
#endif
- __from_next = from.next;
+ __from_next = reinterpret_cast<const char*>(from.next);
__to_next = reinterpret_cast<wchar_t*>(to.next);
return res;
}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc
new file mode 100644
index 0000000..a78b194
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc
@@ -0,0 +1,87 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+void
+test01()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+ std::codecvt_utf16<wchar_t> conv;
+ const wchar_t wc = 0x6557;
+ char bytes[2] = {0};
+ const wchar_t* wcnext;
+ std::mbstate_t st{};
+ char* next = nullptr;
+ auto res = conv.out(st, &wc, &wc+ 1, wcnext, bytes, std::end(bytes), next);
+ VERIFY( res == std::codecvt_base::ok );
+ VERIFY( wcnext == &wc + 1 );
+ VERIFY( next == std::end(bytes) );
+ VERIFY( bytes[0] == 0x65 );
+ VERIFY( bytes[1] == 0x57 );
+ VERIFY( conv.length(st, bytes, next, 1) == (next - bytes) );
+
+ wchar_t w;
+ wchar_t* wnext;
+ const char* cnext;
+ st = {};
+ res = conv.in(st, bytes, next, cnext, &w, &w + 1, wnext);
+ VERIFY( res == std::codecvt_base::ok );
+ VERIFY( wnext == &w + 1 );
+ VERIFY( cnext == next );
+ VERIFY( w == wc );
+#endif
+}
+
+void
+test02()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+ std::codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> conv;
+ wchar_t wc = 0x6557;
+ char bytes[2] = {0};
+ const wchar_t* wcnext;
+ std::mbstate_t st{};
+ char* next = nullptr;
+ auto res = conv.out(st, &wc, &wc+ 1, wcnext, bytes, std::end(bytes), next);
+ VERIFY( res == std::codecvt_base::ok );
+ VERIFY( wcnext == &wc + 1 );
+ VERIFY( next == std::end(bytes) );
+ VERIFY( bytes[0] == 0x57 );
+ VERIFY( bytes[1] == 0x65 );
+ VERIFY( conv.length(st, bytes, next, 1) == (next - bytes) );
+
+ wchar_t w;
+ wchar_t* wnext;
+ const char* cnext;
+ st = {};
+ res = conv.in(st, bytes, next, cnext, &w, &w + 1, wnext);
+ VERIFY( res == std::codecvt_base::ok );
+ VERIFY( wnext == &w + 1 );
+ VERIFY( cnext == next );
+ VERIFY( w == wc );
+#endif
+}
+
+int main()
+{
+ test01();
+ test02();
+}