https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70893

--- Comment #6 from Кирилл <kirillnow at gmail dot com> ---
(In reply to Jonathan Wakely from comment #4)
> If you think there's a bug here please provide a testcase that compiles and
> produces an incorrect result.

Here:
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>

using namespace std;
//-------------------------------------------------------------------------------------------------
/** Extensively tested and works.*/
template<bool big_e> string my_utf16_to_utf8(const char *s, size_t sz)
{
  uint32_t ucs=0; sz &= ~1; //odd sizes are rounded down
  string rtv; rtv.resize(sz+sz/2);
  char* o=&rtv[0];
  for(const uint8_t *i=(uint8_t*)s, *e=i+sz; i<e; i+=2, ++o)
   {
    ucs = uint32_t(i[!big_e])<<8 | i[big_e];
    if((ucs&0xFC00)==0xD800 && i+2<e && (i[2+!big_e]&0xFC)==0xDC)
     {
      i+=2; ucs &= 0x03FF;
      ucs = (ucs<<10 | uint32_t(i[!big_e]&3)<<8 | i[big_e]) + 0x010000;
     } //pass standalone surrogates as-is
    if(ucs&(~0x7FF))
     {
      if(ucs&(~0xFFFF))
       {
        *o=ucs>>18|0xF0; ++o; *o=(ucs>>12 & 0x3F)|0x80; ++o;
        *o=(ucs>>6 & 0x3F)|0x80; ++o; *o=(ucs&0x3F)|0x80;
       }
      else
       { *o=ucs>>12|0xE0; ++o; *o=(ucs>>6&0x3F)|0x80; ++o; *o=(ucs&0x3F)|0x80;
}
     }
    else
     {
      if(ucs&(~0x7F)) { *o=ucs>>6|0xC0; ++o; *o=(ucs&0x3F)|0x80; }
      else *o = ucs;
     }
//  if((o-&rtv[0]>=rtv.size()) throw range_error("utf16_to_utf8()"); //debug
   }
  rtv.resize(o-&rtv[0]); rtv.shrink_to_fit();
  return rtv;
}
//-------------------------------------------------------------------------------------------------
template<bool big_e> inline std::string std_utf16_to_utf8(const char *s, size_t
sz)
{
  using namespace std; sz &= ~1;
  wstring_convert<codecvt_utf8_utf16
    <char16_t, 0x10ffff, (big_e)?(codecvt_mode)0:little_endian>, char16_t>
conv;
  try
   { return conv.to_bytes((const char16_t*)s, (const char16_t*)(s+sz)); }
  catch(...) { return string{}; }
}
//-------------------------------------------------------------------------------------------------
int main(int argc, const char** argv)
{
  static constexpr const uint8_t txt_utf16be[] = {
    0x01, 0x31, 0x00, 0x6e, 0x00, 0x74, 0x02, 0x59, 0x00, 0x67, 0x00, 0xe6,
    0x00, 0x6c, 0x00, 0xe6, 0x00, 0x6b, 0x00, 0x74, 0x01, 0x31, 0x00, 0x63,
    0x00, 0x2c, 0x00, 0x20, 0x00, 0x62, 0x00, 0x61, 0x01, 0x31, 0x00, 0x20,
    0x00, 0xf0, 0x02, 0x59, 0x00, 0x20, 0x00, 0x62, 0x00, 0x69, 0x02, 0xd0,
    0x00, 0x73, 0x00, 0x74, 0x00, 0x69, 0x00, 0x20, 0x00, 0x62, 0x02, 0x54,
    0x01, 0x31, 0x00, 0x7a, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x6c, 0x01, 0x31,
    0x00, 0x72, 0x01, 0x31, 0x00, 0x6b, 0x00, 0x73, 0x00, 0x20, 0x00, 0x74,
    0x00, 0x72, 0x00, 0xe6, 0x00, 0x6e, 0x00, 0x73, 0x00, 0x6b, 0x00, 0x72,
    0x00, 0x61, 0x01, 0x31, 0x00, 0x62, 0x00, 0x64, 0x00, 0x20, 0x01, 0x31,
    0x00, 0x6e, 0x00, 0x74, 0x00, 0x75, 0x00, 0x20, 0x00, 0x69, 0x00, 0x70,
    0x00, 0x61, 0x00, 0x20, 0x00, 0x6a, 0x00, 0x75, 0x02, 0xd0, 0x00, 0x73,
    0x01, 0x31, 0x00, 0x6e, 0x00, 0x67, 0x00, 0x20, 0x00, 0xe6, 0x00, 0x6e,
    0x00, 0x20, 0x00, 0x69, 0x02, 0xd0, 0x00, 0x73, 0x00, 0x74, 0x00, 0x20,
    0x00, 0x6d, 0x01, 0x31, 0x00, 0x64, 0x00, 0x6c, 0x02, 0x59, 0x00, 0x6e,
    0x00, 0x64, 0x00, 0x73, 0x00, 0x20, 0x00, 0xe6, 0x00, 0x6b, 0x00, 0x73,
    0x02, 0x59, 0x00, 0x6e, 0x00, 0x74, 0x00, 0x20, 0x00, 0x62, 0x00, 0x61,
    0x01, 0x31, 0x00, 0x20, 0x00, 0x72, 0x02, 0x52, 0x00, 0x62, 0x02, 0x59,
    0x00, 0x74, 0x00, 0x20, 0x00, 0x62, 0x00, 0x72, 0x00, 0x65, 0x01, 0x31,
    0x00, 0x64, 0x00, 0x69, 0x00, 0x0a, 0x00, 0x0a, 0x00, 0x73, 0x02, 0x52,
    0x00, 0x72, 0x00, 0x69, 0x00, 0x20, 0x02, 0x59, 0x00, 0x62, 0x00, 0x61,
    0x02, 0x8a, 0x00, 0x74, 0x00, 0x20, 0x00, 0xf0, 0x02, 0x59, 0x00, 0x20,
    0x00, 0x6c, 0x00, 0xe6, 0x00, 0x6b, 0x00, 0x20, 0x02, 0x59, 0x00, 0x76,
    0x00, 0x20, 0x00, 0x73, 0x00, 0x74, 0x00, 0x72, 0x02, 0x5b, 0x00, 0x73,
    0x00, 0x20, 0x01, 0x31, 0x00, 0x6e, 0x00, 0x66, 0x02, 0x54, 0x02, 0xd0,
    0x00, 0x6d, 0x00, 0x65, 0x01, 0x31, 0x02, 0x83, 0x02, 0x59, 0x00, 0x6e,
    0x00, 0x2c, 0x00, 0x20, 0x00, 0x62, 0x02, 0x8c, 0x00, 0x74, 0x00, 0x20,
    0x00, 0x73, 0x01, 0x31, 0x00, 0x6e, 0x00, 0x73, 0x00, 0x20, 0x00, 0xf0,
    0x01, 0x31, 0x00, 0x73, 0x00, 0x20, 0x01, 0x31, 0x00, 0x7a, 0x00, 0x20,
    0x02, 0x59, 0x00, 0x20, 0x00, 0x72, 0x00, 0xe6, 0x00, 0x70, 0x00, 0x20,
    0x01, 0x31, 0x00, 0x74, 0x00, 0x20, 0x00, 0x77, 0x02, 0x8c, 0x00, 0x64,
    0x02, 0x59, 0x00, 0x6e, 0x00, 0x74, 0x00, 0x20, 0x00, 0x62, 0x00, 0x69,
    0x00, 0x20, 0x00, 0x0a, 0x00, 0x6d, 0x02, 0x8c, 0x02, 0xa7, 0x00, 0x20,
    0x00, 0x6a, 0x00, 0x75, 0x02, 0xd0, 0x00, 0x73, 0x00, 0x0a, 0x00, 0x0a,
};
  constexpr size_t txt_sz = (sizeof txt_utf16be)/(sizeof txt_utf16be[0]);
  cout << "My conversion: " << endl;
  cout << my_utf16_to_utf8<true>((char*)txt_utf16be, txt_sz) << endl;
  cout << "Codecvt conversion: " << endl;
  cout << std_utf16_to_utf8<true>((char*)txt_utf16be, txt_sz);
  return 0;
}

Reply via email to