sc/Library_scui.mk                  |    3 +
 sc/source/ui/dbgui/scuiasciiopt.cxx |   71 +++++++++++++++++++-----------------
 2 files changed, 41 insertions(+), 33 deletions(-)

New commits:
commit 85f12e47f4a086a3923dd3a6b097776d60c6dc82
Author:     Tomofumi Yagi <[email protected]>
AuthorDate: Sat Sep 12 11:47:10 2020 +0900
Commit:     Noel Grandin <[email protected]>
CommitDate: Sun Sep 13 13:21:43 2020 +0200

    Calc: ScImportAsciiDlg can now detect Unicode encoding without BOM
    
    Change-Id: I8a3aa7458ce97f659c0caf2386a96f605b740fbc
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/102543
    Tested-by: Jenkins
    Reviewed-by: Noel Grandin <[email protected]>

diff --git a/sc/Library_scui.mk b/sc/Library_scui.mk
index a8c2097485b0..86605ab63a0d 100644
--- a/sc/Library_scui.mk
+++ b/sc/Library_scui.mk
@@ -39,6 +39,9 @@ $(eval $(call gb_Library_use_externals,scui,\
        $(call gb_Helper_optional,OPENCL, \
                clew) \
        mdds_headers \
+       icui18n \
+       icuuc \
+       icu_headers \
 ))
 
 $(eval $(call gb_Library_use_libraries,scui,\
diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx
index a0e645e551e0..5e5f08bf87a7 100644
--- a/sc/source/ui/dbgui/scuiasciiopt.cxx
+++ b/sc/source/ui/dbgui/scuiasciiopt.cxx
@@ -37,6 +37,9 @@
 #include <miscuno.hxx>
 #include <osl/diagnose.h>
 
+#include <unicode/uclean.h>
+#include <unicode/ucsdet.h>
+
 //! TODO make dynamic
 const SCSIZE ASCIIDLG_MAXROWS                = MAXROWCOUNT;
 
@@ -380,41 +383,43 @@ ScImportAsciiDlg::ScImportAsciiDlg(weld::Window* pParent, const OUString& aDatNa
     // Sniff for Unicode / not
     if( ePreselectUnicode == RTL_TEXTENCODING_DONTKNOW && mpDatStream )
     {
-        Seek( 0 );
-        mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
-        sal_uLong nUniPos = mpDatStream->Tell();
-        switch (nUniPos)
+        mpDatStream->Seek( 0 );
+        constexpr size_t buffsize = 4096;
+        sal_Int8 bytes[buffsize] = { 0 };
+        sal_Int32 nRead = mpDatStream->ReadBytes( bytes, buffsize );
+        mpDatStream->Seek( 0 );
+
+        if ( nRead > 0 )
         {
-            case 2:
-                ePreselectUnicode = RTL_TEXTENCODING_UNICODE;   // UTF-16
-                break;
-            case 3:
-                ePreselectUnicode = RTL_TEXTENCODING_UTF8;      // UTF-8
-                break;
-            case 0:
-                {
-                    sal_uInt16 n;
-                    mpDatStream->ReadUInt16( n );
-                    // Assume that normal ASCII/ANSI/ISO/etc. text doesn't start with
-                    // control characters except CR,LF,TAB
-                    if ( (n & 0xff00) < 0x2000 )
-                    {
-                        switch ( n & 0xff00 )
-                        {
-                            case 0x0900 :
-                            case 0x0a00 :
-                            case 0x0d00 :
-                                break;
-                            default:
-                                ePreselectUnicode = RTL_TEXTENCODING_UNICODE;   // UTF-16
-                        }
-                    }
-                    mpDatStream->Seek(0);
-                }
-                break;
-            default:
-                ;   // nothing
+            UErrorCode uerr = U_ZERO_ERROR;
+            UCharsetDetector* ucd = ucsdet_open( &uerr );
+            ucsdet_setText( ucd, reinterpret_cast<const char*>(bytes), nRead, &uerr );
+            const UCharsetMatch* match = ucsdet_detect( ucd, &uerr );
+            const char* pEncodingName = ucsdet_getName( match, &uerr );
+
+            if ( U_SUCCESS(uerr) && !strcmp("UTF-8", pEncodingName) )
+            {
+                ePreselectUnicode = RTL_TEXTENCODING_UTF8; // UTF-8
+                mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_UTF8 );
+            }
+            else if ( U_SUCCESS(uerr) && !strcmp("UTF-16LE", pEncodingName) )
+            {
+                ePreselectUnicode = RTL_TEXTENCODING_UNICODE; // UTF-16LE
+                mpDatStream->SetEndian( SvStreamEndian::LITTLE );
+                mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_UNICODE );
+            }
+            else if ( U_SUCCESS(uerr) && !strcmp("UTF-16BE", pEncodingName) )
+            {
+                ePreselectUnicode = RTL_TEXTENCODING_UNICODE; // UTF-16BE
+                mpDatStream->SetEndian(SvStreamEndian::BIG);
+                mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_UNICODE );
+            }
+            else // other
+                mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
+
+            ucsdet_close( ucd );
         }
+
         mnStreamPos = mpDatStream->Tell();
     }
 
_______________________________________________
Libreoffice-commits mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits

Reply via email to