core.git: sal/textenc

Mike Kaganski (via logerrit) Fri, 22 Aug 2025 00:14:53 -0700

 sal/textenc/tencinfo.cxx |  268 +++++++++++++----------------------------------
 1 file changed, 76 insertions(+), 192 deletions(-)


New commits:
commit 2ecfde9b84a0f548871815327d7ce628de93edad
Author:     Mike Kaganski <[email protected]>
AuthorDate: Thu Aug 21 23:11:45 2025 +0200
Commit:     Mike Kaganski <[email protected]>
CommitDate: Fri Aug 22 09:14:26 2025 +0200

    Simplify rtl_getTextEncodingFrom{Unix,Mime}Charset
    
    Change-Id: I912deeaf3ddfe7198e65f7b17a5b067b1d49cd91
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/190041
    Tested-by: Jenkins
    Reviewed-by: Mike Kaganski <[email protected]>

diff --git a/sal/textenc/tencinfo.cxx b/sal/textenc/tencinfo.cxx
index 2447e71006fd..9e6dbc02ada4 100644
--- a/sal/textenc/tencinfo.cxx
+++ b/sal/textenc/tencinfo.cxx
@@ -19,13 +19,15 @@
 
 #include <sal/config.h>
 
-#include <cstring>
+#include <algorithm>
+#include <span>
+#include <string_view>
 
+#include <rtl/character.hxx>
 #include <rtl/tencinfo.h>
 
 #include "gettextencodingdata.hxx"
 #include "tenchelp.hxx"
-#include <memory>
 
 sal_Bool SAL_CALL rtl_isOctetTextEncoding(rtl_TextEncoding nEncoding)
 {
@@ -37,82 +39,18 @@ sal_Bool SAL_CALL rtl_isOctetTextEncoding(rtl_TextEncoding 
nEncoding)
 
 /* ======================================================================= */
 
-static void Impl_toAsciiLower( const char* pName, char* pBuf )
-{
-    while ( *pName )
-    {
-        /* A-Z */
-        if ( (*pName >= 0x41) && (*pName <= 0x5A) )
-            *pBuf = (*pName)+0x20;  /* toAsciiLower */
-        else
-            *pBuf = *pName;
-
-        pBuf++;
-        pName++;
-    }
-
-    *pBuf = '
-}
-
-/* ----------------------------------------------------------------------- */
-
-static void Impl_toAsciiLowerAndRemoveNonAlphanumeric( const char* pName, 
char* pBuf )
-{
-    while ( *pName )
-    {
-        /* A-Z */
-        if ( (*pName >= 0x41) && (*pName <= 0x5A) )
-        {
-            *pBuf = (*pName)+0x20;  /* toAsciiLower */
-            pBuf++;
-        }
-        /* a-z, 0-9 */
-        else if ( ((*pName >= 0x61) && (*pName <= 0x7A)) ||
-                  ((*pName >= 0x30) && (*pName <= 0x39)) )
-        {
-            *pBuf = *pName;
-            pBuf++;
-        }
-
-        pName++;
-    }
-
-    *pBuf = '
-}
-
-/* ----------------------------------------------------------------------- */
-
-/* pMatchStr must match with all characters in pCompStr */
-static bool Impl_matchString( const char* pCompStr, const char* pMatchStr )
-{
-    /* We test only for end in MatchStr, because the last 0 character from */
-    /* pCompStr is unequal a character in MatchStr, so the loop terminates */
-    while ( *pMatchStr )
-    {
-        if ( *pCompStr != *pMatchStr )
-            return false;
-
-        pCompStr++;
-        pMatchStr++;
-    }
-
-    return true;
-}
-
-/* ======================================================================= */
-
 namespace {
 
 struct ImplStrCharsetDef
 {
-    const char*             mpCharsetStr;
+    std::string_view mpCharsetStr;
     rtl_TextEncoding        meTextEncoding;
 };
 
 struct ImplStrFirstPartCharsetDef
 {
-    const char*             mpCharsetStr;
-    const ImplStrCharsetDef*    mpSecondPartTab;
+    std::string_view mpCharsetStr;
+    std::span<const ImplStrCharsetDef> mpSecondPartTab;
 };
 
 }
@@ -210,7 +148,7 @@ rtl_TextEncoding SAL_CALL 
rtl_getTextEncodingFromUnixCharset( const char* pUnixC
     /* for the first matching string in the tables. */
     /* Sort order: unique (first 14, then 1), important */
 
-    static ImplStrCharsetDef const aUnixCharsetISOTab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetISOTab[] =
     {
         { "15", RTL_TEXTENCODING_ISO_8859_15 },
         { "14", RTL_TEXTENCODING_ISO_8859_14 },
@@ -226,16 +164,14 @@ rtl_TextEncoding SAL_CALL 
rtl_getTextEncodingFromUnixCharset( const char* pUnixC
         { "7", RTL_TEXTENCODING_ISO_8859_7 },
         { "8", RTL_TEXTENCODING_ISO_8859_8 },
         { "9", RTL_TEXTENCODING_ISO_8859_9 },
-        { nullptr, RTL_TEXTENCODING_DONTKNOW }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetADOBETab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetADOBETab[] =
     {
         { "fontspecific", RTL_TEXTENCODING_SYMBOL },
-        { nullptr, RTL_TEXTENCODING_DONTKNOW }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetMSTab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetMSTab[] =
     {
         { "1252", RTL_TEXTENCODING_MS_1252 },
         { "1250", RTL_TEXTENCODING_MS_1250 },
@@ -265,10 +201,9 @@ rtl_TextEncoding SAL_CALL 
rtl_getTextEncodingFromUnixCharset( const char* pUnixC
         { "cp949", RTL_TEXTENCODING_MS_949 },
         { "cp950", RTL_TEXTENCODING_MS_950 },
         { "cp1361", RTL_TEXTENCODING_MS_1361 },
-        { nullptr, RTL_TEXTENCODING_DONTKNOW }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetIBMTab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetIBMTab[] =
     {
         { "437", RTL_TEXTENCODING_IBM_437 },
         { "850", RTL_TEXTENCODING_IBM_850 },
@@ -288,95 +223,90 @@ rtl_TextEncoding SAL_CALL 
rtl_getTextEncodingFromUnixCharset( const char* pUnixC
         { "874", RTL_TEXTENCODING_MS_874 },
         { "1004", RTL_TEXTENCODING_MS_1252 },
         { "65400", RTL_TEXTENCODING_SYMBOL },
-        { nullptr, RTL_TEXTENCODING_DONTKNOW }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetKOI8Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetKOI8Tab[] =
     {
         { "r", RTL_TEXTENCODING_KOI8_R },
         { "u", RTL_TEXTENCODING_KOI8_U },
-        { nullptr, RTL_TEXTENCODING_DONTKNOW }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetJISX0208Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetJISX0208Tab[] =
     {
-        { nullptr, RTL_TEXTENCODING_JIS_X_0208 }
+        { {}, RTL_TEXTENCODING_JIS_X_0208 }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetJISX0201Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetJISX0201Tab[] =
     {
-        { nullptr, RTL_TEXTENCODING_JIS_X_0201 }
+        { {}, RTL_TEXTENCODING_JIS_X_0201 }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetJISX0212Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetJISX0212Tab[] =
     {
-        { nullptr, RTL_TEXTENCODING_JIS_X_0212 }
+        { {}, RTL_TEXTENCODING_JIS_X_0212 }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetGBTab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetGBTab[] =
     {
-        { nullptr, RTL_TEXTENCODING_GB_2312 }
+        { {}, RTL_TEXTENCODING_GB_2312 }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetGBKTab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetGBKTab[] =
     {
-        { nullptr, RTL_TEXTENCODING_GBK }
+        { {}, RTL_TEXTENCODING_GBK }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetBIG5Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetBIG5Tab[] =
     {
-        { nullptr, RTL_TEXTENCODING_BIG5 }
+        { {}, RTL_TEXTENCODING_BIG5 }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetKSC56011987Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetKSC56011987Tab[] =
     {
-        { nullptr, RTL_TEXTENCODING_EUC_KR }
+        { {}, RTL_TEXTENCODING_EUC_KR }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetKSC56011992Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetKSC56011992Tab[] =
     {
-        { nullptr, RTL_TEXTENCODING_MS_1361 }
+        { {}, RTL_TEXTENCODING_MS_1361 }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetISO10646Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetISO10646Tab[] =
     {
-        { nullptr, RTL_TEXTENCODING_UNICODE }
+        { {}, RTL_TEXTENCODING_UNICODE }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetUNICODETab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetUNICODETab[] =
     {
 /* Currently every Unicode Encoding is for us Unicode */
 /*        { "fontspecific", RTL_TEXTENCODING_UNICODE }, */
-        { nullptr, RTL_TEXTENCODING_UNICODE }
+        { {}, RTL_TEXTENCODING_UNICODE }
     };
 
-    static ImplStrCharsetDef const aUnixCharsetSymbolTab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetSymbolTab[] =
     {
-        { nullptr, RTL_TEXTENCODING_SYMBOL }
+        { {}, RTL_TEXTENCODING_SYMBOL }
     };
 
     /* See <http://cvs.freedesktop.org/xorg/xc/fonts/encodings/iso8859-11.enc?
        rev=1.1.1.1>: */
-    static ImplStrCharsetDef const aUnixCharsetTIS620Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetTIS620Tab[] =
     {
         { "0", RTL_TEXTENCODING_TIS_620 },
         { "2529", RTL_TEXTENCODING_TIS_620 },
         { "2533", RTL_TEXTENCODING_TIS_620 },
-        { nullptr, RTL_TEXTENCODING_DONTKNOW }
     };
-    static ImplStrCharsetDef const aUnixCharsetTIS6202529Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetTIS6202529Tab[] =
     {
         { "1", RTL_TEXTENCODING_TIS_620 },
-        { nullptr, RTL_TEXTENCODING_DONTKNOW }
     };
-    static ImplStrCharsetDef const aUnixCharsetTIS6202533Tab[] =
+    static constexpr ImplStrCharsetDef aUnixCharsetTIS6202533Tab[] =
     {
         { "0", RTL_TEXTENCODING_TIS_620 },
         { "1", RTL_TEXTENCODING_TIS_620 },
-        { nullptr, RTL_TEXTENCODING_DONTKNOW }
     };
 
-    static ImplStrFirstPartCharsetDef const aUnixCharsetFirstPartTab[] =
+    static constexpr ImplStrFirstPartCharsetDef aUnixCharsetFirstPartTab[] =
     {
         { "iso8859", aUnixCharsetISOTab },
         { "adobe", aUnixCharsetADOBETab },
@@ -407,66 +337,40 @@ rtl_TextEncoding SAL_CALL 
rtl_getTextEncodingFromUnixCharset( const char* pUnixC
 /*        { "sunudcko.1997",  },        */
 /*        { "sunudczh.1997",  },        */
 /*        { "sunudczhtw.1997",  },      */
-        { nullptr, nullptr }
     };
 
-    rtl_TextEncoding    eEncoding = RTL_TEXTENCODING_DONTKNOW;
-    char*           pTempBuf;
-    sal_uInt32          nBufLen = strlen( pUnixCharset )+1;
-    const char*     pFirstPart;
-    const char*     pSecondPart;
+    const std::string_view charset(pUnixCharset);
+    const size_t dashPos = charset.find('-');
+    if (dashPos == std::string_view::npos)
+        return RTL_TEXTENCODING_DONTKNOW;
 
+    /* found part separator */
     /* Alloc Buffer and map to lower case */
-    std::unique_ptr<char[]> pBuf(new char[nBufLen]);
-    Impl_toAsciiLower( pUnixCharset, pBuf.get() );
-
-    /* Search FirstPart */
-    pFirstPart = pBuf.get();
-    pSecondPart = nullptr;
-    pTempBuf = pBuf.get();
-    while ( *pTempBuf )
-    {
-        if ( *pTempBuf == '-' )
-        {
-            *pTempBuf = '
-            pSecondPart = pTempBuf+1;
-            break;
-        }
+    char buf1[20]; // more than enough to compare the longest prefix here, 
which is 14 chars
+    std::string_view part1 = charset.substr(0, std::min(dashPos, size_t(20)));
+    std::transform(part1.begin(), part1.end(), buf1, 
rtl::toAsciiLowerCase<unsigned char>);
+    part1 = { buf1, part1.size() };
 
-        pTempBuf++;
-    }
+    char buf2[20]; // more than enough to compare the longest postfix here, 
which is 13 chars
+    std::string_view part2 = charset.substr(dashPos + 1, 20);
+    std::transform(part2.begin(), part2.end(), buf2, 
rtl::toAsciiLowerCase<unsigned char>);
+    part2 = { buf2, part2.size() };
 
-    /* found part separator */
-    if ( pSecondPart )
+    /* Search for the part tab */
+    for (const auto& rFirstPartData : aUnixCharsetFirstPartTab)
     {
-        /* Search for the part tab */
-        const ImplStrFirstPartCharsetDef* pFirstPartData = 
aUnixCharsetFirstPartTab;
-        while ( pFirstPartData->mpCharsetStr )
+        if (part1.starts_with(rFirstPartData.mpCharsetStr))
         {
-            if ( Impl_matchString( pFirstPart, pFirstPartData->mpCharsetStr ) )
-            {
-                /* Search for the charset in the second part tab */
-                const ImplStrCharsetDef* pData = 
pFirstPartData->mpSecondPartTab;
-                while ( pData->mpCharsetStr )
-                {
-                    if ( Impl_matchString( pSecondPart, pData->mpCharsetStr ) )
-                    {
-                        break;
-                    }
-
-                    pData++;
-                }
-
-                /* use default encoding for first part */
-                eEncoding = pData->meTextEncoding;
-                break;
-            }
-
-            pFirstPartData++;
+            /* Search for the charset in the second part tab */
+            for (const auto& rData : rFirstPartData.mpSecondPartTab)
+                if (part2.starts_with(rData.mpCharsetStr)) // empty 
mpCharsetStr matches always
+                    return rData.meTextEncoding;
+
+            return RTL_TEXTENCODING_DONTKNOW;
         }
     }
 
-    return eEncoding;
+    return RTL_TEXTENCODING_DONTKNOW;
 }
 
 /* ----------------------------------------------------------------------- */
@@ -477,7 +381,7 @@ rtl_TextEncoding SAL_CALL 
rtl_getTextEncodingFromMimeCharset( const char* pMimeC
     /* characters. The function search for the first equal string in */
     /* the table. In this table are only the most used mime types. */
     /* Sort order: important */
-    static ImplStrCharsetDef const aVIPMimeCharsetTab[] =
+    static constexpr ImplStrCharsetDef aVIPMimeCharsetTab[] =
     {
         { "usascii", RTL_TEXTENCODING_ASCII_US },
         { "utf8", RTL_TEXTENCODING_UTF8 },
@@ -519,14 +423,13 @@ rtl_TextEncoding SAL_CALL 
rtl_getTextEncodingFromMimeCharset( const char* pMimeC
         { "windows1256", RTL_TEXTENCODING_MS_1256 },
         { "windows1257", RTL_TEXTENCODING_MS_1257 },
         { "windows1258", RTL_TEXTENCODING_MS_1258 },
-        { nullptr, RTL_TEXTENCODING_DONTKNOW }
     };
 
     /* All Identifiers are in lower case and contain only alphanumeric */
     /* characters. The function search for the first matching string in */
     /* the table. */
     /* Sort order: unique (first iso885914, then iso88591), important */
-    static ImplStrCharsetDef const aMimeCharsetTab[] =
+    static constexpr ImplStrCharsetDef aMimeCharsetTab[] =
     {
         { "unicode11utf7", RTL_TEXTENCODING_UTF7 },
         { "caunicode11utf7", RTL_TEXTENCODING_UTF7 },
@@ -737,46 +640,27 @@ rtl_TextEncoding SAL_CALL 
rtl_getTextEncodingFromMimeCharset( const char* pMimeC
         { "xisciide", RTL_TEXTENCODING_ISCII_DEVANAGARI },
             /* This is not an official MIME character set name, but is in use 
by
                various windows APIs. */
-        { nullptr, RTL_TEXTENCODING_DONTKNOW }
     };
 
-    rtl_TextEncoding            eEncoding = RTL_TEXTENCODING_DONTKNOW;
-    const ImplStrCharsetDef*    pData = aVIPMimeCharsetTab;
-    sal_uInt32                  nBufLen = strlen( pMimeCharset )+1;
-
     /* Alloc Buffer and map to lower case and remove non alphanumeric chars */
-    std::unique_ptr<char[]> pBuf(new char[nBufLen]);
-    Impl_toAsciiLowerAndRemoveNonAlphanumeric( pMimeCharset, pBuf.get() );
+    char buf[30]; // more than enough to compare the longest encoding name 
here, which is 23 chars
+    auto end = std::begin(buf);
+    for (const char* p = pMimeCharset; *p && end != std::end(buf); ++p)
+        if (rtl::isAsciiAlphanumeric<unsigned char>(*p))
+            *end++ = rtl::toAsciiLowerCase<unsigned char>(*p);
+    std::string_view charset(buf, end);
 
     /* Search for equal in the VIP table */
-    while ( pData->mpCharsetStr )
-    {
-        if ( strcmp( pBuf.get(), pData->mpCharsetStr ) == 0 )
-        {
-            eEncoding = pData->meTextEncoding;
-            break;
-        }
-
-        pData++;
-    }
+    for (const auto& rData : aVIPMimeCharsetTab)
+        if (charset == rData.mpCharsetStr)
+            return rData.meTextEncoding;
 
     /* Search for matching in the mime table */
-    if ( eEncoding == RTL_TEXTENCODING_DONTKNOW )
-    {
-        pData = aMimeCharsetTab;
-        while ( pData->mpCharsetStr )
-        {
-            if ( Impl_matchString( pBuf.get(), pData->mpCharsetStr ) )
-            {
-                eEncoding = pData->meTextEncoding;
-                break;
-            }
-
-            pData++;
-        }
-    }
+    for (const auto& rData : aMimeCharsetTab)
+        if (charset.starts_with(rData.mpCharsetStr))
+            return rData.meTextEncoding;
 
-    return eEncoding;
+    return RTL_TEXTENCODING_DONTKNOW;
 }
 
 /* ======================================================================= */

core.git: sal/textenc

Reply via email to