This is an automated email from the ASF dual-hosted git repository. damjan pushed a commit to branch icu-c-api in repository https://gitbox.apache.org/repos/asf/openoffice.git
commit ad25e7e6cc7202dfaa6e89605fba804f12bcacff Author: Damjan Jovanovic <[email protected]> AuthorDate: Thu May 1 11:02:23 2025 +0200 Use only the C API for ICU in i18npool, so it can use newer ICU versions. Patch by: me --- main/i18npool/Library_i18npool.mk | 4 +- main/i18npool/inc/breakiterator_unicode.hxx | 5 +- main/i18npool/inc/calendar_gregorian.hxx | 2 +- main/i18npool/inc/collator_unicode.hxx | 3 +- main/i18npool/inc/warnings_guard_unicode_brkiter.h | 4 + .../i18npool/inc/warnings_guard_unicode_calendar.h | 5 +- main/i18npool/inc/warnings_guard_unicode_tblcoll.h | 4 + .../source/breakiterator/breakiterator_unicode.cxx | 142 +++++++++++++-------- main/i18npool/source/breakiterator/data/line.txt | 3 +- main/i18npool/source/breakiterator/data/sent.txt | 87 +++---------- .../source/calendar/calendar_gregorian.cxx | 104 ++++++++------- main/i18npool/source/collator/collator_unicode.cxx | 46 ++++--- main/i18npool/source/collator/gencoll_rule.cxx | 39 ++++-- main/i18npool/source/search/textsearch.cxx | 96 ++++++++------ main/i18npool/source/search/textsearch.hxx | 8 +- 15 files changed, 308 insertions(+), 244 deletions(-) diff --git a/main/i18npool/Library_i18npool.mk b/main/i18npool/Library_i18npool.mk index cf030058cc..9c74e5e727 100644 --- a/main/i18npool/Library_i18npool.mk +++ b/main/i18npool/Library_i18npool.mk @@ -144,7 +144,9 @@ $(WORKDIR)/CustomTarget/i18npool/source/collator/lrl_include.hxx : $(wildcard $( # fdo#31271 ")" reclassified in more recent ICU/Unicode Standards --include $(OUTDIR)/inc/icuversion.mk +ifneq ($(SYSTEM_ICU),YES) +include $(OUTDIR)/inc/icuversion.mk +endif ICU_RECLASSIFIED_BRACKET := $(shell [ ${ICU_MAJOR} -ge 5 -o \( ${ICU_MAJOR} -eq 4 -a ${ICU_MINOR} -ge 4 \) ] && echo YES) diff --git a/main/i18npool/inc/breakiterator_unicode.hxx b/main/i18npool/inc/breakiterator_unicode.hxx index a6ff9b987b..c3c1fa46b2 100644 --- a/main/i18npool/inc/breakiterator_unicode.hxx +++ b/main/i18npool/inc/breakiterator_unicode.hxx @@ -26,6 +26,7 @@ #include <breakiteratorImpl.hxx> #include "warnings_guard_unicode_brkiter.h" +#include "unicode/ustring.h" namespace com { namespace sun { namespace star { namespace i18n { @@ -80,8 +81,8 @@ protected: Boundary result; // for word break iterator struct BI_Data { - UnicodeString aICUText; - icu::BreakIterator *aBreakIterator; + ::rtl::OUString aICUText; + UBreakIterator *aBreakIterator; BI_Data() : aICUText(), aBreakIterator(NULL) {} } character, word, sentence, line, *icuBI; diff --git a/main/i18npool/inc/calendar_gregorian.hxx b/main/i18npool/inc/calendar_gregorian.hxx index 90571e7cf3..f0935116a3 100644 --- a/main/i18npool/inc/calendar_gregorian.hxx +++ b/main/i18npool/inc/calendar_gregorian.hxx @@ -86,7 +86,7 @@ public: protected: Era *eraArray; - icu::Calendar *body; + UCalendar *body; NativeNumberSupplier aNatNum; const sal_Char* cCalendar; com::sun::star::lang::Locale aLocale; diff --git a/main/i18npool/inc/collator_unicode.hxx b/main/i18npool/inc/collator_unicode.hxx index d45b449d78..c480560980 100644 --- a/main/i18npool/inc/collator_unicode.hxx +++ b/main/i18npool/inc/collator_unicode.hxx @@ -72,7 +72,8 @@ public: protected: const sal_Char *implementationName; private: - RuleBasedCollator *uca_base, *collator; + UCollator *uca_base; + UCollator *collator; oslModule hModule; }; diff --git a/main/i18npool/inc/warnings_guard_unicode_brkiter.h b/main/i18npool/inc/warnings_guard_unicode_brkiter.h index a681c8b90c..77d1e0adaf 100644 --- a/main/i18npool/inc/warnings_guard_unicode_brkiter.h +++ b/main/i18npool/inc/warnings_guard_unicode_brkiter.h @@ -24,6 +24,9 @@ #ifndef INCLUDED_WARNINGS_GUARD_UNICODE_BRKITER_H #define INCLUDED_WARNINGS_GUARD_UNICODE_BRKITER_H +#define U_SHOW_CPLUSPLUS_API 0 +#define U_SHOW_CPLUSPLUS_HEADER_API 0 + // Because the GCC system_header mechanism doesn't work in .c/.cxx compilation // units and more important affects the rest of the current include file, the // warnings guard is separated into this header file on its own. @@ -37,6 +40,7 @@ #pragma GCC system_header #endif #include <unicode/brkiter.h> +#include <unicode/ubrk.h> #ifdef _MSC_VER #pragma warning(pop) #endif diff --git a/main/i18npool/inc/warnings_guard_unicode_calendar.h b/main/i18npool/inc/warnings_guard_unicode_calendar.h index 41a5ee26a8..d2db70542c 100644 --- a/main/i18npool/inc/warnings_guard_unicode_calendar.h +++ b/main/i18npool/inc/warnings_guard_unicode_calendar.h @@ -24,6 +24,9 @@ #ifndef INCLUDED_WARNINGS_GUARD_UNICODE_CALENDAR_H #define INCLUDED_WARNINGS_GUARD_UNICODE_CALENDAR_H +#define U_SHOW_CPLUSPLUS_API 0 +#define U_SHOW_CPLUSPLUS_HEADER_API 0 + // Because the GCC system_header mechanism doesn't work in .c/.cxx compilation // units and more important affects the rest of the current include file, the // warnings guard is separated into this header file on its own. @@ -36,7 +39,7 @@ #elif defined __GNUC__ #pragma GCC system_header #endif -#include <unicode/calendar.h> +#include <unicode/ucal.h> #ifdef _MSC_VER #pragma warning(pop) #endif diff --git a/main/i18npool/inc/warnings_guard_unicode_tblcoll.h b/main/i18npool/inc/warnings_guard_unicode_tblcoll.h index de609c8443..bc131fbac5 100644 --- a/main/i18npool/inc/warnings_guard_unicode_tblcoll.h +++ b/main/i18npool/inc/warnings_guard_unicode_tblcoll.h @@ -24,6 +24,9 @@ #ifndef INCLUDED_WARNINGS_GUARD_UNICODE_TBLCOLL_H #define INCLUDED_WARNINGS_GUARD_UNICODE_TBLCOLL_H +#define U_SHOW_CPLUSPLUS_API 0 +#define U_SHOW_CPLUSPLUS_HEADER_API 0 + // Because the GCC system_header mechanism doesn't work in .c/.cxx compilation // units and more important affects the rest of the current include file, the // warnings guard is separated into this header file on its own. @@ -37,6 +40,7 @@ #pragma GCC system_header #endif #include <unicode/tblcoll.h> +#include <unicode/ucol.h> #ifdef _MSC_VER #pragma warning(pop) #endif diff --git a/main/i18npool/source/breakiterator/breakiterator_unicode.cxx b/main/i18npool/source/breakiterator/breakiterator_unicode.cxx index c69e871d6a..fdd28c056b 100644 --- a/main/i18npool/source/breakiterator/breakiterator_unicode.cxx +++ b/main/i18npool/source/breakiterator/breakiterator_unicode.cxx @@ -31,6 +31,7 @@ #include <unicode/udata.h> #include <rtl/strbuf.hxx> #include <rtl/ustring.hxx> +#include <rtl/ustrbuf.hxx> U_CDECL_BEGIN extern const char OpenOffice_dat[]; @@ -67,15 +68,19 @@ BreakIterator_Unicode::BreakIterator_Unicode() : BreakIterator_Unicode::~BreakIterator_Unicode() { if (icuBI && icuBI->aBreakIterator) { - delete icuBI->aBreakIterator; + ubrk_close(icuBI->aBreakIterator); icuBI->aBreakIterator=NULL; } - if (character.aBreakIterator) delete character.aBreakIterator; - if (word.aBreakIterator) delete word.aBreakIterator; - if (sentence.aBreakIterator) delete sentence.aBreakIterator; - if (line.aBreakIterator) delete line.aBreakIterator; + if (character.aBreakIterator) ubrk_close(character.aBreakIterator); + if (word.aBreakIterator) ubrk_close(word.aBreakIterator); + if (sentence.aBreakIterator) ubrk_close(sentence.aBreakIterator); + if (line.aBreakIterator) ubrk_close(line.aBreakIterator); } +// Hard to support in C: +// 1. setBreakType() cannot be reached from C. +// 2. UDataMemory's udata_getLength() is a private API, yet we need the length for ubrk_openBinaryRules(). +#if 0 /* Wrapper class to provide public access to the RuleBasedBreakIterator's setbreakType method. @@ -91,6 +96,28 @@ class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator { }; +static UBreakIterator* open_udata_BreakIterator(UDataMemory *udm, UErrorCode *status) +{ + *status = U_ZERO_ERROR; + UDataInfo info; + info.size = sizeof(info); + udata_getInfo(udm, &info); + if ( !(info.isBigEndian == U_IS_BIG_ENDIAN && + info.charsetFamily == U_CHARSET_FAMILY && + info.dataFormat[0] == 0x42 && // dataFormat="Brk " + info.dataFormat[1] == 0x72 && + info.dataFormat[2] == 0x6b && + info.dataFormat[3] == 0x20) + ) { + *status = U_INVALID_FORMAT_ERROR; + return NULL; + } + + uint8_t *memory = (uint8_t*) udata_getMemory(udm); + return ubrk_openBinaryRules(memory, udata_getLength(udm), NULL, 0, status); +} +#endif + // loading ICU breakiterator on demand. void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale, sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException) @@ -114,9 +141,11 @@ void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star:: rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country || rLocale.Variant != aLocale.Variant) { if (icuBI->aBreakIterator) { - delete icuBI->aBreakIterator; + ubrk_close(icuBI->aBreakIterator); icuBI->aBreakIterator=NULL; } +// Hard to support in C: +#if 0 if (rule) { uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale); @@ -124,11 +153,11 @@ void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star:: udata_setAppData("OpenOffice", OpenOffice_dat, &status); if ( !U_SUCCESS(status) ) throw ERROR; - OOoRuleBasedBreakIterator *rbi = NULL; + UBreakIterator *rbi = NULL; if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) { - rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk", - OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status); + rbi = open_udata_BreakIterator(udata_open("OpenOffice", "brk", + OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), &status); } else { status = U_ZERO_ERROR; OStringBuffer aUDName(64); @@ -137,12 +166,12 @@ void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star:: aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US)); UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status); if( U_SUCCESS(status) ) - rbi = new OOoRuleBasedBreakIterator( pUData, status); + rbi = open_udata_BreakIterator( pUData, &status); if (!U_SUCCESS(status) ) { status = U_ZERO_ERROR; pUData = udata_open("OpenOffice", "brk", rule, &status); if( U_SUCCESS(status) ) - rbi = new OOoRuleBasedBreakIterator( pUData, status); + rbi = open_udata_BreakIterator( pUData, &status); if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL; } } @@ -156,26 +185,37 @@ void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star:: icuBI->aBreakIterator = rbi; } } +#endif if (!icuBI->aBreakIterator) { - icu::Locale icuLocale( - OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(), - OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(), - OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr()); + ::rtl::OUStringBuffer locale; + if (!rLocale.Language.isEmpty()) { + locale.append(rLocale.Language); + locale.appendAscii("_"); + if (!rLocale.Country.isEmpty()) { + locale.append(rLocale.Country); + } + if (!rLocale.Variant.isEmpty()) { + locale.appendAscii("_"); + locale.append(rLocale.Variant); + } + } + char icuLocale[1024]; + uloc_getName(OUStringToOString(locale.makeStringAndClear(), RTL_TEXTENCODING_ASCII_US).getStr(), icuLocale, sizeof(icuLocale), &status); status = U_ZERO_ERROR; switch (rBreakType) { case LOAD_CHARACTER_BREAKITERATOR: - icuBI->aBreakIterator = icu::BreakIterator::createCharacterInstance(icuLocale, status); + icuBI->aBreakIterator = ubrk_open(UBRK_CHARACTER, icuLocale, NULL, 0, &status); break; case LOAD_WORD_BREAKITERATOR: - icuBI->aBreakIterator = icu::BreakIterator::createWordInstance(icuLocale, status); + icuBI->aBreakIterator = ubrk_open(UBRK_WORD, icuLocale, NULL, 0, &status); break; case LOAD_SENTENCE_BREAKITERATOR: - icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status); + icuBI->aBreakIterator = ubrk_open(UBRK_SENTENCE, icuLocale, NULL, 0, &status); break; case LOAD_LINE_BREAKITERATOR: - icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status); + icuBI->aBreakIterator = ubrk_open(UBRK_LINE, icuLocale, NULL, 0, &status); break; } if ( !U_SUCCESS(status) ) { @@ -193,9 +233,9 @@ void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star:: } } - if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()))) { // UChar != sal_Unicode in MinGW - icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()); - icuBI->aBreakIterator->setText(icuBI->aICUText); + if (newBreak || icuBI->aICUText.compareTo(rText)) { // UChar != sal_Unicode in MinGW + icuBI->aICUText = rText; + ubrk_setText(icuBI->aBreakIterator, (const UChar*)icuBI->aICUText.getStr(), icuBI->aICUText.getLength(), &status); } } @@ -208,8 +248,8 @@ sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text, if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text); for (nDone = 0; nDone < nCount; nDone++) { - nStartPos = character.aBreakIterator->following(nStartPos); - if (nStartPos == BreakIterator::DONE) + nStartPos = ubrk_following(character.aBreakIterator, nStartPos); + if (nStartPos == UBRK_DONE) return Text.getLength(); } } else { // for CHARACTER mode @@ -227,8 +267,8 @@ sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Te if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text); for (nDone = 0; nDone < nCount; nDone++) { - nStartPos = character.aBreakIterator->preceding(nStartPos); - if (nStartPos == BreakIterator::DONE) + nStartPos = ubrk_preceding(character.aBreakIterator, nStartPos); + if (nStartPos == UBRK_DONE) return 0; } } else { // for BS to delete one char and CHARACTER mode. @@ -244,17 +284,17 @@ Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int { loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text); - result.startPos = word.aBreakIterator->following(nStartPos); - if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE ) + result.startPos = ubrk_following(word.aBreakIterator, nStartPos); + if( result.startPos >= Text.getLength() || result.startPos == UBRK_DONE ) result.endPos = result.startPos; else { if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES || rWordType == WordType::DICTIONARY_WORD ) && u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) ) - result.startPos = word.aBreakIterator->following(result.startPos); + result.startPos = ubrk_following(word.aBreakIterator, result.startPos); - result.endPos = word.aBreakIterator->following(result.startPos); - if(result.endPos == BreakIterator::DONE) + result.endPos = ubrk_following(word.aBreakIterator, result.startPos); + if(result.endPos == UBRK_DONE) result.endPos = result.startPos; } return result; @@ -266,17 +306,17 @@ Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_ { loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text); - result.startPos = word.aBreakIterator->preceding(nStartPos); - if( result.startPos < 0 || result.startPos == BreakIterator::DONE) + result.startPos = ubrk_preceding(word.aBreakIterator, nStartPos); + if( result.startPos < 0 || result.startPos == UBRK_DONE) result.endPos = result.startPos; else { if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES || rWordType == WordType::DICTIONARY_WORD) && u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) ) - result.startPos = word.aBreakIterator->preceding(result.startPos); + result.startPos = ubrk_preceding(word.aBreakIterator, result.startPos); - result.endPos = word.aBreakIterator->following(result.startPos); - if(result.endPos == BreakIterator::DONE) + result.endPos = ubrk_following(word.aBreakIterator, result.startPos); + if(result.endPos == UBRK_DONE) result.endPos = result.startPos; } return result; @@ -289,27 +329,27 @@ Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text); sal_Int32 len = Text.getLength(); - if(word.aBreakIterator->isBoundary(nPos)) { + if(ubrk_isBoundary(word.aBreakIterator, nPos)) { result.startPos = result.endPos = nPos; if((bDirection || nPos == 0) && nPos < len) //forward - result.endPos = word.aBreakIterator->following(nPos); + result.endPos = ubrk_following(word.aBreakIterator, nPos); else - result.startPos = word.aBreakIterator->preceding(nPos); + result.startPos = ubrk_preceding(word.aBreakIterator, nPos); } else { if(nPos <= 0) { result.startPos = 0; - result.endPos = len ? word.aBreakIterator->following((sal_Int32)0) : 0; + result.endPos = len ? ubrk_following(word.aBreakIterator, (sal_Int32)0) : 0; } else if(nPos >= len) { - result.startPos = word.aBreakIterator->preceding(len); + result.startPos = ubrk_preceding(word.aBreakIterator, len); result.endPos = len; } else { - result.startPos = word.aBreakIterator->preceding(nPos); - result.endPos = word.aBreakIterator->following(nPos); + result.startPos = ubrk_preceding(word.aBreakIterator, nPos); + result.endPos = ubrk_following(word.aBreakIterator, nPos); } } - if (result.startPos == BreakIterator::DONE) + if (result.startPos == UBRK_DONE) result.startPos = result.endPos; - else if (result.endPos == BreakIterator::DONE) + else if (result.endPos == UBRK_DONE) result.endPos = result.startPos; return result; @@ -324,8 +364,8 @@ sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 len = Text.getLength(); if (len > 0 && nStartPos == len) Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence - if (!sentence.aBreakIterator->isBoundary(nStartPos)) - nStartPos = sentence.aBreakIterator->preceding(nStartPos); + if (!ubrk_isBoundary(sentence.aBreakIterator, nStartPos)) + nStartPos = ubrk_preceding(sentence.aBreakIterator, nStartPos); // skip preceding space. sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1); @@ -343,7 +383,7 @@ sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, s sal_Int32 len = Text.getLength(); if (len > 0 && nStartPos == len) Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence - nStartPos = sentence.aBreakIterator->following(nStartPos); + nStartPos = ubrk_following(sentence.aBreakIterator, nStartPos); sal_Int32 nPos=nStartPos; while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos; @@ -369,7 +409,7 @@ LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak( sal_Bool GlueSpace=sal_True; while (GlueSpace) { - if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break + if (ubrk_preceding(line.aBreakIterator, nStartPos + 1) == nStartPos) { //Line boundary break lbr.breakIndex = nStartPos; lbr.breakType = BreakType::WORDBOUNDARY; } else if (hOptions.rHyphenator.is()) { //Hyphenation break @@ -387,11 +427,11 @@ LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak( lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos(); lbr.breakType = BreakType::HYPHENATION; } else { - lbr.breakIndex = line.aBreakIterator->preceding(nStartPos); + lbr.breakIndex = ubrk_preceding(line.aBreakIterator, nStartPos); lbr.breakType = BreakType::WORDBOUNDARY; } } else { //word boundary break - lbr.breakIndex = line.aBreakIterator->preceding(nStartPos); + lbr.breakIndex = ubrk_preceding(line.aBreakIterator, nStartPos); lbr.breakType = BreakType::WORDBOUNDARY; } diff --git a/main/i18npool/source/breakiterator/data/line.txt b/main/i18npool/source/breakiterator/data/line.txt index ca5d7b803f..397da21883 100644 --- a/main/i18npool/source/breakiterator/data/line.txt +++ b/main/i18npool/source/breakiterator/data/line.txt @@ -14,7 +14,8 @@ # !!chain; -!!LBCMNoChain; +# Breaks on recent ICU, see LibreOffice's bug report at https://bugs.documentfoundation.org/show_bug.cgi?id=158108 +#!!LBCMNoChain; !!lookAheadHardBreak; diff --git a/main/i18npool/source/breakiterator/data/sent.txt b/main/i18npool/source/breakiterator/data/sent.txt index 7fada89e62..eb1224ea5e 100644 --- a/main/i18npool/source/breakiterator/data/sent.txt +++ b/main/i18npool/source/breakiterator/data/sent.txt @@ -1,43 +1,40 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html # -# Copyright (C) 2002-2006, International Business Machines Corporation and others. +# Copyright (C) 2002-2015, International Business Machines Corporation and others. # All Rights Reserved. # # file: sent.txt # # ICU Sentence Break Rules # See Unicode Standard Annex #29. -# These rules are based on SA 29 version 5.0.0 -# Includes post 5.0 changes to treat Japanese half width voicing marks -# as Grapheme Extend. +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # - -$VoiceMarks = [\uff9e\uff9f]; -$Thai = [:Script = Thai:]; +!!quoted_literals_only; # # Character categories as defined in TR 29 # +$CR = [\p{Sentence_Break = CR}]; +$LF = [\p{Sentence_Break = LF}]; +$Extend = [\p{Sentence_Break = Extend}]; $Sep = [\p{Sentence_Break = Sep}]; $Format = [\p{Sentence_Break = Format}]; $Sp = [\p{Sentence_Break = Sp}]; $Lower = [\p{Sentence_Break = Lower}]; $Upper = [\p{Sentence_Break = Upper}]; -$OLetter = [\p{Sentence_Break = OLetter}-$VoiceMarks]; +$OLetter = [\p{Sentence_Break = OLetter}]; $Numeric = [\p{Sentence_Break = Numeric}]; $ATerm = [\p{Sentence_Break = ATerm}]; +$SContinue = [\p{Sentence_Break = SContinue}]; $STerm = [\p{Sentence_Break = STerm}]; $Close = [\p{Sentence_Break = Close}]; # # Define extended forms of the character classes, -# incorporate grapheme cluster + format chars. -# Rules 4 and 5. - - -$CR = \u000d; -$LF = \u000a; -$Extend = [[:Grapheme_Extend = TRUE:]$VoiceMarks]; +# incorporate trailing Extend or Format chars. +# Rules 4 and 5. $SpEx = $Sp ($Extend | $Format)*; $LowerEx = $Lower ($Extend | $Format)*; @@ -45,6 +42,7 @@ $UpperEx = $Upper ($Extend | $Format)*; $OLetterEx = $OLetter ($Extend | $Format)*; $NumericEx = $Numeric ($Extend | $Format)*; $ATermEx = $ATerm ($Extend | $Format)*; +$SContinueEx= $SContinue ($Extend | $Format)*; $STermEx = $STerm ($Extend | $Format)*; $CloseEx = $Close ($Extend | $Format)*; @@ -52,77 +50,34 @@ $CloseEx = $Close ($Extend | $Format)*; ## ------------------------------------------------- !!chain; -!!forward; # Rule 3 - break after separators. Keep CR/LF together. # $CR $LF; -$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend | $Format)*; -$LettersEx* $Thai $LettersEx* ($ATermEx | $SpEx)*; # Rule 4 - Break after $Sep. # Rule 5 - Ignore $Format and $Extend # -[^$Sep]? ($Extend | $Format)*; +[^$Sep $CR $LF]? ($Extend | $Format)*; # Rule 6 $ATermEx $NumericEx; # Rule 7 -$UpperEx $ATermEx $UpperEx; +($UpperEx | $LowerEx) $ATermEx $UpperEx; #Rule 8 -# Note: follows errata for Unicode 5.0 boundary rules. -$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*; +$NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*; $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; # Rule 8a -($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx); +($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx); #Rule 9, 10, 11 -($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?; - -#Rule 12 -[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend $Thai]{bof}] ($Extend | $Format | $Close | $Sp)* [^$Thai]; -[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100}; - -## ------------------------------------------------- - -!!reverse; - -$SpEx_R = ($Extend | $Format)* $Sp; -$ATermEx_R = ($Extend | $Format)* $ATerm; -$STermEx_R = ($Extend | $Format)* $STerm; -$CloseEx_R = ($Extend | $Format)* $Close; - -# -# Reverse rules. -# For now, use the old style inexact reverse rules, which are easier -# to write, but less efficient. -# TODO: exact reverse rules. It appears that exact reverse rules -# may require improving support for look-ahead breaks in the -# builder. Needs more investigation. -# - -[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*; -#.*; - -# Explanation for this rule: -# -# It needs to back over -# The $Sep at which we probably begin -# All of the non $Sep chars leading to the preceding $Sep -# The preceding $Sep, which will be the second one that the rule matches. -# Any immediately preceding STerm or ATerm sequences. We need to see these -# to get the correct rule status when moving forwards again. -# -# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match -# the entire string. -# -# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be -# at the beginning of the string at this point, and we don't want to fail. -# Can only use {eof} once, and it is used later. -# +($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; +#Rule 998 +[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .; +[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100}; diff --git a/main/i18npool/source/calendar/calendar_gregorian.cxx b/main/i18npool/source/calendar/calendar_gregorian.cxx index 9fd7349731..6eb1d50a9d 100644 --- a/main/i18npool/source/calendar/calendar_gregorian.cxx +++ b/main/i18npool/source/calendar/calendar_gregorian.cxx @@ -161,11 +161,10 @@ Calendar_gregorian::init(Era *_eraArray) * not all, language and country and variant), otherwise the current * default locale would be used again and the calendar keyword ignored. * */ - icu::Locale aIcuLocale( "", NULL, NULL, "calendar=gregorian"); - - UErrorCode status; - body = icu::Calendar::createInstance( aIcuLocale, status = U_ZERO_ERROR); - if (!body || !U_SUCCESS(status)) throw ERROR; + UErrorCode status = U_ZERO_ERROR; + body = ucal_open(NULL, -1, "@calendar=gregorian", UCAL_GREGORIAN, &status); + if (!body || !U_SUCCESS(status)) + throw ERROR; #if 0 { @@ -292,8 +291,8 @@ Calendar_gregorian::getUniqueID() throw(RuntimeException) void SAL_CALL Calendar_gregorian::setDateTime( double timeInDays ) throw(RuntimeException) { - UErrorCode status; - body->setTime(timeInDays * U_MILLIS_PER_DAY, status = U_ZERO_ERROR); + UErrorCode status = U_ZERO_ERROR; + ucal_setMillis(body, timeInDays * U_MILLIS_PER_DAY, &status); if ( !U_SUCCESS(status) ) throw ERROR; getValue(); } @@ -305,8 +304,8 @@ Calendar_gregorian::getDateTime() throw(RuntimeException) setValue(); getValue(); } - UErrorCode status; - double r = body->getTime(status = U_ZERO_ERROR); + UErrorCode status = U_ZERO_ERROR; + double r = ucal_getMillis(body, &status); if ( !U_SUCCESS(status) ) throw ERROR; return r / U_MILLIS_PER_DAY; } @@ -432,7 +431,7 @@ void Calendar_gregorian::submitFields() throw(com::sun::star::uno::RuntimeExcept switch (fieldIndex) { default: - body->set(fieldNameConverter(fieldIndex), fieldSetValue[fieldIndex]); + ucal_set(body, fieldNameConverter(fieldIndex), fieldSetValue[fieldIndex]); break; case CalendarFieldIndex::ZONE_OFFSET: case CalendarFieldIndex::DST_OFFSET: @@ -444,9 +443,9 @@ void Calendar_gregorian::submitFields() throw(com::sun::star::uno::RuntimeExcept } sal_Int32 nZoneOffset, nDSTOffset; if (getZoneOffset( nZoneOffset)) - body->set( fieldNameConverter( CalendarFieldIndex::ZONE_OFFSET), nZoneOffset); + ucal_set(body, fieldNameConverter( CalendarFieldIndex::ZONE_OFFSET), nZoneOffset); if (getDSTOffset( nDSTOffset)) - body->set( fieldNameConverter( CalendarFieldIndex::DST_OFFSET), nDSTOffset); + ucal_set(body, fieldNameConverter( CalendarFieldIndex::DST_OFFSET), nDSTOffset); } void Calendar_gregorian::submitValues( sal_Int32 nYear, @@ -456,23 +455,23 @@ void Calendar_gregorian::submitValues( sal_Int32 nYear, { submitFields(); if (nYear >= 0) - body->set( UCAL_YEAR, nYear); + ucal_set(body, UCAL_YEAR, nYear); if (nMonth >= 0) - body->set( UCAL_MONTH, nMonth); + ucal_set(body, UCAL_MONTH, nMonth); if (nDay >= 0) - body->set( UCAL_DATE, nDay); + ucal_set(body, UCAL_DATE, nDay); if (nHour >= 0) - body->set( UCAL_HOUR_OF_DAY, nHour); + ucal_set(body, UCAL_HOUR_OF_DAY, nHour); if (nMinute >= 0) - body->set( UCAL_MINUTE, nMinute); + ucal_set(body, UCAL_MINUTE, nMinute); if (nSecond >= 0) - body->set( UCAL_SECOND, nSecond); + ucal_set(body, UCAL_SECOND, nSecond); if (nMilliSecond >= 0) - body->set( UCAL_MILLISECOND, nMilliSecond); + ucal_set(body, UCAL_MILLISECOND, nMilliSecond); if (nZone != 0) - body->set( UCAL_ZONE_OFFSET, nZone); + ucal_set(body, UCAL_ZONE_OFFSET, nZone); if (nDST != 0) - body->set( UCAL_DST_OFFSET, nDST); + ucal_set(body, UCAL_DST_OFFSET, nDST); } static void lcl_setCombinedOffsetFieldValues( sal_Int32 nValue, @@ -533,55 +532,64 @@ void Calendar_gregorian::setValue() throw(RuntimeException) UErrorCode status; if ( !(fieldSet & (1 << CalendarFieldIndex::YEAR)) ) { - nYear = body->get( UCAL_YEAR, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nYear = ucal_get(body, UCAL_YEAR, &status); if ( !U_SUCCESS(status) ) nYear = -1; } if ( !(fieldSet & (1 << CalendarFieldIndex::MONTH)) ) { - nMonth = body->get( UCAL_MONTH, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nMonth = ucal_get(body, UCAL_MONTH, &status); if ( !U_SUCCESS(status) ) nMonth = -1; } if ( !(fieldSet & (1 << CalendarFieldIndex::DAY_OF_MONTH)) ) { - nDay = body->get( UCAL_DATE, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nDay = ucal_get(body, UCAL_DATE, &status); if ( !U_SUCCESS(status) ) nDay = -1; } if ( !(fieldSet & (1 << CalendarFieldIndex::HOUR)) ) { - nHour = body->get( UCAL_HOUR_OF_DAY, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nHour = ucal_get(body, UCAL_HOUR_OF_DAY, &status); if ( !U_SUCCESS(status) ) nHour = -1; } if ( !(fieldSet & (1 << CalendarFieldIndex::MINUTE)) ) { - nMinute = body->get( UCAL_MINUTE, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nMinute = ucal_get(body, UCAL_MINUTE, &status); if ( !U_SUCCESS(status) ) nMinute = -1; } if ( !(fieldSet & (1 << CalendarFieldIndex::SECOND)) ) { - nSecond = body->get( UCAL_SECOND, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nSecond = ucal_get(body, UCAL_SECOND, &status); if ( !U_SUCCESS(status) ) nSecond = -1; } if ( !(fieldSet & (1 << CalendarFieldIndex::MILLISECOND)) ) { - nMilliSecond = body->get( UCAL_MILLISECOND, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nMilliSecond = ucal_get(body, UCAL_MILLISECOND, &status); if ( !U_SUCCESS(status) ) nMilliSecond = -1; } if ( !(fieldSet & (1 << CalendarFieldIndex::ZONE_OFFSET)) ) { - nZone0 = body->get( UCAL_ZONE_OFFSET, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nZone0 = ucal_get(body, UCAL_ZONE_OFFSET, &status); if ( !U_SUCCESS(status) ) nZone0 = 0; } if ( !(fieldSet & (1 << CalendarFieldIndex::DST_OFFSET)) ) { - nDST0 = body->get( UCAL_DST_OFFSET, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nDST0 = ucal_get(body, UCAL_DST_OFFSET, &status); if ( !U_SUCCESS(status) ) nDST0 = 0; } @@ -591,10 +599,12 @@ void Calendar_gregorian::setValue() throw(RuntimeException) DUMP_ICU_CAL_MSG(("%s\n","setValue() in bNeedZone||bNeedDST after submitValues()")); DUMP_I18N_CAL_MSG(("%s\n","setValue() in bNeedZone||bNeedDST after submitValues()")); - nZone1 = body->get( UCAL_ZONE_OFFSET, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nZone1 = ucal_get(body, UCAL_ZONE_OFFSET, &status); if ( !U_SUCCESS(status) ) nZone1 = 0; - nDST1 = body->get( UCAL_DST_OFFSET, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + nDST1 = ucal_get(body, UCAL_DST_OFFSET, &status); if ( !U_SUCCESS(status) ) nDST1 = 0; } @@ -607,11 +617,11 @@ void Calendar_gregorian::setValue() throw(RuntimeException) if ( bNeedZone || bNeedDST ) { - UErrorCode status; - sal_Int32 nZone2 = body->get( UCAL_ZONE_OFFSET, status = U_ZERO_ERROR); + UErrorCode status = U_ZERO_ERROR; + sal_Int32 nZone2 = ucal_get(body, UCAL_ZONE_OFFSET, &status); if ( !U_SUCCESS(status) ) nZone2 = nZone1; - sal_Int32 nDST2 = body->get( UCAL_DST_OFFSET, status = U_ZERO_ERROR); + sal_Int32 nDST2 = ucal_get(body, UCAL_DST_OFFSET, &status); if ( !U_SUCCESS(status) ) nDST2 = nDST1; if ( nZone0 != nZone1 || nZone2 != nZone1 || nDST0 != nDST1 || nDST2 != nDST1 ) @@ -647,7 +657,8 @@ void Calendar_gregorian::setValue() throw(RuntimeException) // -3:30:52 (!) instead of -3:30 // if first submission included time zone -3:30 that would be wrong. bool bResubmit = false; - sal_Int32 nZone3 = body->get( UCAL_ZONE_OFFSET, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + sal_Int32 nZone3 = ucal_get(body, UCAL_ZONE_OFFSET, &status); if ( !U_SUCCESS(status) ) nZone3 = nZone2; if (nZone3 != nZone2) @@ -668,7 +679,8 @@ void Calendar_gregorian::setValue() throw(RuntimeException) // factored in all days by ICU and there seems to be some // unknown behavior. // TZ=Asia/Tehran 1999-03-22 exposes this, for example. - sal_Int32 nDST3 = body->get( UCAL_DST_OFFSET, status = U_ZERO_ERROR); + status = U_ZERO_ERROR; + sal_Int32 nDST3 = ucal_get(body, UCAL_DST_OFFSET, &status); if ( !U_SUCCESS(status) ) nDST3 = nDST2; if (nDST2 != nDST3 && !nDST3) @@ -693,8 +705,8 @@ void Calendar_gregorian::setValue() throw(RuntimeException) #if erDUMP_ICU_CALENDAR || erDUMP_I18N_CALENDAR { // force icu::Calendar to recalculate - UErrorCode status; - sal_Int32 nTmp = body->get( UCAL_DATE, status = U_ZERO_ERROR); + UErrorCode status = U_ZERO_ERROR; + sal_Int32 nTmp = ucal_get(body, UCAL_DATE, &status); DUMP_ICU_CAL_MSG(("%s: %d\n","setValue() result day",nTmp)); DUMP_I18N_CAL_MSG(("%s: %d\n","setValue() result day",nTmp)); } @@ -711,8 +723,8 @@ void Calendar_gregorian::getValue() throw(RuntimeException) fieldIndex == CalendarFieldIndex::DST_OFFSET_SECOND_MILLIS) continue; // not ICU fields - UErrorCode status; sal_Int32 value = body->get( fieldNameConverter( - fieldIndex), status = U_ZERO_ERROR); + UErrorCode status = U_ZERO_ERROR; + sal_Int32 value = ucal_get(body, fieldNameConverter(fieldIndex), &status); if ( !U_SUCCESS(status) ) throw ERROR; // Convert millisecond to minute for ZONE and DST and set remainder in @@ -762,8 +774,8 @@ void SAL_CALL Calendar_gregorian::addValue( sal_Int16 fieldIndex, sal_Int32 value ) throw(RuntimeException) { // since ZONE and DST could not be add, we don't need to convert value here - UErrorCode status; - body->add(fieldNameConverter(fieldIndex), value, status = U_ZERO_ERROR); + UErrorCode status = U_ZERO_ERROR; + ucal_add(body, fieldNameConverter(fieldIndex), value, &status); if ( !U_SUCCESS(status) ) throw ERROR; getValue(); } @@ -865,7 +877,7 @@ Calendar_gregorian::getFirstDayOfWeek() throw(RuntimeException) // Check for underflow just in case we're called "out of sync". return ::std::max( sal::static_int_cast<sal_Int16>(0), sal::static_int_cast<sal_Int16>( static_cast<sal_Int16>( - body->getFirstDayOfWeek()) - 1)); + ucal_getAttribute(body, UCAL_FIRST_DAY_OF_WEEK)) - 1)); } void SAL_CALL @@ -873,14 +885,14 @@ Calendar_gregorian::setFirstDayOfWeek( sal_Int16 day ) throw(RuntimeException) { // Weekdays::SUNDAY == 0, UCAL_SUNDAY == 1 => offset +1 - body->setFirstDayOfWeek( static_cast<UCalendarDaysOfWeek>( day + 1)); + ucal_setAttribute(body, UCAL_FIRST_DAY_OF_WEEK, static_cast<UCalendarDaysOfWeek>( day + 1)); } void SAL_CALL Calendar_gregorian::setMinimumNumberOfDaysForFirstWeek( sal_Int16 days ) throw(RuntimeException) { aCalendar.MinimumNumberOfDaysForFirstWeek = days; - body->setMinimalDaysInFirstWeek( static_cast<uint8_t>( days)); + ucal_setAttribute(body, UCAL_MINIMAL_DAYS_IN_FIRST_WEEK, static_cast<uint8_t>( days)); } sal_Int16 SAL_CALL diff --git a/main/i18npool/source/collator/collator_unicode.cxx b/main/i18npool/source/collator/collator_unicode.cxx index f8b3c214bc..fe5ba42d32 100644 --- a/main/i18npool/source/collator/collator_unicode.cxx +++ b/main/i18npool/source/collator/collator_unicode.cxx @@ -49,8 +49,8 @@ Collator_Unicode::Collator_Unicode() Collator_Unicode::~Collator_Unicode() { - if (collator) delete collator; - if (uca_base) delete uca_base; + if (collator) ucol_close(collator); + if (uca_base) ucol_close(uca_base); if (hModule) osl_unloadModule(hModule); } @@ -58,13 +58,13 @@ sal_Int32 SAL_CALL Collator_Unicode::compareSubstring( const OUString& str1, sal_Int32 off1, sal_Int32 len1, const OUString& str2, sal_Int32 off2, sal_Int32 len2) throw(RuntimeException) { - return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()) + off1, len1, reinterpret_cast<const UChar *>(str2.getStr()) + off2, len2); // UChar != sal_Unicode in MinGW + return ucol_strcoll(collator, reinterpret_cast<const UChar *>(str1.getStr()) + off1, len1, reinterpret_cast<const UChar *>(str2.getStr()) + off2, len2); // UChar != sal_Unicode in MinGW } sal_Int32 SAL_CALL Collator_Unicode::compareString( const OUString& str1, const OUString& str2) throw(RuntimeException) { - return collator->compare(reinterpret_cast<const UChar *>(str1.getStr()), reinterpret_cast<const UChar *>(str2.getStr())); // UChar != sal_Unicode in MinGW + return ucol_strcoll(collator, reinterpret_cast<const UChar *>(str1.getStr()), -1, reinterpret_cast<const UChar *>(str2.getStr()), -1); // UChar != sal_Unicode in MinGW } extern "C" { static void SAL_CALL thisModule() {} } @@ -75,9 +75,10 @@ Collator_Unicode::loadCollatorAlgorithm(const OUString& rAlgorithm, const lang:: { if (!collator) { UErrorCode status = U_ZERO_ERROR; + UParseError parseError; OUString rule = LocaleData().getCollatorRuleByAlgorithm(rLocale, rAlgorithm); if (rule.getLength() > 0) { - collator = new RuleBasedCollator(reinterpret_cast<const UChar *>(rule.getStr()), status); // UChar != sal_Unicode in MinGW + collator = ucol_openRules(reinterpret_cast<const UChar *>(rule.getStr()), -1, UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, &parseError, &status); // UChar != sal_Unicode in MinGW if (! U_SUCCESS(status)) throw RuntimeException(); } if (!collator && OUString::createFromAscii(LOCAL_RULE_LANGS).indexOf(rLocale.Language) >= 0) { @@ -113,9 +114,9 @@ Collator_Unicode::loadCollatorAlgorithm(const OUString& rAlgorithm, const lang:: } if (func) { const sal_uInt8* ruleImage=func(); - uca_base = new RuleBasedCollator(static_cast<UChar*>(NULL), status); + uca_base = ucol_open("root", &status); if (! U_SUCCESS(status)) throw RuntimeException(); - collator = new RuleBasedCollator(reinterpret_cast<const uint8_t*>(ruleImage), -1, uca_base, status); + collator = ucol_openBinary(reinterpret_cast<const uint8_t*>(ruleImage), -1, uca_base, &status); if (! U_SUCCESS(status)) throw RuntimeException(); } } @@ -127,22 +128,37 @@ Collator_Unicode::loadCollatorAlgorithm(const OUString& rAlgorithm, const lang:: case here. The icu::Locale constructor changes the algorithm name to uppercase itself, so we don't have to bother with that. */ - icu::Locale icuLocale( - OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(), - OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(), - OUStringToOString(rAlgorithm, RTL_TEXTENCODING_ASCII_US).getStr()); + /** "The Locale constructor (in C++ and Java) taking multiple strings behaves exactly as if those strings + were concatenated, with the ‘_’ separator inserted between two adjacent non-empty strings, and + the result passed to uloc_getName." -- https://unicode-org.github.io/icu/userguide/locale/ + */ + OUStringBuffer locale; + if (!rLocale.Language.isEmpty()) { + locale.append(rLocale.Language); + locale.appendAscii("_"); + if (!rLocale.Country.isEmpty()) { + locale.append(rLocale.Country); + } + if (!rAlgorithm.isEmpty()) { + locale.appendAscii("_"); + locale.append(rAlgorithm); + } + } + char icuLocale[1024]; + uloc_getName(OUStringToOString(locale.makeStringAndClear(), RTL_TEXTENCODING_ASCII_US).getStr(), icuLocale, sizeof(icuLocale), &status); + if (! U_SUCCESS(status)) throw RuntimeException(); // load ICU collator - collator = (RuleBasedCollator*) icu::Collator::createInstance(icuLocale, status); + collator = ucol_open(icuLocale, &status); if (! U_SUCCESS(status)) throw RuntimeException(); } } if (options & CollatorOptions::CollatorOptions_IGNORE_CASE_ACCENT) - collator->setStrength(Collator::PRIMARY); + ucol_setStrength(collator, UCOL_PRIMARY); else if (options & CollatorOptions::CollatorOptions_IGNORE_CASE) - collator->setStrength(Collator::SECONDARY); + ucol_setStrength(collator, UCOL_SECONDARY); else - collator->setStrength(Collator::TERTIARY); + ucol_setStrength(collator, UCOL_TERTIARY); return(0); } diff --git a/main/i18npool/source/collator/gencoll_rule.cxx b/main/i18npool/source/collator/gencoll_rule.cxx index 2295d79b35..66a1b7962a 100644 --- a/main/i18npool/source/collator/gencoll_rule.cxx +++ b/main/i18npool/source/collator/gencoll_rule.cxx @@ -30,8 +30,10 @@ #include <sal/main.h> #include <sal/types.h> #include <rtl/ustrbuf.hxx> - +#define U_SHOW_CPLUSPLUS_API 0 +#define U_SHOW_CPLUSPLUS_HEADER_API 0 #include "warnings_guard_unicode_tblcoll.h" +#include "unicode/ucol.h" U_CAPI void U_EXPORT2 uprv_free(void *mem); @@ -107,30 +109,41 @@ SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) fclose(fp); UErrorCode status = U_ZERO_ERROR; - //UParseError parseError; - //UCollator *coll = ucol_openRules(Obuf.getStr(), Obuf.getLength(), UCOL_OFF, - // UCOL_DEFAULT_STRENGTH, &parseError, &status); + UParseError parseError; + UCollator *coll = ucol_openRules(reinterpret_cast<const UChar *>(Obuf.getStr()), -1, UCOL_OFF, + UCOL_DEFAULT_STRENGTH, &parseError, &status); + + //RuleBasedCollator *coll = new RuleBasedCollator(reinterpret_cast<const UChar *>(Obuf.getStr()), status); // UChar != sal_Unicode in MinGW - RuleBasedCollator *coll = new RuleBasedCollator(reinterpret_cast<const UChar *>(Obuf.getStr()), status); // UChar != sal_Unicode in MinGW if (U_SUCCESS(status)) { int32_t len = 0; - uint8_t *data = coll->cloneRuleData(len, status); - - if (U_SUCCESS(status) && data != NULL) - data_write(argv[2], argv[3], data, len); - else { + status = U_ZERO_ERROR; + len = ucol_cloneBinary(coll, NULL, 0, &status); + if (len > 0 && status == U_BUFFER_OVERFLOW_ERROR) { + uint8_t* data = (uint8_t*)malloc(len); + if (data != NULL) { + status = U_ZERO_ERROR; + len = ucol_cloneBinary(coll, data, len, &status); + if (U_SUCCESS(status)) + data_write(argv[2], argv[3], data, len); + else { + printf("Could not get rule data from collator\n"); + } + free(data); + } else { + printf("Out of memory getting rule data from collator\n"); + } + } else { printf("Could not get rule data from collator\n"); } - - if (data) uprv_free(data); } else { printf("\nRule parsering error\n"); } if (coll) - delete coll; + ucol_close(coll); //delete coll; return U_SUCCESS(status) ? 0 : 1; } // End of main diff --git a/main/i18npool/source/search/textsearch.cxx b/main/i18npool/source/search/textsearch.cxx index ec30c46d03..6a7f732c0f 100644 --- a/main/i18npool/source/search/textsearch.cxx +++ b/main/i18npool/source/search/textsearch.cxx @@ -73,7 +73,7 @@ TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF) : xMSF( rxMSF ) , pJumpTable( 0 ) , pJumpTable2( 0 ) - , pRegexMatcher( NULL ) + , pRegex( NULL ) , pWLD( 0 ) { SearchOptions aOpt; @@ -85,7 +85,7 @@ TextSearch::TextSearch(const Reference < XMultiServiceFactory > & rxMSF) TextSearch::~TextSearch() { - delete pRegexMatcher; + uregex_close(pRegex); delete pWLD; delete pJumpTable; delete pJumpTable2; @@ -95,7 +95,10 @@ void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeExcep { aSrchPara = rOptions; - delete pRegexMatcher, pRegexMatcher = NULL; + if (pRegex) { + uregex_close(pRegex); + pRegex = NULL; + } delete pWLD, pWLD = 0; delete pJumpTable, pJumpTable = 0; delete pJumpTable2, pJumpTable2 = 0; @@ -739,27 +742,36 @@ void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOp nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; UErrorCode nIcuErr = U_ZERO_ERROR; // assumption: transliteration didn't mangle regexp control chars - IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength()); #ifndef DISABLE_WORDBOUND_EMULATION // for conveniance specific syntax elements of the old regex engine are emulated // - by replacing \< with "word-break followed by a look-ahead word-char" - static const IcuUniString aChevronPatternB( "\\\\<", -1, IcuUniString::kInvariant); - static const IcuUniString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, IcuUniString::kInvariant); - static RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr); - aChevronMatcherB.reset( aIcuSearchPatStr); - aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr); - aChevronMatcherB.reset(); + static const ::rtl::OUString aChevronPatternB = ::rtl::OUString::createFromAscii( "\\\\<" ); + static const ::rtl::OUString aChevronReplaceB = ::rtl::OUString::createFromAscii( "\\\\b(?=\\\\w)" ); + URegularExpression *aChevronMatcherB = uregex_open( (const UChar*)aChevronPatternB.getStr(), -1, 0, NULL, &nIcuErr); + uregex_setText(aChevronMatcherB, (const UChar*)rPatternStr.getStr(), -1, &nIcuErr); + ::std::vector<sal_uInt16> replacedTextB(rPatternStr.getLength() * 2); + int32_t realSize = uregex_replaceAll(aChevronMatcherB, (const UChar*)aChevronReplaceB.getStr(), -1, (UChar*)replacedTextB.data(), replacedTextB.capacity(), &nIcuErr); + if (realSize > replacedTextB.capacity()) { + replacedTextB.reserve(realSize); + realSize = uregex_replaceAll(aChevronMatcherB, (const UChar*)aChevronReplaceB.getStr(), -1, (UChar*)replacedTextB.data(), replacedTextB.capacity(), &nIcuErr); + } + uregex_close(aChevronMatcherB); // - by replacing \> with "look-behind word-char followed by a word-break" - static const IcuUniString aChevronPatternE( "\\\\>", -1, IcuUniString::kInvariant); - static const IcuUniString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, IcuUniString::kInvariant); - static RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr); - aChevronMatcherE.reset( aIcuSearchPatStr); - aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr); - aChevronMatcherE.reset(); + static const ::rtl::OUString aChevronPatternE = ::rtl::OUString::createFromAscii( "\\\\>" ); + static const ::rtl::OUString aChevronReplaceE = ::rtl::OUString::createFromAscii( "(?<=\\\\w)\\\\b" ); + URegularExpression *aChevronMatcherE = uregex_open( (const UChar*)aChevronPatternE.getStr(), -1, 0, NULL, &nIcuErr); + uregex_setText(aChevronMatcherE, (const UChar*)replacedTextB.data(), -1, &nIcuErr); + ::std::vector<sal_uInt16> replacedTextE(replacedTextB.capacity() * 2); + realSize = uregex_replaceAll(aChevronMatcherE, (const UChar*)aChevronReplaceE.getStr(), -1, (UChar*)replacedTextE.data(), replacedTextE.capacity(), &nIcuErr); + if (realSize > replacedTextE.capacity()) { + replacedTextE.reserve(realSize); + realSize = uregex_replaceAll(aChevronMatcherE, (const UChar*)aChevronReplaceE.getStr(), -1, (UChar*)replacedTextE.data(), replacedTextE.capacity(), &nIcuErr); + } + uregex_close(aChevronMatcherE); #endif - pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr); + pRegex = uregex_open( (const UChar*)replacedTextE.data(), -1, nIcuSearchFlags, NULL, &nIcuErr); if( nIcuErr) - { delete pRegexMatcher; pRegexMatcher = NULL;} + { uregex_close(pRegex); pRegex = NULL;} } //--------------------------------------------------------------------------- @@ -770,7 +782,7 @@ SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, { SearchResult aRet; aRet.subRegExpressions = 0; - if( !pRegexMatcher) + if( !pRegex) return aRet; if( endPos > searchStr.getLength()) @@ -778,17 +790,17 @@ SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, // use the ICU RegexMatcher to find the matches UErrorCode nIcuErr = U_ZERO_ERROR; - const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos); - pRegexMatcher->reset( aSearchTargetStr); + const ::rtl::OUString aSearchTargetStr = searchStr.copy(0, endPos); + uregex_setText(pRegex, (const UChar*)aSearchTargetStr.getStr(), -1, &nIcuErr); // search until there is a valid match for(;;) { - if( !pRegexMatcher->find( startPos, nIcuErr)) + if( !uregex_find(pRegex, startPos, &nIcuErr)) return aRet; // #i118887# ignore zero-length matches e.g. "a*" in "bc" - int nStartOfs = pRegexMatcher->start( nIcuErr); - int nEndOfs = pRegexMatcher->end( nIcuErr); + int nStartOfs = uregex_start(pRegex, 0, &nIcuErr); + int nEndOfs = uregex_end(pRegex, 0, &nIcuErr); if( nStartOfs < nEndOfs) break; // try at next position if there was a zero-length match @@ -797,15 +809,15 @@ SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, } // extract the result of the search - const int nGroupCount = pRegexMatcher->groupCount(); + const int nGroupCount = uregex_groupCount(pRegex, &nIcuErr); aRet.subRegExpressions = nGroupCount + 1; aRet.startOffset.realloc( aRet.subRegExpressions); aRet.endOffset.realloc( aRet.subRegExpressions); - aRet.startOffset[0] = pRegexMatcher->start( nIcuErr); - aRet.endOffset[0] = pRegexMatcher->end( nIcuErr); + aRet.startOffset[0] = uregex_start(pRegex, 0, &nIcuErr); + aRet.endOffset[0] = uregex_end(pRegex, 0, &nIcuErr); for( int i = 1; i <= nGroupCount; ++i) { - aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr); - aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr); + aRet.startOffset[i] = uregex_start(pRegex, i, &nIcuErr); + aRet.endOffset[i] = uregex_end(pRegex, i, &nIcuErr); } return aRet; @@ -818,7 +830,7 @@ SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, // NOTE: for backwards search callers provide startPos/endPos inverted! SearchResult aRet; aRet.subRegExpressions = 0; - if( !pRegexMatcher) + if( !pRegex) return aRet; if( startPos > searchStr.getLength()) @@ -828,37 +840,37 @@ SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, // TODO: use ICU's backward searching once it becomes available // as its replacement using forward search is not as good as the real thing UErrorCode nIcuErr = U_ZERO_ERROR; - const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos); - pRegexMatcher->reset( aSearchTargetStr); - if( !pRegexMatcher->find( endPos, nIcuErr)) + const ::rtl::OUString aSearchTargetStr = searchStr.copy(0, startPos); + uregex_setText(pRegex, (const UChar*)aSearchTargetStr.getStr(), -1, &nIcuErr); + if( !uregex_find(pRegex, endPos, &nIcuErr)) return aRet; // find the last match int nLastPos = 0; int nFoundEnd = 0; do { - nLastPos = pRegexMatcher->start( nIcuErr); - nFoundEnd = pRegexMatcher->end( nIcuErr); + nLastPos = uregex_start(pRegex, 0, &nIcuErr); + nFoundEnd = uregex_end(pRegex, 0, &nIcuErr); if( nFoundEnd >= startPos) break; if( nFoundEnd == nLastPos) ++nFoundEnd; - } while( pRegexMatcher->find( nFoundEnd, nIcuErr)); + } while( uregex_find(pRegex, nFoundEnd, &nIcuErr)); // find last match again to get its details - pRegexMatcher->find( nLastPos, nIcuErr); + uregex_find(pRegex, nLastPos, &nIcuErr); // fill in the details of the last match - const int nGroupCount = pRegexMatcher->groupCount(); + const int nGroupCount = uregex_groupCount(pRegex, &nIcuErr); aRet.subRegExpressions = nGroupCount + 1; aRet.startOffset.realloc( aRet.subRegExpressions); aRet.endOffset.realloc( aRet.subRegExpressions); // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! - aRet.startOffset[0] = pRegexMatcher->end( nIcuErr); - aRet.endOffset[0] = pRegexMatcher->start( nIcuErr); + aRet.startOffset[0] = uregex_end(pRegex, 0, &nIcuErr); + aRet.endOffset[0] = uregex_start(pRegex, 0, &nIcuErr); for( int i = 1; i <= nGroupCount; ++i) { - aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr); - aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr); + aRet.startOffset[i] = uregex_end(pRegex, i, &nIcuErr); + aRet.endOffset[i] = uregex_start(pRegex, i, &nIcuErr); } return aRet; diff --git a/main/i18npool/source/search/textsearch.hxx b/main/i18npool/source/search/textsearch.hxx index 793066c9b6..f5a89e1816 100644 --- a/main/i18npool/source/search/textsearch.hxx +++ b/main/i18npool/source/search/textsearch.hxx @@ -34,9 +34,9 @@ #include <map> -#include <unicode/regex.h> -using namespace U_ICU_NAMESPACE; -typedef U_ICU_NAMESPACE::UnicodeString IcuUniString; +#define U_SHOW_CPLUSPLUS_API 0 +#define U_SHOW_CPLUSPLUS_HEADER_API 0 +#include <unicode/uregex.h> class WLevDistance; typedef ::std::map< sal_Unicode, sal_Int32 > TextSearchJumpTable; @@ -92,7 +92,7 @@ class TextSearch: public cppu::WeakImplHelper2 throw(::com::sun::star::uno::RuntimeException); // Members and methods for the regular expression search - RegexMatcher* pRegexMatcher; + URegularExpression* pRegex; ::com::sun::star::util::SearchResult SAL_CALL RESrchFrwrd( const ::rtl::OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
