--- Begin Message ---
Source: ucto
Source-Version: 0.9.6-1
Severity: important
Tags: patch
Usertags: icu63
Dear Maintainer,
ICU 63.1 was recently released, packaged and uploaded to experimental.
Its transition is going to start soon. However, your package fails to
build with this version. I am attaching a patch which fixes the problem.
Please check if it works with the version in Sid and upload the
package when it's feasible for you.
Thanks,
Laszlo/GCS
Description: fix FTBFS with ICU 63.1
Add icu namespace.
Author: Laszlo Boszormenyi (GCS) <gcs@debian.org>
Last-Update: 2018-11-09
---
--- ucto-0.9.6.orig/include/ucto/setting.h
+++ ucto-0.9.6/include/ucto/setting.h
@@ -38,15 +38,15 @@ namespace Tokenizer {
public:
Rule(): regexp(0){
};
- Rule( const UnicodeString& id, const UnicodeString& pattern);
+ Rule( const icu::UnicodeString& id, const icu::UnicodeString& pattern);
~Rule();
- UnicodeString id;
- UnicodeString pattern;
+ icu::UnicodeString id;
+ icu::UnicodeString pattern;
UnicodeRegexMatcher *regexp;
- bool matchAll( const UnicodeString&,
- UnicodeString&,
- UnicodeString&,
- std::vector<UnicodeString>& );
+ bool matchAll( const icu::UnicodeString&,
+ icu::UnicodeString&,
+ icu::UnicodeString&,
+ std::vector<icu::UnicodeString>& );
private:
Rule( const Rule& ); // inhibit copies
Rule& operator=( const Rule& ); // inhibit copies
@@ -56,17 +56,17 @@ namespace Tokenizer {
class Quoting {
friend std::ostream& operator<<( std::ostream&, const Quoting& );
struct QuotePair {
- UnicodeString openQuote;
- UnicodeString closeQuote;
+ icu::UnicodeString openQuote;
+ icu::UnicodeString closeQuote;
};
public:
- void add( const UnicodeString&, const UnicodeString& );
- UnicodeString lookupOpen( const UnicodeString &) const;
- UnicodeString lookupClose( const UnicodeString & ) const;
+ void add( const icu::UnicodeString&, const icu::UnicodeString& );
+ icu::UnicodeString lookupOpen( const icu::UnicodeString &) const;
+ icu::UnicodeString lookupClose( const icu::UnicodeString & ) const;
bool empty() const { return _quotes.empty(); };
bool emptyStack() const { return quotestack.empty(); };
void clearStack() { quoteindexstack.clear(); quotestack.clear(); };
- int lookup( const UnicodeString&, int& );
+ int lookup( const icu::UnicodeString&, int& );
void eraseAtPos( int pos ) {
quotestack.erase( quotestack.begin()+pos );
quoteindexstack.erase( quoteindexstack.begin()+pos );
@@ -90,14 +90,14 @@ namespace Tokenizer {
bool readfilters( const std::string& );
bool readquotes( const std::string& );
bool readeosmarkers( const std::string& );
- bool readabbreviations( const std::string&, UnicodeString& );
- void add_rule( const UnicodeString&, const std::vector<UnicodeString>& );
- void sortRules( std::map<UnicodeString, Rule *>&,
- const std::vector<UnicodeString>& );
- UnicodeString eosmarkers;
+ bool readabbreviations( const std::string&, icu::UnicodeString& );
+ void add_rule( const icu::UnicodeString&, const std::vector<icu::UnicodeString>& );
+ void sortRules( std::map<icu::UnicodeString, Rule *>&,
+ const std::vector<icu::UnicodeString>& );
+ icu::UnicodeString eosmarkers;
std::vector<Rule *> rules;
- std::map<UnicodeString, Rule *> rulesmap;
- std::map<UnicodeString, int> rules_index;
+ std::map<icu::UnicodeString, Rule *> rulesmap;
+ std::map<icu::UnicodeString, int> rules_index;
Quoting quotes;
UnicodeFilter filter;
std::string set_file; // the name of the settingsfile
--- ucto-0.9.6.orig/include/ucto/tokenize.h
+++ ucto-0.9.6/include/ucto/tokenize.h
@@ -78,11 +78,11 @@ namespace Tokenizer {
class Token {
friend std::ostream& operator<< (std::ostream&, const Token& );
public:
- UnicodeString type;
- UnicodeString us;
+ icu::UnicodeString type;
+ icu::UnicodeString us;
TokenRole role;
- Token( const UnicodeString&,
- const UnicodeString&,
+ Token( const icu::UnicodeString&,
+ const icu::UnicodeString&,
TokenRole role = NOROLE,
const std::string& = "" );
std::string lc; // ISO 639-3 language code
@@ -130,12 +130,12 @@ namespace Tokenizer {
// Tokenize a line (a line is NOT a sentence, but an arbitrary string
// of characters, inclusive EOS markers, Newlines etc.)
- int tokenizeLine( const UnicodeString&,
+ int tokenizeLine( const icu::UnicodeString&,
const std::string& = "default" ); // Unicode chars
int tokenizeLine( const std::string&,
const std::string& = "default" ); // UTF8 chars
- void passthruLine( const UnicodeString&, bool& );
+ void passthruLine( const icu::UnicodeString&, bool& );
void passthruLine( const std::string&, bool& );
//Processes tokens and initialises the sentence buffer. Returns the amount of sentences found
@@ -209,8 +209,8 @@ namespace Tokenizer {
void setLanguage( const std::string& l ){ default_language = l; };
// set eos marker
- UnicodeString setEosMarker( const std::string& s = "<utt>") { UnicodeString t = eosmark; eosmark = folia::UTF8ToUnicode(s); return t; };
- UnicodeString getEosMarker( ) const { return eosmark; }
+ icu::UnicodeString setEosMarker( const std::string& s = "<utt>") { icu::UnicodeString t = eosmark; eosmark = folia::UTF8ToUnicode(s); return t; };
+ icu::UnicodeString getEosMarker( ) const { return eosmark; }
bool setNormSet( const std::string& );
@@ -255,14 +255,14 @@ namespace Tokenizer {
private:
TokenizerClass( const TokenizerClass& ); // inhibit copies
TokenizerClass& operator=( const TokenizerClass& ); // inhibit copies
- void add_rule( const UnicodeString&,
- const std::vector<UnicodeString>& );
- void tokenizeWord( const UnicodeString&,
+ void add_rule( const icu::UnicodeString&,
+ const std::vector<icu::UnicodeString>& );
+ void tokenizeWord( const icu::UnicodeString&,
bool,
const std::string&,
- const UnicodeString& ="" );
+ const icu::UnicodeString& ="" );
- bool detectEos( size_t, const UnicodeString&, const Quoting& ) const;
+ bool detectEos( size_t, const icu::UnicodeString&, const Quoting& ) const;
void detectSentenceBounds( const int offset,
const std::string& = "default" );
void detectQuotedSentenceBounds( const int offset,
@@ -272,7 +272,7 @@ namespace Tokenizer {
//Signal the tokeniser that a paragraph is detected
void signalParagraph( bool b=true ) { paragraphsignal = b; };
- bool resolveQuote( int, const UnicodeString&, Quoting& );
+ bool resolveQuote( int, const icu::UnicodeString&, Quoting& );
bool u_isquote( UChar32,
const Quoting& ) const;
std::string checkBOM( std::istream& );
@@ -289,9 +289,9 @@ namespace Tokenizer {
UnicodeNormalizer normalizer;
std::string inputEncoding;
- UnicodeString eosmark;
+ icu::UnicodeString eosmark;
std::vector<Token> tokens;
- std::set<UnicodeString> norm_set;
+ std::set<icu::UnicodeString> norm_set;
TiCC::LogStream *theErrLog;
std::string default_language;
--- ucto-0.9.6.orig/include/ucto/unicode.h
+++ ucto-0.9.6/include/ucto/unicode.h
@@ -42,7 +42,7 @@ namespace Tokenizer {
class UnicodeNormalizer {
public:
UnicodeNormalizer(): mode(UNORM_NFC){};
- UnicodeString normalize( const UnicodeString& );
+ icu::UnicodeString normalize( const icu::UnicodeString& );
std::string getMode( ) const;
std::string setMode( const std::string& );
private:
@@ -52,34 +52,34 @@ namespace Tokenizer {
class UnicodeFilter {
friend std::ostream& operator<<( std::ostream&, const UnicodeFilter& );
public:
- UnicodeString filter( const UnicodeString& );
+ icu::UnicodeString filter( const icu::UnicodeString& );
bool fill( const std::string& );
- bool add( const UnicodeString& );
+ bool add( const icu::UnicodeString& );
bool add( const std::string& );
bool empty() const { return the_map.empty(); };
private:
- void add( UChar uc, const UnicodeString& us ) { the_map[uc] = us; };
- std::map<UChar, UnicodeString> the_map;
+ void add( UChar uc, const icu::UnicodeString& us ) { the_map[uc] = us; };
+ std::map<UChar, icu::UnicodeString> the_map;
};
class UnicodeRegexMatcher {
public:
- UnicodeRegexMatcher( const UnicodeString&, const UnicodeString& name="" );
+ UnicodeRegexMatcher( const icu::UnicodeString&, const icu::UnicodeString& name="" );
~UnicodeRegexMatcher();
- bool match_all( const UnicodeString&, UnicodeString&, UnicodeString& );
- const UnicodeString get_match( unsigned int ) const;
+ bool match_all( const icu::UnicodeString&, icu::UnicodeString&, icu::UnicodeString& );
+ const icu::UnicodeString get_match( unsigned int ) const;
int NumOfMatches() const;
- int split( const UnicodeString&, std::vector<UnicodeString>& );
- UnicodeString Pattern() const;
+ int split( const icu::UnicodeString&, std::vector<icu::UnicodeString>& );
+ icu::UnicodeString Pattern() const;
private:
UnicodeRegexMatcher( const UnicodeRegexMatcher& ); // inhibit copies
UnicodeRegexMatcher& operator=( const UnicodeRegexMatcher& ); // inhibit copies
std::string failString;
- RegexPattern *pattern;
- RegexMatcher *matcher;
+ icu::RegexPattern *pattern;
+ icu::RegexMatcher *matcher;
UnicodeRegexMatcher();
- std::vector<UnicodeString> results;
- const UnicodeString _name;
+ std::vector<icu::UnicodeString> results;
+ const icu::UnicodeString _name;
};
} // namespace
--- ucto-0.9.6.orig/src/setting.cxx
+++ ucto-0.9.6/src/setting.cxx
@@ -54,7 +54,7 @@ namespace Tokenizer {
ORDINALS, EOSMARKERS, QUOTES, CURRENCY,
FILTER, RULEORDER, METARULES };
- ConfigMode getMode( const UnicodeString& line ) {
+ ConfigMode getMode( const icu::UnicodeString& line ) {
ConfigMode mode = NONE;
if (line == "[RULES]") {
mode = RULES;
@@ -111,7 +111,7 @@ namespace Tokenizer {
public:
uConfigError( const string& s, const string& f ):
invalid_argument( "ucto: " + s + " (" + f + ")" ){};
- uConfigError( const UnicodeString& us, const string& f ):
+ uConfigError( const icu::UnicodeString& us, const string& f ):
uConfigError( folia::UnicodeToUTF8(us), f ){};
};
@@ -143,14 +143,14 @@ namespace Tokenizer {
}
}
- void Quoting::add( const UnicodeString& o, const UnicodeString& c ){
+ void Quoting::add( const icu::UnicodeString& o, const icu::UnicodeString& c ){
QuotePair quote;
quote.openQuote = o;
quote.closeQuote = c;
_quotes.push_back( quote );
}
- int Quoting::lookup( const UnicodeString& open, int& stackindex ){
+ int Quoting::lookup( const icu::UnicodeString& open, int& stackindex ){
if (quotestack.empty() || (quotestack.size() != quoteindexstack.size())) return -1;
auto it = quotestack.crbegin();
size_t i = quotestack.size();
@@ -165,7 +165,7 @@ namespace Tokenizer {
return -1;
}
- UnicodeString Quoting::lookupOpen( const UnicodeString &q ) const {
+ icu::UnicodeString Quoting::lookupOpen( const icu::UnicodeString &q ) const {
for ( const auto& quote : _quotes ){
if ( quote.openQuote.indexOf(q) >=0 )
return quote.closeQuote;
@@ -173,8 +173,8 @@ namespace Tokenizer {
return "";
}
- UnicodeString Quoting::lookupClose( const UnicodeString &q ) const {
- UnicodeString res;
+ icu::UnicodeString Quoting::lookupClose( const icu::UnicodeString &q ) const {
+ icu::UnicodeString res;
for ( const auto& quote : _quotes ){
if ( quote.closeQuote.indexOf(q) >= 0 )
return quote.openQuote;
@@ -186,7 +186,7 @@ namespace Tokenizer {
delete regexp;
}
- Rule::Rule( const UnicodeString& _id, const UnicodeString& _pattern):
+ Rule::Rule( const icu::UnicodeString& _id, const icu::UnicodeString& _pattern):
id(_id), pattern(_pattern) {
regexp = new UnicodeRegexMatcher( pattern, id );
}
@@ -200,10 +200,10 @@ namespace Tokenizer {
return os;
}
- bool Rule::matchAll( const UnicodeString& line,
- UnicodeString& pre,
- UnicodeString& post,
- vector<UnicodeString>& matches ){
+ bool Rule::matchAll( const icu::UnicodeString& line,
+ icu::UnicodeString& pre,
+ icu::UnicodeString& post,
+ vector<icu::UnicodeString>& matches ){
matches.clear();
pre = "";
post = "";
@@ -244,7 +244,7 @@ namespace Tokenizer {
else {
string rawline;
while ( getline(f,rawline) ){
- UnicodeString line = folia::UTF8ToUnicode(rawline);
+ icu::UnicodeString line = folia::UTF8ToUnicode(rawline);
line.trim();
if ((line.length() > 0) && (line[0] != '#')) {
if ( tokDebug >= 5 ){
@@ -255,8 +255,8 @@ namespace Tokenizer {
throw uConfigError( "invalid RULES entry: " + line,
fname );
}
- UnicodeString id = UnicodeString( line, 0,splitpoint);
- UnicodeString pattern = UnicodeString( line, splitpoint+1);
+ icu::UnicodeString id = icu::UnicodeString( line, 0,splitpoint);
+ icu::UnicodeString pattern = icu::UnicodeString( line, splitpoint+1);
rulesmap[id] = new Rule( id, pattern);
}
}
@@ -282,7 +282,7 @@ namespace Tokenizer {
else {
string rawline;
while ( getline(f,rawline) ){
- UnicodeString line = folia::UTF8ToUnicode(rawline);
+ icu::UnicodeString line = folia::UTF8ToUnicode(rawline);
line.trim();
if ((line.length() > 0) && (line[0] != '#')) {
if ( tokDebug >= 5 ){
@@ -296,8 +296,8 @@ namespace Tokenizer {
+ " (missing whitespace)",
fname );
}
- UnicodeString open = UnicodeString( line, 0,splitpoint);
- UnicodeString close = UnicodeString( line, splitpoint+1);
+ icu::UnicodeString open = icu::UnicodeString( line, 0,splitpoint);
+ icu::UnicodeString close = icu::UnicodeString( line, splitpoint+1);
open = open.trim().unescape();
close = close.trim().unescape();
if ( open.isEmpty() || close.isEmpty() ){
@@ -323,7 +323,7 @@ namespace Tokenizer {
else {
string rawline;
while ( getline(f,rawline) ){
- UnicodeString line = folia::UTF8ToUnicode(rawline);
+ icu::UnicodeString line = folia::UTF8ToUnicode(rawline);
line.trim();
if ((line.length() > 0) && (line[0] != '#')) {
if ( tokDebug >= 5 ){
@@ -331,7 +331,7 @@ namespace Tokenizer {
}
if ( ( line.startsWith("\\u") && line.length() == 6 ) ||
( line.startsWith("\\U") && line.length() == 10 ) ){
- UnicodeString uit = line.unescape();
+ icu::UnicodeString uit = line.unescape();
if ( uit.isEmpty() ){
throw uConfigError( "Invalid EOSMARKERS entry: " + line, fname );
}
@@ -344,7 +344,7 @@ namespace Tokenizer {
}
bool Setting::readabbreviations( const string& fname,
- UnicodeString& abbreviations ){
+ icu::UnicodeString& abbreviations ){
if ( tokDebug > 0 ){
*theErrLog << "%include " << fname << endl;
}
@@ -355,7 +355,7 @@ namespace Tokenizer {
else {
string rawline;
while ( getline(f,rawline) ){
- UnicodeString line = folia::UTF8ToUnicode(rawline);
+ icu::UnicodeString line = folia::UTF8ToUnicode(rawline);
line.trim();
if ((line.length() > 0) && (line[0] != '#')) {
if ( tokDebug >= 5 ){
@@ -370,17 +370,17 @@ namespace Tokenizer {
return true;
}
- void Setting::add_rule( const UnicodeString& name,
- const vector<UnicodeString>& parts ){
- UnicodeString pat;
+ void Setting::add_rule( const icu::UnicodeString& name,
+ const vector<icu::UnicodeString>& parts ){
+ icu::UnicodeString pat;
for ( auto const& part : parts ){
pat += part;
}
rulesmap[name] = new Rule( name, pat );
}
- void Setting::sortRules( map<UnicodeString, Rule *>& rulesmap,
- const vector<UnicodeString>& sort ){
+ void Setting::sortRules( map<icu::UnicodeString, Rule *>& rulesmap,
+ const vector<icu::UnicodeString>& sort ){
// LOG << "rules voor sort : " << endl;
// for ( size_t i=0; i < rules.size(); ++i ){
// LOG << "rule " << i << " " << *rules[i] << endl;
@@ -432,14 +432,14 @@ namespace Tokenizer {
return result;
}
- void addOrder( vector<UnicodeString>& order,
- map<UnicodeString,int>& reverse_order,
+ void addOrder( vector<icu::UnicodeString>& order,
+ map<icu::UnicodeString,int>& reverse_order,
int& index,
- UnicodeString &line,
+ icu::UnicodeString &line,
const string& fn ){
try {
UnicodeRegexMatcher m( "\\s+" );
- vector<UnicodeString> usv;
+ vector<icu::UnicodeString> usv;
m.split( line, usv );
for ( const auto& us : usv ){
if ( reverse_order.find( us ) != reverse_order.end() ){
@@ -500,7 +500,7 @@ namespace Tokenizer {
int dbg, LogStream* ls ) {
tokDebug = dbg;
theErrLog = ls;
- map<ConfigMode, UnicodeString> pattern = { { ABBREVIATIONS, "" },
+ map<ConfigMode, icu::UnicodeString> pattern = { { ABBREVIATIONS, "" },
{ TOKENS, "" },
{ PREFIXES, "" },
{ SUFFIXES, "" },
@@ -508,7 +508,7 @@ namespace Tokenizer {
{ ATTACHEDSUFFIXES, "" },
{ UNITS, "" },
{ ORDINALS, "" } };
- vector<UnicodeString> rules_order;
+ vector<icu::UnicodeString> rules_order;
vector<string> meta_rules;
string conffile = get_filename( settings_name );
@@ -572,7 +572,7 @@ namespace Tokenizer {
continue;
}
- UnicodeString line = folia::UTF8ToUnicode(rawline);
+ icu::UnicodeString line = folia::UTF8ToUnicode(rawline);
line.trim();
if ((line.length() > 0) && (line[0] != '#')) {
if (line[0] == '[') {
@@ -580,7 +580,7 @@ namespace Tokenizer {
}
else {
if ( line[0] == '\\' && line.length() > 1 && line[1] == '[' ){
- line = UnicodeString( line, 1 );
+ line = icu::UnicodeString( line, 1 );
}
switch( mode ){
case RULES: {
@@ -589,8 +589,8 @@ namespace Tokenizer {
throw uConfigError( "invalid RULES entry: " + line,
set_file );
}
- UnicodeString id = UnicodeString( line, 0,splitpoint);
- UnicodeString pattern = UnicodeString( line, splitpoint+1);
+ icu::UnicodeString id = icu::UnicodeString( line, 0,splitpoint);
+ icu::UnicodeString pattern = icu::UnicodeString( line, splitpoint+1);
rulesmap[id] = new Rule( id, pattern);
}
break;
@@ -617,7 +617,7 @@ namespace Tokenizer {
case EOSMARKERS:
if ( ( line.startsWith("\\u") && line.length() == 6 ) ||
( line.startsWith("\\U") && line.length() == 10 ) ){
- UnicodeString uit = line.unescape();
+ icu::UnicodeString uit = line.unescape();
if ( uit.isEmpty() ){
throw uConfigError( "Invalid EOSMARKERS entry: " + line,
set_file );
@@ -634,8 +634,8 @@ namespace Tokenizer {
+ " (missing whitespace)",
set_file );
}
- UnicodeString open = UnicodeString( line, 0,splitpoint);
- UnicodeString close = UnicodeString( line, splitpoint+1);
+ icu::UnicodeString open = icu::UnicodeString( line, 0,splitpoint);
+ icu::UnicodeString close = icu::UnicodeString( line, splitpoint+1);
open = open.trim().unescape();
close = close.trim().unescape();
if ( open.isEmpty() || close.isEmpty() ){
@@ -702,7 +702,7 @@ namespace Tokenizer {
}
continue;
}
- UnicodeString name = folia::UTF8ToUnicode( nam );
+ icu::UnicodeString name = folia::UTF8ToUnicode( nam );
string rule = mr.substr( pos+1 );
if ( tokDebug > 5 ){
LOG << "SPLIT using: '" << split << "'" << endl;
@@ -712,11 +712,11 @@ namespace Tokenizer {
for ( auto& str : parts ){
str = TiCC::trim( str );
}
- vector<UnicodeString> new_parts;
- vector<UnicodeString> undef_parts;
+ vector<icu::UnicodeString> new_parts;
+ vector<icu::UnicodeString> undef_parts;
bool skip_rule = false;
for ( const auto& part : parts ){
- UnicodeString meta = folia::UTF8ToUnicode( part );
+ icu::UnicodeString meta = folia::UTF8ToUnicode( part );
ConfigMode mode = getMode( "[" + meta + "]" );
switch ( mode ){
case ORDINALS:
--- ucto-0.9.6.orig/src/tokenize.cxx
+++ ucto-0.9.6/src/tokenize.cxx
@@ -88,11 +88,11 @@ namespace Tokenizer {
};
- UnicodeString convert( const string& line,
+ icu::UnicodeString convert( const string& line,
const string& inputEncoding ){
- UnicodeString result;
+ icu::UnicodeString result;
try {
- result = UnicodeString( line.c_str(),
+ result = icu::UnicodeString( line.c_str(),
line.length(),
inputEncoding.c_str() );
}
@@ -108,17 +108,17 @@ namespace Tokenizer {
return result;
}
- const UnicodeString type_space = "SPACE";
- const UnicodeString type_currency = "CURRENCY";
- const UnicodeString type_emoticon = "EMOTICON";
- const UnicodeString type_word = "WORD";
- const UnicodeString type_symbol = "SYMBOL";
- const UnicodeString type_punctuation = "PUNCTUATION";
- const UnicodeString type_number = "NUMBER";
- const UnicodeString type_unknown = "UNKNOWN";
+ const icu::UnicodeString type_space = "SPACE";
+ const icu::UnicodeString type_currency = "CURRENCY";
+ const icu::UnicodeString type_emoticon = "EMOTICON";
+ const icu::UnicodeString type_word = "WORD";
+ const icu::UnicodeString type_symbol = "SYMBOL";
+ const icu::UnicodeString type_punctuation = "PUNCTUATION";
+ const icu::UnicodeString type_number = "NUMBER";
+ const icu::UnicodeString type_unknown = "UNKNOWN";
- Token::Token( const UnicodeString& _type,
- const UnicodeString& _s,
+ Token::Token( const icu::UnicodeString& _type,
+ const icu::UnicodeString& _s,
TokenRole _role, const string& _lc ):
type(_type), us(_s), role(_role), lc(_lc) {}
@@ -226,7 +226,7 @@ namespace Tokenizer {
<< "'" << endl;
}
stripCR( line );
- UnicodeString input_line;
+ icu::UnicodeString input_line;
if ( line.size() > 0 && line[0] == 0 ){
// when processing UTF16LE, '0' bytes show up at pos 0
// we discard them, not for UTF16BE!
@@ -273,7 +273,7 @@ namespace Tokenizer {
LOG << "use textCat to guess language from: "
<< input_line << endl;
}
- UnicodeString temp = input_line;
+ icu::UnicodeString temp = input_line;
temp.toLower();
string lan = tc->get_language( folia::UnicodeToUTF8(temp) );
if ( settings.find( lan ) != settings.end() ){
@@ -531,7 +531,7 @@ namespace Tokenizer {
if ( root->hastext( outputclass ) ){
return;
}
- UnicodeString utxt = root->text( outputclass, false, false );
+ icu::UnicodeString utxt = root->text( outputclass, false, false );
// cerr << "untok: '" << utxt << "'" << endl;
// UnicodeString txt = root->text( outputclass, true );
// cerr << " tok: '" << txt << "'" << endl;
@@ -664,7 +664,7 @@ namespace Tokenizer {
if ( tokDebug > 0 ){
cerr << "tokenize sentence element: " << element->id() << endl;
}
- UnicodeString line = element->stricttext( inputclass );
+ icu::UnicodeString line = element->stricttext( inputclass );
if ( line.isEmpty() ){
// so no usefull text in this element. skip it
return;
@@ -837,7 +837,7 @@ namespace Tokenizer {
args["space"]= "no";
}
folia::FoliaElement *w = new folia::Word( args, root->doc() );
- UnicodeString out = token.us;
+ icu::UnicodeString out = token.us;
if (lowercase) {
out.toLower();
}
@@ -898,7 +898,7 @@ namespace Tokenizer {
OUT << endl << endl;
}
}
- UnicodeString s = token.us;
+ icu::UnicodeString s = token.us;
if (lowercase) {
s = s.toLower();
}
@@ -1118,12 +1118,12 @@ namespace Tokenizer {
quote = true;
}
else {
- UnicodeString opening = quotes.lookupOpen( c );
+ icu::UnicodeString opening = quotes.lookupOpen( c );
if (!opening.isEmpty()) {
quote = true;
}
else {
- UnicodeString closing = quotes.lookupClose( c );
+ icu::UnicodeString closing = quotes.lookupClose( c );
if (!closing.isEmpty()) {
quote = true;
}
@@ -1151,7 +1151,7 @@ namespace Tokenizer {
}
bool TokenizerClass::resolveQuote( int endindex,
- const UnicodeString& open,
+ const icu::UnicodeString& open,
Quoting& quotes ) {
//resolve a quote
int stackindex = -1;
@@ -1250,7 +1250,7 @@ namespace Tokenizer {
}
bool TokenizerClass::detectEos( size_t i,
- const UnicodeString& eosmarkers,
+ const icu::UnicodeString& eosmarkers,
const Quoting& quotes ) const {
bool is_eos = false;
UChar32 c = tokens[i].us.char32At(0);
@@ -1288,7 +1288,7 @@ namespace Tokenizer {
Quoting& quotes ) {
UChar32 c = tokens[i].us.char32At(0);
//Detect Quotation marks
- if ((c == '"') || ( UnicodeString(c) == """) ) {
+ if ((c == '"') || ( icu::UnicodeString(c) == """) ) {
if (tokDebug > 1 ){
LOG << "[detectQuoteBounds] Standard double-quote (ambiguous) found @i="<< i << endl;
}
@@ -1311,7 +1311,7 @@ namespace Tokenizer {
}
}
else {
- UnicodeString close = quotes.lookupOpen( c );
+ icu::UnicodeString close = quotes.lookupOpen( c );
if ( !close.isEmpty() ){ // we have a opening quote
if ( tokDebug > 1 ) {
LOG << "[detectQuoteBounds] Opening quote found @i="<< i << ", pushing to stack for resolution later..." << endl;
@@ -1319,7 +1319,7 @@ namespace Tokenizer {
quotes.push( i, c ); // remember it
}
else {
- UnicodeString open = quotes.lookupClose( c );
+ icu::UnicodeString open = quotes.lookupClose( c );
if ( !open.isEmpty() ) { // we have a closeing quote
if (tokDebug > 1 ) {
LOG << "[detectQuoteBounds] Closing quote found @i="<< i << ", attempting to resolve..." << endl;
@@ -1484,17 +1484,17 @@ namespace Tokenizer {
void TokenizerClass::passthruLine( const string& s, bool& bos ) {
// string wrapper
- UnicodeString us = convert( s, inputEncoding );;
+ icu::UnicodeString us = convert( s, inputEncoding );;
passthruLine( us, bos );
}
- void TokenizerClass::passthruLine( const UnicodeString& input, bool& bos ) {
+ void TokenizerClass::passthruLine( const icu::UnicodeString& input, bool& bos ) {
if (tokDebug) {
LOG << "[passthruLine] input: line=[" << input << "]" << endl;
}
bool alpha = false, num = false, punct = false;
- UnicodeString word;
- StringCharacterIterator sit(input);
+ icu::UnicodeString word;
+ icu::StringCharacterIterator sit(input);
while ( sit.hasNext() ){
UChar32 c = sit.current32();
if ( u_isspace(c)) {
@@ -1514,7 +1514,7 @@ namespace Tokenizer {
bos = true;
}
else {
- UnicodeString type;
+ icu::UnicodeString type;
if (alpha && !num && !punct) {
type = type_word;
}
@@ -1577,7 +1577,7 @@ namespace Tokenizer {
tokens.back().role |= ENDOFSENTENCE;
}
else {
- UnicodeString type;
+ icu::UnicodeString type;
if (alpha && !num && !punct) {
type = type_word;
}
@@ -1653,7 +1653,7 @@ namespace Tokenizer {
// string wrapper
int TokenizerClass::tokenizeLine( const string& s,
const string& lang ){
- UnicodeString uinputstring = convert( s, inputEncoding );
+ icu::UnicodeString uinputstring = convert( s, inputEncoding );
return tokenizeLine( uinputstring, lang );
}
@@ -1673,7 +1673,7 @@ namespace Tokenizer {
|| u_charType( c ) == U_OTHER_SYMBOL;
}
- const UnicodeString& detect_type( UChar32 c ){
+ const icu::UnicodeString& detect_type( UChar32 c ){
if ( u_isspace(c)) {
return type_space;
}
@@ -1768,7 +1768,7 @@ namespace Tokenizer {
}
}
- int TokenizerClass::tokenizeLine( const UnicodeString& originput,
+ int TokenizerClass::tokenizeLine( const icu::UnicodeString& originput,
const string& _lang ){
string lang = _lang;
if ( lang.empty() ){
@@ -1786,7 +1786,7 @@ namespace Tokenizer {
LOG << "[tokenizeLine] input: line=["
<< originput << "] (" << lang << ")" << endl;
}
- UnicodeString input = normalizer.normalize( originput );
+ icu::UnicodeString input = normalizer.normalize( originput );
if ( doFilter ){
input = settings[lang]->filter.filter( input );
}
@@ -1808,13 +1808,13 @@ namespace Tokenizer {
bool tokenizeword = false;
bool reset = false;
//iterate over all characters
- UnicodeString word;
- StringCharacterIterator sit(input);
+ icu::UnicodeString word;
+ icu::StringCharacterIterator sit(input);
long int i = 0;
while ( sit.hasNext() ){
UChar32 c = sit.current32();
if ( tokDebug > 8 ){
- UnicodeString s = c;
+ icu::UnicodeString s = c;
int8_t charT = u_charType( c );
LOG << "examine character: " << s << " type= "
<< toString( charT ) << endl;
@@ -1855,7 +1855,7 @@ namespace Tokenizer {
}
int eospos = tokens.size()-1;
if (expliciteosfound > 0) {
- UnicodeString realword;
+ icu::UnicodeString realword;
word.extract(0,expliciteosfound,realword);
if (tokDebug >= 2) {
LOG << "[tokenizeLine] Prefix before EOS: "
@@ -1865,7 +1865,7 @@ namespace Tokenizer {
eospos++;
}
if ( expliciteosfound + eosmark.length() < word.length() ){
- UnicodeString realword;
+ icu::UnicodeString realword;
word.extract( expliciteosfound+eosmark.length(),
word.length() - expliciteosfound - eosmark.length(),
realword );
@@ -1941,10 +1941,10 @@ namespace Tokenizer {
return numNewTokens;
}
- void TokenizerClass::tokenizeWord( const UnicodeString& input,
+ void TokenizerClass::tokenizeWord( const icu::UnicodeString& input,
bool space,
const string& lang,
- const UnicodeString& assigned_type ) {
+ const icu::UnicodeString& assigned_type ) {
bool recurse = !assigned_type.isEmpty();
int32_t inpLen = input.countChar32();
@@ -1977,7 +1977,7 @@ namespace Tokenizer {
if ( inpLen == 1) {
//single character, no need to process all rules, do some simpler (faster) detection
UChar32 c = input.char32At(0);
- UnicodeString type = detect_type( c );
+ icu::UnicodeString type = detect_type( c );
if ( type == type_space ){
return;
}
@@ -1993,7 +1993,7 @@ namespace Tokenizer {
}
}
else {
- UnicodeString word = input;
+ icu::UnicodeString word = input;
if ( norm_set.find( type ) != norm_set.end() ){
word = "{{" + type + "}}";
}
@@ -2010,10 +2010,10 @@ namespace Tokenizer {
if ( tokDebug >= 4){
LOG << "\tTESTING " << rule->id << endl;
}
- UnicodeString type = rule->id;
+ icu::UnicodeString type = rule->id;
//Find first matching rule
- UnicodeString pre, post;
- vector<UnicodeString> matches;
+ icu::UnicodeString pre, post;
+ vector<icu::UnicodeString> matches;
if ( rule->matchAll( input, pre, post, matches ) ){
a_rule_matched = true;
if ( tokDebug >= 4 ){
@@ -2083,7 +2083,7 @@ namespace Tokenizer {
if ( post.length() > 0 ) {
internal_space = false;
}
- UnicodeString word = matches[m];
+ icu::UnicodeString word = matches[m];
if ( norm_set.find( type ) != norm_set.end() ){
word = "{{" + type + "}}";
tokens.push_back( Token( type, word, internal_space ? NOROLE : NOSPACE, lang ) );
--- ucto-0.9.6.orig/src/unicode.cxx
+++ ucto-0.9.6/src/unicode.cxx
@@ -84,10 +84,10 @@ namespace Tokenizer {
return res;
}
- UnicodeString UnicodeNormalizer::normalize( const UnicodeString& us ){
- UnicodeString r;
+ icu::UnicodeString UnicodeNormalizer::normalize( const icu::UnicodeString& us ){
+ icu::UnicodeString r;
UErrorCode status=U_ZERO_ERROR;
- Normalizer::normalize( us, mode, 0, r, status );
+ icu::Normalizer::normalize( us, mode, 0, r, status );
if (U_FAILURE(status)){
throw std::invalid_argument("Normalizer");
}
@@ -101,18 +101,18 @@ namespace Tokenizer {
else {
auto it=q.the_map.cbegin();
while ( it != q.the_map.cend() ){
- os << folia::UnicodeToUTF8(UnicodeString(it->first)) << "\t" << it->second << endl;
+ os << folia::UnicodeToUTF8(icu::UnicodeString(it->first)) << "\t" << it->second << endl;
++it;
}
}
return os;
}
- UnicodeString UnicodeFilter::filter( const UnicodeString& s ){
+ icu::UnicodeString UnicodeFilter::filter( const icu::UnicodeString& s ){
if ( empty() )
return s;
else {
- UnicodeString result;
+ icu::UnicodeString result;
for ( int i=0; i < s.length(); ++i ){
auto it=the_map.find(s[i]);
if ( it != the_map.cend() )
@@ -125,16 +125,16 @@ namespace Tokenizer {
}
bool UnicodeFilter::add( const string& s ){
- UnicodeString line = folia::UTF8ToUnicode(s);
+ icu::UnicodeString line = folia::UTF8ToUnicode(s);
return add( line );
}
- bool UnicodeFilter::add( const UnicodeString& s ){
- UnicodeString line = s;
+ bool UnicodeFilter::add( const icu::UnicodeString& s ){
+ icu::UnicodeString line = s;
line.trim();
if ((line.length() > 0) && (line[0] != '#')) {
- UnicodeString open = "";
- UnicodeString close = "";
+ icu::UnicodeString open = "";
+ icu::UnicodeString close = "";
int splitpoint = line.indexOf(" ");
if ( splitpoint == -1 )
splitpoint = line.indexOf("\t");
@@ -142,8 +142,8 @@ namespace Tokenizer {
open = line;
}
else {
- open = UnicodeString( line, 0,splitpoint);
- close = UnicodeString( line, splitpoint+1);
+ open = icu::UnicodeString( line, 0,splitpoint);
+ close = icu::UnicodeString( line, splitpoint+1);
}
open = open.trim().unescape();
close = close.trim().unescape();
@@ -175,29 +175,29 @@ namespace Tokenizer {
class uConfigError: public std::invalid_argument {
public:
uConfigError( const string& s ): invalid_argument( "ucto: config file:" + s ){};
- uConfigError( const UnicodeString& us ): invalid_argument( "ucto: config file:" + folia::UnicodeToUTF8(us) ){};
+ uConfigError( const icu::UnicodeString& us ): invalid_argument( "ucto: config file:" + folia::UnicodeToUTF8(us) ){};
};
- UnicodeString UnicodeRegexMatcher::Pattern() const{
+ icu::UnicodeString UnicodeRegexMatcher::Pattern() const{
return pattern->pattern();
}
- UnicodeRegexMatcher::UnicodeRegexMatcher( const UnicodeString& pat,
- const UnicodeString& name ):
+ UnicodeRegexMatcher::UnicodeRegexMatcher( const icu::UnicodeString& pat,
+ const icu::UnicodeString& name ):
_name(name)
{
failString.clear();
matcher = NULL;
UErrorCode u_stat = U_ZERO_ERROR;
UParseError errorInfo;
- pattern = RegexPattern::compile( pat, 0, errorInfo, u_stat );
+ pattern = icu::RegexPattern::compile( pat, 0, errorInfo, u_stat );
if ( U_FAILURE(u_stat) ){
string spat = folia::UnicodeToUTF8(pat);
failString = folia::UnicodeToUTF8(_name);
if ( errorInfo.offset >0 ){
failString += " Invalid regular expression at position " + TiCC::toString( errorInfo.offset ) + "\n";
- UnicodeString pat1 = UnicodeString( pat, 0, errorInfo.offset -1 );
+ icu::UnicodeString pat1 = icu::UnicodeString( pat, 0, errorInfo.offset -1 );
failString += folia::UnicodeToUTF8(pat1) + " <== HERE\n";
}
else {
@@ -222,9 +222,9 @@ namespace Tokenizer {
//#define MATCH_DEBUG 1
- bool UnicodeRegexMatcher::match_all( const UnicodeString& line,
- UnicodeString& pre,
- UnicodeString& post ){
+ bool UnicodeRegexMatcher::match_all( const icu::UnicodeString& line,
+ icu::UnicodeString& pre,
+ icu::UnicodeString& post ){
UErrorCode u_stat = U_ZERO_ERROR;
pre = "";
post = "";
@@ -243,21 +243,21 @@ namespace Tokenizer {
#endif
if ( matcher->groupCount() == 0 ){
// case 1: a rule without capture groups matches
- UnicodeString us = matcher->group(0,u_stat) ;
+ icu::UnicodeString us = matcher->group(0,u_stat) ;
#ifdef MATCH_DEBUG
cerr << "case 1, result = " << us << endl;
#endif
results.push_back( us );
int start = matcher->start( 0, u_stat );
if ( start > 0 ){
- pre = UnicodeString( line, 0, start );
+ pre = icu::UnicodeString( line, 0, start );
#ifdef MATCH_DEBUG
cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl;
#endif
}
int end = matcher->end( 0, u_stat );
if ( end < line.length() ){
- post = UnicodeString( line, end );
+ post = icu::UnicodeString( line, end );
#ifdef MATCH_DEBUG
cerr << "found post " << folia::UnicodeToUTF8(post) << endl;
#endif
@@ -268,20 +268,20 @@ namespace Tokenizer {
// case 2: a rule with one capture group matches
int start = matcher->start( 1, u_stat );
if ( start >= 0 ){
- UnicodeString us = matcher->group(1,u_stat) ;
+ icu::UnicodeString us = matcher->group(1,u_stat) ;
#ifdef MATCH_DEBUG
cerr << "case 2a , result = " << us << endl;
#endif
results.push_back( us );
if ( start > 0 ){
- pre = UnicodeString( line, 0, start );
+ pre = icu::UnicodeString( line, 0, start );
#ifdef MATCH_DEBUG
cerr << "found pre " << pre << endl;
#endif
}
int end = matcher->end( 1, u_stat );
if ( end < line.length() ){
- post = UnicodeString( line, end );
+ post = icu::UnicodeString( line, end );
#ifdef MATCH_DEBUG
cerr << "found post " << post << endl;
#endif
@@ -289,21 +289,21 @@ namespace Tokenizer {
}
else {
// group 1 is empty, return group 0
- UnicodeString us = matcher->group(0,u_stat) ;
+ icu::UnicodeString us = matcher->group(0,u_stat) ;
#ifdef MATCH_DEBUG
cerr << "case 2b , result = " << us << endl;
#endif
results.push_back( us );
start = matcher->start( 0, u_stat );
if ( start > 0 ){
- pre = UnicodeString( line, 0, start );
+ pre = icu::UnicodeString( line, 0, start );
#ifdef MATCH_DEBUG
cerr << "found pre " << pre << endl;
#endif
}
int end = matcher->end( 0, u_stat );
if ( end < line.length() ){
- post = UnicodeString( line, end );
+ post = icu::UnicodeString( line, end );
#ifdef MATCH_DEBUG
cerr << "found post " << post << endl;
#endif
@@ -332,7 +332,7 @@ namespace Tokenizer {
else
break;
if ( start > end ){
- pre = UnicodeString( line, end, start );
+ pre = icu::UnicodeString( line, end, start );
#ifdef MATCH_DEBUG
cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl;
#endif
@@ -342,7 +342,7 @@ namespace Tokenizer {
cerr << "end = " << end << endl;
#endif
if (!U_FAILURE(u_stat)){
- results.push_back( UnicodeString( line, start, end - start ) );
+ results.push_back( icu::UnicodeString( line, start, end - start ) );
#ifdef MATCH_DEBUG
cerr << "added result " << folia::UnicodeToUTF8( results.back() ) << endl;
#endif
@@ -351,7 +351,7 @@ namespace Tokenizer {
break;
}
if ( end < line.length() ){
- post = UnicodeString( line, end );
+ post = icu::UnicodeString( line, end );
#ifdef MATCH_DEBUG
cerr << "found post " << folia::UnicodeToUTF8(post) << endl;
#endif
@@ -364,7 +364,7 @@ namespace Tokenizer {
return false;
}
- const UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{
+ const icu::UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{
if ( n < results.size() )
return results[n];
else
@@ -378,11 +378,11 @@ namespace Tokenizer {
return 0;
}
- int UnicodeRegexMatcher::split( const UnicodeString& us,
- vector<UnicodeString>& result ){
+ int UnicodeRegexMatcher::split( const icu::UnicodeString& us,
+ vector<icu::UnicodeString>& result ){
result.clear();
const int maxWords = 256;
- UnicodeString words[maxWords];
+ icu::UnicodeString words[maxWords];
UErrorCode status = U_ZERO_ERROR;
int numWords = matcher->split( us, words, maxWords, status );
for ( int i = 0; i < numWords; ++i )
--- End Message ---