Re: [PR] [feat](test) add tokenize ut test [doris]

via GitHub Thu, 12 Dec 2024 17:28:29 -0800


github-actions[bot] commented on code in PR #45374:
URL: https://github.com/apache/doris/pull/45374#discussion_r1883113878



##########
be/test/olap/itoken_extractor_test.cpp:
##########
@@ -92,4 +92,497 @@ TEST_F(TestITokenExtractor, ngram_like_extractor) {
     runNextInStringLike(ngram_extractor, {from_u8string(u8"\\_手机%")},
                         {from_u8string(u8"_手"), from_u8string(u8"手机")});
 }
+
+TEST_F(TestITokenExtractor, ngram_extractor_empty_input) {
+    // Test empty string input, expect no output
+    std::string statement = "";
+    std::vector<std::string> expect = {};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_single_char) {
+    // Only one character, less than n=2, should produce no tokens
+    std::string statement = "a";
+    std::vector<std::string> expect = {};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_ascii_characters) {
+    // Test token extraction for pure ASCII characters
+    std::string statement = "abcd";
+    // 2-gram tokens: "ab", "bc", "cd"
+    std::vector<std::string> expect = {"ab", "bc", "cd"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_emoji) {
+    // Test scenarios that include Emoji and other multi-byte UTF-8 characters
+    // Assume n=2. Here "👍" is an emoji (4 bytes), "测" is a Chinese character 
(3 bytes).
+    // String: "👍测A" (3 elements: 1 Emoji, 1 Chinese char, 1 ASCII)
+    // For two code points per token:
+    // First token: "👍测"
+    // Second token: "测A"
+    std::string statement = from_u8string(u8"👍测A");
+    std::vector<std::string> expect = {from_u8string(u8"👍测"), 
from_u8string(u8"测A")};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_n_greater_than_length) {
+    // When n=3 and the string length is only 2, no 3-character Ngram can be 
formed
+    std::string statement = "ab";
+    std::vector<std::string> expect = {};
+    NgramTokenExtractor ngram_extractor(3);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_chinese_only) {
+    // Test pure Chinese characters with multi-byte UTF-8 tokens
+    // String: "中国人" (3 Chinese chars, each 3 bytes)
+    // n=2, expected tokens: ["中国", "国人"]
+    std::string statement = from_u8string(u8"中国人");
+    std::vector<std::string> expect = {from_u8string(u8"中国"), 
from_u8string(u8"国人")};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_mixed_width_characters) {
+    // Mixed character widths: English (1 byte), Chinese (3 bytes), Emoji (4 
bytes)
+    // String: "A中👍B"
+    // Code points: 'A'(1), '中'(1), '👍'(1), 'B'(1) total 4 code points
+    // n=2 tokens: "A中", "中👍", "👍B"
+    std::string statement = from_u8string(u8"A中👍B");
+    std::vector<std::string> expect = {from_u8string(u8"A中"), 
from_u8string(u8"中👍"),
+                                       from_u8string(u8"👍B")};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_empty_input) {
+    // Test empty input for like extraction
+    std::string statement = "";
+    std::vector<std::string> expect = {};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_no_pattern) {
+    // No % or _, equivalent to extracting n-length sequences.
+    // String: "abc", n=2, theoretically extract "ab", "bc"
+    // next_in_string_like requires n code points to return a token.
+    // Without % or _, it should still extract normally.
+    std::string statement = "abc";
+    // n=2: extract "ab", then "bc"
+    std::vector<std::string> expect = {"ab", "bc"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_pattern1) {
+    // '%' wildcards split the pattern into the literal segments "abc", "def", "gh".
+    // String: "%abc%def%gh%", n=2
+    // next_in_string_like requires n code points to return a token,
+    // so 2-grams are expected from within each segment only, never across a '%'.
+    std::string statement = "%abc%def%gh%";
+    // n=2: extract "ab", "bc", then "de", "ef", then "gh"
+    std::vector<std::string> expect = {"ab", "bc", "de", "ef", "gh"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_patterns_only) {
+    // String has only '%' and '_', no normal chars to form a 2-gram
+    // "%__%", n=2: % and _ are not considered normal token characters
+    // Each encounter of % resets the token, so no tokens are generated
+    std::string statement = "%__%";
+    std::vector<std::string> expect = {};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_escaped_characters) {
+    // Test scenarios with escape characters: "\\%abc% \\_xyz_"
+    // Escaped '%' should be treated as a normal character, similarly for '_'
+    // Suppose n=2, for "\\%abc%":
+    // Initially encounter '\\%' => escaped '%', include it in token: "%a"
+    // Then 'a'(1 byte) 'b'(1 byte) form "ab", 'c'(1 byte) continues...
+    // A bit complex example, mainly to demonstrate properly handling escaped 
chars.
+    std::string statement = from_u8string(u8"\\%手机% \\_人_");
+    // Analysis (n=2):
+    // "\\%" -> escaped '%', counted as an ordinary character, token gets "%"
+    // then "手"(1 code point): 2 code points are formed, giving "%手"
+    // The window slides by one code point: "手" + "机" gives "手机"
+    // Next is an unescaped '%', which resets the token.
+    // Next segment: " \\_人_" (the leading space is an ordinary character)
+    // ' ' + "\\_" (escaped '_') gives " _"
+    // escaped '_' + '人'(1 code point) gives "_人"; the trailing unescaped '_' then
+    // resets the token.
+    // Final result: {"%手", "手机", " _", "_人"}
+    // Note: Based on logic, pattern chars % and _ reset the token. After a token is
+    // output, encountering % or _ resets the token to empty, not affecting previously
+    // output tokens.
+    std::vector<std::string> expect = {"%手", "手机", " _", "_人"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_complex_pattern) {
+    // Complex scenario: "abc%中_\\%国%d"
+    // n=2 analysis:
+    // Start from the beginning: 'a'(1 code point), 'b'(1 code point) => "ab" output
+    // Then 'b' + 'c' => "bc" output; at the following '%' reset token and move forward
+    // Next: "中"(1 code point), '_' is pattern reset
+    // Then "\\%" => '%'(1 code point), '国'(1 code point) => "%国" output
+    // Encounter '%', reset token
+    // Finally 'd' alone is not enough to form 2 code points, no output
+    std::string statement = from_u8string(u8"abc%中_\\%国%d");
+    std::vector<std::string> expect = {"ab", "bc", "%国"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_different_n) {
+    // Test different n values
+    // String: "abcd"
+    // n=3: extract "abc", "bcd"
+    std::string statement = "abcd";
+    std::vector<std::string> expect = {"abc", "bcd"};
+    NgramTokenExtractor ngram_extractor(3);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+std::string get_repetition_info(const std::string& text, size_t n) {
+    NgramTokenExtractor ngram_extractor(n);
+    std::vector<std::string> tokens;
+
+    {
+        size_t pos = 0;
+        size_t token_start = 0;
+        size_t token_length = 0;
+        while (ngram_extractor.next_in_string(text.c_str(), text.size(), &pos, 
&token_start,
+                                              &token_length)) {
+            tokens.push_back(text.substr(token_start, token_length));
+        }
+    }
+
+    std::unordered_map<std::string, int> token_count;
+    for (auto& t : tokens) {
+        token_count[t]++;
+    }
+
+    int total_tokens = static_cast<int>(tokens.size());
+    int repeated_tokens = 0;
+    for (auto& kv : token_count) {
+        if (kv.second > 1) {
+            repeated_tokens += kv.second;
+        }
+    }
+
+    double repetition_rate = 0.0;
+    if (total_tokens > 0) {
+        repetition_rate = static_cast<double>(repeated_tokens) / total_tokens;
+    }
+
+    std::ostringstream oss;
+    oss << "Total tokens: " << total_tokens << "\n"
+        << "Repeated tokens: " << repeated_tokens << "\n"
+        << "Repetition rate: " << repetition_rate << "\n";
+
+    return oss.str();
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_repetition_rate_matchine_text) {
+    std::string statement =
+            "Exception=System.CannotUnloadAppDomain;\n"
+            "HResult=0x00007486;\n"
+            "Message=exception happened;\n"

Review Comment:
   warning: function 'TEST_F' exceeds recommended size/complexity thresholds 
[readability-function-size]
   ```cpp
   .str();
              ^
   ```
   <details>
   <summary>Additional context</summary>
   
   **be/test/olap/itoken_extractor_test.cpp:299:** 109 lines including 
whitespace and comments (threshold 80)
   ```cpp
   .str();
              ^
   ```
   
   </details>
   



##########
be/test/olap/itoken_extractor_test.cpp:
##########
@@ -92,4 +92,497 @@
     runNextInStringLike(ngram_extractor, {from_u8string(u8"\\_手机%")},
                         {from_u8string(u8"_手"), from_u8string(u8"手机")});
 }
+
+TEST_F(TestITokenExtractor, ngram_extractor_empty_input) {
+    // Test empty string input, expect no output
+    std::string statement = "";
+    std::vector<std::string> expect = {};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_single_char) {
+    // Only one character, less than n=2, should produce no tokens
+    std::string statement = "a";
+    std::vector<std::string> expect = {};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_ascii_characters) {
+    // Test token extraction for pure ASCII characters
+    std::string statement = "abcd";
+    // 2-gram tokens: "ab", "bc", "cd"
+    std::vector<std::string> expect = {"ab", "bc", "cd"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_emoji) {
+    // Test scenarios that include Emoji and other multi-byte UTF-8 characters
+    // Assume n=2. Here "👍" is an emoji (4 bytes), "测" is a Chinese character 
(3 bytes).
+    // String: "👍测A" (3 elements: 1 Emoji, 1 Chinese char, 1 ASCII)
+    // For two code points per token:
+    // First token: "👍测"
+    // Second token: "测A"
+    std::string statement = from_u8string(u8"👍测A");
+    std::vector<std::string> expect = {from_u8string(u8"👍测"), 
from_u8string(u8"测A")};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_n_greater_than_length) {
+    // When n=3 and the string length is only 2, no 3-character Ngram can be 
formed
+    std::string statement = "ab";
+    std::vector<std::string> expect = {};
+    NgramTokenExtractor ngram_extractor(3);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_chinese_only) {
+    // Test pure Chinese characters with multi-byte UTF-8 tokens
+    // String: "中国人" (3 Chinese chars, each 3 bytes)
+    // n=2, expected tokens: ["中国", "国人"]
+    std::string statement = from_u8string(u8"中国人");
+    std::vector<std::string> expect = {from_u8string(u8"中国"), 
from_u8string(u8"国人")};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_mixed_width_characters) {
+    // Mixed character widths: English (1 byte), Chinese (3 bytes), Emoji (4 
bytes)
+    // String: "A中👍B"
+    // Code points: 'A'(1), '中'(1), '👍'(1), 'B'(1) total 4 code points
+    // n=2 tokens: "A中", "中👍", "👍B"
+    std::string statement = from_u8string(u8"A中👍B");
+    std::vector<std::string> expect = {from_u8string(u8"A中"), 
from_u8string(u8"中👍"),
+                                       from_u8string(u8"👍B")};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_empty_input) {
+    // Test empty input for like extraction
+    std::string statement = "";
+    std::vector<std::string> expect = {};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_no_pattern) {
+    // No % or _, equivalent to extracting n-length sequences.
+    // String: "abc", n=2, theoretically extract "ab", "bc"
+    // next_in_string_like requires n code points to return a token.
+    // Without % or _, it should still extract normally.
+    std::string statement = "abc";
+    // n=2: extract "ab", then "bc"
+    std::vector<std::string> expect = {"ab", "bc"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_pattern1) {
+    // '%' wildcards split the pattern into the literal segments "abc", "def", "gh".
+    // String: "%abc%def%gh%", n=2
+    // next_in_string_like requires n code points to return a token,
+    // so 2-grams are expected from within each segment only, never across a '%'.
+    std::string statement = "%abc%def%gh%";
+    // n=2: extract "ab", "bc", then "de", "ef", then "gh"
+    std::vector<std::string> expect = {"ab", "bc", "de", "ef", "gh"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_patterns_only) {
+    // String has only '%' and '_', no normal chars to form a 2-gram
+    // "%__%", n=2: % and _ are not considered normal token characters
+    // Each encounter of % resets the token, so no tokens are generated
+    std::string statement = "%__%";
+    std::vector<std::string> expect = {};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_escaped_characters) {
+    // Test scenarios with escape characters: "\\%abc% \\_xyz_"
+    // Escaped '%' should be treated as a normal character, similarly for '_'
+    // Suppose n=2, for "\\%abc%":
+    // Initially encounter '\\%' => escaped '%', include it in token: "%a"
+    // Then 'a'(1 byte) 'b'(1 byte) form "ab", 'c'(1 byte) continues...
+    // A bit complex example, mainly to demonstrate properly handling escaped 
chars.
+    std::string statement = from_u8string(u8"\\%手机% \\_人_");
+    // Analysis (n=2):
+    // "\\%" -> escaped '%', counted as an ordinary character, token gets "%"
+    // then "手"(1 code point): 2 code points are formed, giving "%手"
+    // The window slides by one code point: "手" + "机" gives "手机"
+    // Next is an unescaped '%', which resets the token.
+    // Next segment: " \\_人_" (the leading space is an ordinary character)
+    // ' ' + "\\_" (escaped '_') gives " _"
+    // escaped '_' + '人'(1 code point) gives "_人"; the trailing unescaped '_' then
+    // resets the token.
+    // Final result: {"%手", "手机", " _", "_人"}
+    // Note: Based on logic, pattern chars % and _ reset the token. After a token is
+    // output, encountering % or _ resets the token to empty, not affecting previously
+    // output tokens.
+    std::vector<std::string> expect = {"%手", "手机", " _", "_人"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_like_extractor_complex_pattern) {
+    // Complex scenario: "abc%中_\\%国%d"
+    // n=2 analysis:
+    // Start from the beginning: 'a'(1 code point), 'b'(1 code point) => "ab" output
+    // Then 'b' + 'c' => "bc" output; at the following '%' reset token and move forward
+    // Next: "中"(1 code point), '_' is pattern reset
+    // Then "\\%" => '%'(1 code point), '国'(1 code point) => "%国" output
+    // Encounter '%', reset token
+    // Finally 'd' alone is not enough to form 2 code points, no output
+    std::string statement = from_u8string(u8"abc%中_\\%国%d");
+    std::vector<std::string> expect = {"ab", "bc", "%国"};
+    NgramTokenExtractor ngram_extractor(2);
+    runNextInStringLike(ngram_extractor, statement, expect);
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_different_n) {
+    // Test different n values
+    // String: "abcd"
+    // n=3: extract "abc", "bcd"
+    std::string statement = "abcd";
+    std::vector<std::string> expect = {"abc", "bcd"};
+    NgramTokenExtractor ngram_extractor(3);
+    runNextInString(ngram_extractor, statement, expect);
+}
+
+std::string get_repetition_info(const std::string& text, size_t n) {
+    NgramTokenExtractor ngram_extractor(n);
+    std::vector<std::string> tokens;
+
+    {
+        size_t pos = 0;
+        size_t token_start = 0;
+        size_t token_length = 0;
+        while (ngram_extractor.next_in_string(text.c_str(), text.size(), &pos, 
&token_start,
+                                              &token_length)) {
+            tokens.push_back(text.substr(token_start, token_length));
+        }
+    }
+
+    std::unordered_map<std::string, int> token_count;
+    for (auto& t : tokens) {
+        token_count[t]++;
+    }
+
+    int total_tokens = static_cast<int>(tokens.size());
+    int repeated_tokens = 0;
+    for (auto& kv : token_count) {
+        if (kv.second > 1) {
+            repeated_tokens += kv.second;
+        }
+    }
+
+    double repetition_rate = 0.0;
+    if (total_tokens > 0) {
+        repetition_rate = static_cast<double>(repeated_tokens) / total_tokens;
+    }
+
+    std::ostringstream oss;
+    oss << "Total tokens: " << total_tokens << "\n"
+        << "Repeated tokens: " << repeated_tokens << "\n"
+        << "Repetition rate: " << repetition_rate << "\n";
+
+    return oss.str();
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_repetition_rate_matchine_text) {
+    std::string statement =
+            "Exception=System.CannotUnloadAppDomain;\n"
+            "HResult=0x00007486;\n"
+            "Message=exception happened;\n"
+            "Source=BenchmarkLogGenerator;\n"
+            "StackTrace:\n"
+            " at BenchmarkLogGenerator.Generator.Run(Int32 sizeFactor) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 84\n"
+            " at 
BenchmarkLogGenerator.Generator.<>c__DisplayClass26_0.<RunInBackground>b__0() 
in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 74\n"
+            " at System.Threading.ThreadHelper.ThreadStart_Context(Object 
state)\n"
+            " at 
System.Threading.ExecutionContext.RunInternal(ExecutionContext 
executionContext)\n"
+            " at BenchmarkLogGenerator.Flows.BootFlow.GetLevel(Int64 v) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
85\n"
+            " at 
BenchmarkLogGenerator.Flows.BootFlow.<IngestionSession>d__1.MoveNext() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
47\n"
+            " at BenchmarkLogGenerator.Scheduler.Flow.NextStep() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 74\n"
+            " at 
BenchmarkLogGenerator.Scheduler.Step.EnqueueNextStep(Scheduler scheduler) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 112\n"
+            " at 
BenchmarkLogGenerator.Scheduler.FlowDelayStep.Execute(Scheduler scheduler) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 137\n"
+            " at BenchmarkLogGenerator.Scheduler.Run() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 28\n"
+            " at BenchmarkLogGenerator.Generator.Run(Int32 sizeFactor) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 84\n"
+            " at 
BenchmarkLogGenerator.Generator.<>c__DisplayClass26_0.<RunInBackground>b__0() 
in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 74\n"
+            " at System.Threading.ThreadHelper.ThreadStart_Context(Object 
state)\n"
+            " at 
System.Threading.ExecutionContext.RunInternal(ExecutionContext 
executionContext)\n"
+            " at BenchmarkLogGenerator.Flows.BootFlow.GetLevel(Int64 v) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
85\n"
+            " at 
BenchmarkLogGenerator.Flows.BootFlow.<IngestionSession>d__1.MoveNext() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
47\n"
+            " at BenchmarkLogGenerator.Scheduler.Flow.NextStep() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 74\n"
+            " at 
BenchmarkLogGenerator.Scheduler.Step.EnqueueNextStep(Scheduler scheduler) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 112\n"
+            " at 
BenchmarkLogGenerator.Scheduler.FlowDelayStep.Execute(Scheduler scheduler) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 137\n"
+            " at BenchmarkLogGenerator.Scheduler.Run() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 28\n"
+            " at BenchmarkLogGenerator.Generator.Run(Int32 sizeFactor) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 84\n"
+            " at 
BenchmarkLogGenerator.Generator.<>c__DisplayClass26_0.<RunInBackground>b__0() 
in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 74\n"
+            " at System.Threading.ThreadHelper.ThreadStart_Context(Object 
state)\n"
+            " at 
System.Threading.ExecutionContext.RunInternal(ExecutionContext 
executionContext)\n"
+            " at BenchmarkLogGenerator.Flows.BootFlow.GetLevel(Int64 v) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
85\n"
+            " at 
BenchmarkLogGenerator.Flows.BootFlow.<IngestionSession>d__1.MoveNext() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
47\n"
+            " at BenchmarkLogGenerator.Scheduler.Flow.NextStep() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 74\n"
+            " at 
BenchmarkLogGenerator.Scheduler.Step.EnqueueNextStep(Scheduler scheduler) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 112\n"
+            " at 
BenchmarkLogGenerator.Scheduler.FlowDelayStep.Execute(Scheduler scheduler) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 137\n"
+            " at BenchmarkLogGenerator.Scheduler.Run() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 28\n"
+            " at BenchmarkLogGenerator.Generator.Run(Int32 sizeFactor) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 84\n"
+            " at 
BenchmarkLogGenerator.Generator.<>c__DisplayClass26_0.<RunInBackground>b__0() 
in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 74\n"
+            " at System.Threading.ThreadHelper.ThreadStart_Context(Object 
state)\n"
+            " at 
System.Threading.ExecutionContext.RunInternal(ExecutionContext 
executionContext)\n"
+            " at BenchmarkLogGenerator.Flows.BootFlow.GetLevel(Int64 v) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
85\n"
+            " at 
BenchmarkLogGenerator.Flows.BootFlow.<IngestionSession>d__1.MoveNext() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
47\n"
+            " at BenchmarkLogGenerator.Scheduler.Flow.NextStep() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 74\n"
+            " at 
BenchmarkLogGenerator.Scheduler.Step.EnqueueNextStep(Scheduler scheduler) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 112\n"
+            " at 
BenchmarkLogGenerator.Scheduler.FlowDelayStep.Execute(Scheduler scheduler) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 137\n"
+            " at BenchmarkLogGenerator.Scheduler.Run() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 28\n"
+            " at BenchmarkLogGenerator.Generator.Run(Int32 sizeFactor) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 84\n"
+            " at 
BenchmarkLogGenerator.Generator.<>c__DisplayClass26_0.<RunInBackground>b__0() 
in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 74\n"
+            " at System.Threading.ThreadHelper.ThreadStart_Context(Object 
state)\n"
+            " at 
System.Threading.ExecutionContext.RunInternal(ExecutionContext 
executionContext)\n"
+            " at BenchmarkLogGenerator.Flows.BootFlow.GetLevel(Int64 v) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
85\n"
+            " at 
BenchmarkLogGenerator.Flows.BootFlow.<IngestionSession>d__1.MoveNext() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
47\n"
+            " at BenchmarkLogGenerator.Scheduler.Flow.NextStep() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 74\n"
+            " at 
BenchmarkLogGenerator.Scheduler.Step.EnqueueNextStep(Scheduler scheduler) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 112\n"
+            " at 
BenchmarkLogGenerator.Scheduler.FlowDelayStep.Execute(Scheduler scheduler) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 137\n"
+            " at BenchmarkLogGenerator.Scheduler.Run() in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Scheduler.cs:line 28\n"
+            " at BenchmarkLogGenerator.Generator.Run(Int32 sizeFactor) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 84\n"
+            " at 
BenchmarkLogGenerator.Generator.<>c__DisplayClass26_0.<RunInBackground>b__0() 
in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Generator.cs:line 74\n"
+            " at System.Threading.ThreadHelper.ThreadStart_Context(Object 
state)\n"
+            " at 
System.Threading.ExecutionContext.RunInternal(ExecutionContext 
executionContext)\n"
+            " at BenchmarkLogGenerator.Flows.BootFlow.GetLevel(Int64 v) in "
+            "C:\\Src\\Tools\\BenchmarkLogGenerator\\Flows\\BootFlow.cs:line 
85";
+    size_t n = 5;
+    std::string info = get_repetition_info(statement, n);
+
+    std::cout << info << std::endl;
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_repetition_rate_short_text) {
+    std::string statement =
+            "I bought these leggings for my daughter @ Christmas along with 
several other "
+            "leggings.  She liked these leggings the best since they were 
lined and are very warm. "
+            " She is 5'3&#34; and 115 lbs. and they fit her very 
well/comfortable.  The only thing "
+            "I disliked about them is that the pattern is not uniform on both 
legs as it gets to "
+            "your upper thigh area.";
+    size_t n = 5;
+    std::string info = get_repetition_info(statement, n);
+
+    std::cout << info << std::endl;
+}
+
+TEST_F(TestITokenExtractor, ngram_extractor_repetition_rate_medium_text) {
+    std::string statement =
+            "Loving the fabulous and exquisite women's wear for plus size 
women, because of how "
+            "this sweater makes you feel good about yourself, and speaks to 
her heart with a "

Review Comment:
   warning: function 'TEST_F' exceeds recommended size/complexity thresholds 
[readability-function-size]
   ```cpp
   dl;
          ^
   ```
   <details>
   <summary>Additional context</summary>
   
   **be/test/olap/itoken_extractor_test.cpp:422:** 167 lines including 
whitespace and comments (threshold 80)
   ```cpp
   dl;
          ^
   ```
   
   </details>
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Re: [PR] [feat](test) add tokenize ut test [doris]

Reply via email to