This is an automated email from the ASF dual-hosted git repository.

xbli pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 1e0f87053c add logs to debug why crc values are different upon same input data and found Text Index data is not deterministic (#14188)
1e0f87053c is described below

commit 1e0f87053c7bee2316223ec4152c80341c291fb1
Author: Xiaobing <61892277+klsi...@users.noreply.github.com>
AuthorDate: Tue Oct 8 14:35:41 2024 -0700

    add logs to debug why crc values are different upon same input data and found Text Index data is not deterministic (#14188)
---
 .../org/apache/pinot/core/util/CrcUtilsTest.java   | 101 +++++++++++++++++++--
 .../apache/pinot/segment/local/utils/CrcUtils.java |   3 +
 2 files changed, 98 insertions(+), 6 deletions(-)

diff --git 
a/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java 
b/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
index 120f84e07e..494d4f518f 100644
--- a/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java
@@ -19,16 +19,24 @@
 package org.apache.pinot.core.util;
 
 import java.io.File;
+import java.io.IOException;
 import java.net.URL;
 import java.util.concurrent.TimeUnit;
 import org.apache.commons.io.FileUtils;
 import org.apache.pinot.segment.local.segment.creator.SegmentTestUtils;
 import 
org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl;
 import 
org.apache.pinot.segment.local.segment.index.converter.SegmentV1V2ToV3FormatConverter;
+import 
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
 import org.apache.pinot.segment.local.utils.CrcUtils;
 import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig;
 import org.apache.pinot.segment.spi.creator.SegmentIndexCreationDriver;
+import org.apache.pinot.segment.spi.index.FieldIndexConfigs;
+import org.apache.pinot.segment.spi.index.FstIndexConfig;
+import org.apache.pinot.segment.spi.index.StandardIndexes;
+import org.apache.pinot.segment.spi.index.TextIndexConfig;
+import org.apache.pinot.spi.config.table.FSTType;
 import org.apache.pinot.util.TestUtils;
+import org.testng.annotations.BeforeMethod;
 import org.testng.annotations.Test;
 
 import static org.testng.Assert.assertEquals;
@@ -38,30 +46,111 @@ import static org.testng.Assert.assertNotNull;
 public class CrcUtilsTest {
   private static final File INDEX_DIR = new File(FileUtils.getTempDirectory(), 
"CrcUtilsTest");
   private static final String AVRO_DATA = "data/test_data-mv.avro";
-  private static final long EXPECTED_V1_CRC = 2708456273L;
-  private static final long EXPECTED_V3_CRC = 2796149869L;
+
+  @BeforeMethod
+  public void setup()
+      throws IOException {
+    FileUtils.deleteDirectory(INDEX_DIR);
+  }
+
+  @BeforeMethod
+  public void tearDown()
+      throws IOException {
+    FileUtils.deleteDirectory(INDEX_DIR);
+  }
 
   @Test
   public void testCrc()
       throws Exception {
-    FileUtils.deleteDirectory(INDEX_DIR);
+    URL resource = getClass().getClassLoader().getResource(AVRO_DATA);
+    assertNotNull(resource);
+    String filePath = TestUtils.getFileFromResourceUrl(resource);
+    SegmentGeneratorConfig config =
+        SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new 
File(filePath), INDEX_DIR, "daysSinceEpoch",
+            TimeUnit.DAYS, "testTable");
+    SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
+    driver.init(config);
+    driver.build();
+
+    File indexDir = driver.getOutputDirectory();
+    assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 
2708456273L);
 
+    new SegmentV1V2ToV3FormatConverter().convert(indexDir);
+    assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 
2796149869L);
+  }
+
+  @Test
+  public void testCrcWithNativeFstIndex()
+      throws Exception {
     URL resource = getClass().getClassLoader().getResource(AVRO_DATA);
     assertNotNull(resource);
     String filePath = TestUtils.getFileFromResourceUrl(resource);
     SegmentGeneratorConfig config =
         SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new 
File(filePath), INDEX_DIR, "daysSinceEpoch",
             TimeUnit.DAYS, "testTable");
+    FstIndexConfig fstIndexConfig = new FstIndexConfig(FSTType.NATIVE);
+    config.setIndexOn(StandardIndexes.fst(), fstIndexConfig, "column5");
     SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
     driver.init(config);
     driver.build();
 
     File indexDir = driver.getOutputDirectory();
-    assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 
EXPECTED_V1_CRC);
+    assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 
3358657641L);
 
     new SegmentV1V2ToV3FormatConverter().convert(indexDir);
-    assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 
EXPECTED_V3_CRC);
+    assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 
961102604L);
+  }
 
-    FileUtils.deleteDirectory(INDEX_DIR);
+  @Test
+  public void testCrcWithLuceneFstIndex()
+      throws Exception {
+    URL resource = getClass().getClassLoader().getResource(AVRO_DATA);
+    assertNotNull(resource);
+    String filePath = TestUtils.getFileFromResourceUrl(resource);
+    SegmentGeneratorConfig config =
+        SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new 
File(filePath), INDEX_DIR, "daysSinceEpoch",
+            TimeUnit.DAYS, "testTable");
+    FstIndexConfig fstIndexConfig = new FstIndexConfig(FSTType.LUCENE);
+    config.setIndexOn(StandardIndexes.fst(), fstIndexConfig, "column5");
+    SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
+    driver.init(config);
+    driver.build();
+
+    File indexDir = driver.getOutputDirectory();
+    assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 
3294819300L);
+
+    new SegmentV1V2ToV3FormatConverter().convert(indexDir);
+    assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 
2552900261L);
+  }
+
+  // @Test
+  public void testCrcWithLuceneTextIndex()
+      throws Exception {
+    URL resource = getClass().getClassLoader().getResource(AVRO_DATA);
+    assertNotNull(resource);
+    String filePath = TestUtils.getFileFromResourceUrl(resource);
+    SegmentGeneratorConfig config =
+        SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new 
File(filePath), INDEX_DIR, "daysSinceEpoch",
+            TimeUnit.DAYS, "testTable");
+    addTextIndex(config, "column5");
+    SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
+    driver.init(config);
+    driver.build();
+
+    // Lucene text index data is not deterministic, thus leading to different segment crc across each test runs.
+    // When using text index in RealTime table, different crc values can cause servers to have to download segments
+    // from deep store to make segment replicas in sync.
+    File indexDir = driver.getOutputDirectory();
+    System.out.println(CrcUtils.forAllFilesInFolder(indexDir).computeCrc());
+
+    new SegmentV1V2ToV3FormatConverter().convert(indexDir);
+    System.out.println(CrcUtils.forAllFilesInFolder(indexDir).computeCrc());
+  }
+
+  private void addTextIndex(SegmentGeneratorConfig config, String colName) {
+    FieldIndexConfigs fieldIndexConfigs = 
config.getIndexConfigsByColName().get(colName);
+    TextIndexConfig textConfig = 
fieldIndexConfigs.getConfig(StandardIndexes.text());
+    TextIndexConfig newTextConfig = new 
TextIndexConfigBuilder(textConfig).build();
+    config.setIndexOn(StandardIndexes.text(), newTextConfig, colName);
   }
 }
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java
index 659d54f9ea..84dfb8283c 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java
@@ -84,6 +84,9 @@ public class CrcUtils {
         while ((len = input.read(buffer)) > 0) {
           checksum.update(buffer, 0, len);
         }
+        if (LOGGER.isDebugEnabled()) {
+          LOGGER.debug("Updated crc = {}, based on file {} of length {}", 
checksum.getValue(), file, file.length());
+        }
       }
     }
     long crc = checksum.getValue();


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to