This is an automated email from the ASF dual-hosted git repository. xbli pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new 1e0f87053c add logs to debug why crc values are different upon same input data and found Text Index data is not deterministic (#14188) 1e0f87053c is described below commit 1e0f87053c7bee2316223ec4152c80341c291fb1 Author: Xiaobing <61892277+klsi...@users.noreply.github.com> AuthorDate: Tue Oct 8 14:35:41 2024 -0700 add logs to debug why crc values are different upon same input data and found Text Index data is not deterministic (#14188) --- .../org/apache/pinot/core/util/CrcUtilsTest.java | 101 +++++++++++++++++++-- .../apache/pinot/segment/local/utils/CrcUtils.java | 3 + 2 files changed, 98 insertions(+), 6 deletions(-) diff --git a/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java b/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java index 120f84e07e..494d4f518f 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/util/CrcUtilsTest.java @@ -19,16 +19,24 @@ package org.apache.pinot.core.util; import java.io.File; +import java.io.IOException; import java.net.URL; import java.util.concurrent.TimeUnit; import org.apache.commons.io.FileUtils; import org.apache.pinot.segment.local.segment.creator.SegmentTestUtils; import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl; import org.apache.pinot.segment.local.segment.index.converter.SegmentV1V2ToV3FormatConverter; +import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder; import org.apache.pinot.segment.local.utils.CrcUtils; import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig; import org.apache.pinot.segment.spi.creator.SegmentIndexCreationDriver; +import org.apache.pinot.segment.spi.index.FieldIndexConfigs; +import org.apache.pinot.segment.spi.index.FstIndexConfig; +import org.apache.pinot.segment.spi.index.StandardIndexes; +import org.apache.pinot.segment.spi.index.TextIndexConfig; +import org.apache.pinot.spi.config.table.FSTType; import org.apache.pinot.util.TestUtils; +import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; import static org.testng.Assert.assertEquals; @@ -38,30 +46,111 @@ import static org.testng.Assert.assertNotNull; public class CrcUtilsTest { private static final File INDEX_DIR = new File(FileUtils.getTempDirectory(), "CrcUtilsTest"); private static final String AVRO_DATA = "data/test_data-mv.avro"; - private static final long EXPECTED_V1_CRC = 2708456273L; - private static final long EXPECTED_V3_CRC = 2796149869L; + + @BeforeMethod + public void setup() + throws IOException { + FileUtils.deleteDirectory(INDEX_DIR); + } + + @BeforeMethod + public void tearDown() + throws IOException { + FileUtils.deleteDirectory(INDEX_DIR); + } @Test public void testCrc() throws Exception { - FileUtils.deleteDirectory(INDEX_DIR); + URL resource = getClass().getClassLoader().getResource(AVRO_DATA); + assertNotNull(resource); + String filePath = TestUtils.getFileFromResourceUrl(resource); + SegmentGeneratorConfig config = + SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "daysSinceEpoch", + TimeUnit.DAYS, "testTable"); + SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl(); + driver.init(config); + driver.build(); + + File indexDir = driver.getOutputDirectory(); + assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 2708456273L); + new SegmentV1V2ToV3FormatConverter().convert(indexDir); + assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 2796149869L); + } + + @Test + public void testCrcWithNativeFstIndex() + throws Exception { URL resource = getClass().getClassLoader().getResource(AVRO_DATA); assertNotNull(resource); String filePath = TestUtils.getFileFromResourceUrl(resource); SegmentGeneratorConfig config = SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "daysSinceEpoch", TimeUnit.DAYS, "testTable"); + FstIndexConfig fstIndexConfig = new FstIndexConfig(FSTType.NATIVE); + config.setIndexOn(StandardIndexes.fst(), fstIndexConfig, "column5"); SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl(); driver.init(config); driver.build(); File indexDir = driver.getOutputDirectory(); - assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), EXPECTED_V1_CRC); + assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 3358657641L); new SegmentV1V2ToV3FormatConverter().convert(indexDir); - assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), EXPECTED_V3_CRC); + assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 961102604L); + } - FileUtils.deleteDirectory(INDEX_DIR); + @Test + public void testCrcWithLuceneFstIndex() + throws Exception { + URL resource = getClass().getClassLoader().getResource(AVRO_DATA); + assertNotNull(resource); + String filePath = TestUtils.getFileFromResourceUrl(resource); + SegmentGeneratorConfig config = + SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "daysSinceEpoch", + TimeUnit.DAYS, "testTable"); + FstIndexConfig fstIndexConfig = new FstIndexConfig(FSTType.LUCENE); + config.setIndexOn(StandardIndexes.fst(), fstIndexConfig, "column5"); + SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl(); + driver.init(config); + driver.build(); + + File indexDir = driver.getOutputDirectory(); + assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 3294819300L); + + new SegmentV1V2ToV3FormatConverter().convert(indexDir); + assertEquals(CrcUtils.forAllFilesInFolder(indexDir).computeCrc(), 2552900261L); + } + + // @Test + public void testCrcWithLuceneTextIndex() + throws Exception { + URL resource = getClass().getClassLoader().getResource(AVRO_DATA); + assertNotNull(resource); + String filePath = TestUtils.getFileFromResourceUrl(resource); + SegmentGeneratorConfig config = + SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "daysSinceEpoch", + TimeUnit.DAYS, "testTable"); + addTextIndex(config, "column5"); + SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl(); + driver.init(config); + driver.build(); + + // Lucene text index data is not deterministic, thus leading to different segment crc across each test runs. + // When using text index in RealTime table, different crc values can cause servers to have to download segments + // from deep store to make segment replicas in sync. + File indexDir = driver.getOutputDirectory(); + System.out.println(CrcUtils.forAllFilesInFolder(indexDir).computeCrc()); + + new SegmentV1V2ToV3FormatConverter().convert(indexDir); + System.out.println(CrcUtils.forAllFilesInFolder(indexDir).computeCrc()); + } + + private void addTextIndex(SegmentGeneratorConfig config, String colName) { + FieldIndexConfigs fieldIndexConfigs = config.getIndexConfigsByColName().get(colName); + TextIndexConfig textConfig = fieldIndexConfigs.getConfig(StandardIndexes.text()); + TextIndexConfig newTextConfig = new TextIndexConfigBuilder(textConfig).build(); + config.setIndexOn(StandardIndexes.text(), newTextConfig, colName); } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java index 659d54f9ea..84dfb8283c 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/CrcUtils.java @@ -84,6 +84,9 @@ public class CrcUtils { while ((len = input.read(buffer)) > 0) { checksum.update(buffer, 0, len); } + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Updated crc = {}, based on file {} of length {}", checksum.getValue(), file, file.length()); + } } } long crc = checksum.getValue(); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org