gortiz commented on code in PR #14493:
URL: https://github.com/apache/pinot/pull/14493#discussion_r1853422448


##########
pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/TableIndexingTest.java:
##########
@@ -0,0 +1,776 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.integration.tests;
+
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.text.SimpleDateFormat;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.stream.Collectors;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.commons.io.FileUtils;
+import org.apache.pinot.common.metrics.ServerMetrics;
+import org.apache.pinot.spi.config.table.FieldConfig;
+import org.apache.pinot.spi.config.table.IndexingConfig;
+import org.apache.pinot.spi.config.table.StarTreeIndexConfig;
+import org.apache.pinot.spi.config.table.TableConfig;
+import org.apache.pinot.spi.config.table.TableType;
+import org.apache.pinot.spi.config.table.TimestampConfig;
+import org.apache.pinot.spi.config.table.TimestampIndexGranularity;
+import org.apache.pinot.spi.data.ComplexFieldSpec;
+import org.apache.pinot.spi.data.DimensionFieldSpec;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.apache.pinot.spi.data.MetricFieldSpec;
+import org.apache.pinot.spi.data.Schema;
+import org.apache.pinot.spi.metrics.PinotMetricName;
+import org.apache.pinot.spi.metrics.PinotMetricUtils;
+import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
+import org.apache.pinot.util.TestUtils;
+import org.testng.Assert;
+import org.testng.annotations.AfterClass;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Ignore;
+import org.testng.annotations.Test;
+
+
+// Try to create various index types for all data type/cardinality/encoding combinations and report the outcome.
+// NOTES: There is no multi-value type for BigDecimal, JSON or MAP.
+// see PinotDataType.getPinotDataTypeForIngestion()
+@Test(enabled = false)
+public class TableIndexingTest extends BaseClusterIntegrationTestSet {
+
+  private final ArrayList<String> _tableNames = new ArrayList<>();
+  private final int _allDocs = 3000;
+  private final SimpleDateFormat _format = new SimpleDateFormat("HH:mm:ss.SSS");
+  private final List<TestCase> _allResults = new ArrayList<>();
+
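+  // Captures one (table name, index type) combination, together with any error raised while applying it.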
+  static class TestCase {
+    String _tableName;
+    String _indexType;
+    Throwable _error;
+
+    public TestCase(String tableName, String indexType) {
+      _tableName = tableName;
+      _indexType = indexType;
+    }
+
+    @Override
+    public String toString() {
+      return _tableName + "," + _indexType;
+    }
+  }
+
+  @BeforeClass
+  public void setUp()
+      throws Exception {
+    TestUtils.ensureDirectoriesExistAndEmpty(_tempDir, _segmentDir, _tarDir);
+
+    // Start the Pinot cluster
+    startZk();
+    startController();
+    startBrokers(1);
+    startServers(1);
+
+    // Create and upload the schema and table config
+    List<Schema> schemas = createSchemas();
+    addSchemas(schemas);
+    List<TableConfig> tableConfigs = createOfflineTableConfigs(schemas);
+    addTableConfigs(tableConfigs);
+
+    List<List<File>> avroFiles = createAvroFile(schemas);
+
+    for (int i = 0; i < schemas.size(); i++) {
+      // we have to use separate directories because segment tar files must exist for the duration of the test
+      File schemaSegmentDir = new File(_segmentDir, "schema_" + i);
+      File schemaTarDir = new File(_tarDir, "schema_" + i);
+      TestUtils.ensureDirectoriesExistAndEmpty(schemaSegmentDir, schemaTarDir);
+      ClusterIntegrationTestUtils.buildSegmentsFromAvro(avroFiles.get(i), tableConfigs.get(i), schemas.get(i), 0,
+          schemaSegmentDir, schemaTarDir);
+      uploadSegments(schemas.get(i).getSchemaName(), schemaTarDir);
+    }
+
+    waitForAllDocsLoaded(schemas);
+  }
+
+  private void addTableConfigs(List<TableConfig> tableConfigs)
+      throws IOException {
+    for (TableConfig config : tableConfigs) {
+      super.addTableConfig(config);
+    }
+  }
+
+  private List<TableConfig> createOfflineTableConfigs(List<Schema> schemas) {
+    return
+        schemas.stream().map(s -> new TableConfigBuilder(TableType.OFFLINE)
+                .setTableName(s.getSchemaName())
+                .build())
+            .collect(Collectors.toList());
+  }
+
+  private void waitForAllDocsLoaded(final List<Schema> schemas) {
+    HashSet<String> incompleteTables = new HashSet<>();
+    for (Schema schema : schemas) {
+      incompleteTables.add(schema.getSchemaName());
+    }
+    List<String> toRemove = new ArrayList<>();
+
+    TestUtils.waitForCondition(() -> {
+          toRemove.clear();
+          for (String table : incompleteTables) {
+            if (getCurrentCountStarResult(table) == _allDocs) {
+              toRemove.add(table);
+            }
+          }
+          incompleteTables.removeAll(toRemove);
+          return incompleteTables.isEmpty();
+        }, 100L, 60_000L,
+        "Failed to load " + _allDocs + " documents", true, 
Duration.ofMillis(60_000L / 10));
+  }
+
+  @AfterClass
+  public void tearDown()
+      throws Exception {
+    stopServer();
+    stopBroker();
+    stopController();
+    stopZk();
+    FileUtils.deleteDirectory(_tempDir);
+  }
+
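+  // Tries to apply the given index type to column "col" of the given table via a table config update;
+  // known-incompatible type/index combinations are rejected up front so the failure shows up here
+  // instead of only in the segment reload logs.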
+  @Ignore
+  @Test(dataProvider = "fieldsAndIndexTypes")
+  public void testAddIndex(TestCase testCase)
+      throws Throwable {
+    try {
+      String schemaName = testCase._tableName;
+      String indexType = testCase._indexType;
+
+      System.out.println(
+          _format.format(new Date()) + " Starting check for column: " + schemaName + " index type: " + indexType);
+      Schema schema = getSchema(schemaName);
+      FieldSpec field = schema.getFieldSpecFor("col");
+
+      // These exceptions are thrown during segment reload (and not table config update) and appear in logs only.
+      // We're throwing them here to make the test faster and improve the output.
+      if ("geo".equals(indexType) && field.getDataType() != DataType.BYTES) {
+        throw new RuntimeException("Geo/H3 index can only be applied to column 
of BYTES data type!");
+      }
+
+      if ("json".equals(indexType) && ((field.getDataType() != DataType.STRING 
&& field.getDataType() != DataType.JSON)
+          || !field.isSingleValueField())) {
+        throw new RuntimeException(
+            "JSON index can only be applied to single value column of STRING 
or JSON data type!");
+      }
+
+      if ("vector".equals(indexType) && (field.getDataType() != DataType.FLOAT 
|| field.isSingleValueField())) {
+        throw new RuntimeException("VECTOR index can only be applied to Float 
Array columns");
+      }
+
+      if (("text".equals(indexType) || "native_text".equals(indexType)) && 
field.getDataType() != DataType.STRING) {
+        throw new RuntimeException("Text index is currently only supported on 
STRING columns");
+      }
+
+      TableConfig tableConfig = getOfflineTableConfig(schemaName);
+      IndexingConfig idxCfg = tableConfig.getIndexingConfig();
+
+      FieldConfig.EncodingType encoding =
+          field.getName().startsWith("raw") ? FieldConfig.EncodingType.RAW : 
FieldConfig.EncodingType.DICTIONARY;
+
+      List<FieldConfig.IndexType> indexTypes = new ArrayList<>();
+      Map<String, String> properties = new HashMap<>();
+      ObjectNode indexes = new ObjectNode(JsonNodeFactory.instance);
+      TimestampConfig tstmpConfig = null;
+      FieldConfig config = getByName(tableConfig, field.getName());
+      // ignore any existing config and overwrite it; otherwise the test would fail on earlier errors
+      boolean isNew = config == null;
+
+      switch (indexType) {
+        case "bloom":
+            /* bloom filter. Maybe we should call it bloom filter index to be consistent?
+            {
+              "tableName": "somePinotTable",
+              "fieldConfigList": [
+                {
+                  "name": "playerID",
+                  "indexes": {
+                    "bloom": {}
+                  }
+                },
+                ...
+              ],
+              ...
+            } */
+          // no params
+          indexes.put("bloom", new ObjectNode(JsonNodeFactory.instance));
+
+          break;
+        case "fst":
+            /* fst index / text index
+              "fieldConfigList":[
+              {
+                "name":"text_col_1",
+                "encodingType":"DICTIONARY",
+                "indexType":"FST"
+                }
+                ]
+             */
+          indexTypes.add(FieldConfig.IndexType.FST);
+          break;
+        case "geo":
+            /* geospatial - requires the dictionary to be disabled
+              {
+              "fieldConfigList": [
+                {
+                  "name": "location_st_point",
+                  "encodingType":"RAW", // this actually disables the 
dictionary
+                  "indexes": {
+                    "h3": {
+                      "resolutions": [13, 5, 6]
+                    }
+                  }
+                }
+              ],
+              ...
+            }
+             */
+          ObjectNode resolutions = new ObjectNode(JsonNodeFactory.instance);
+          ArrayNode res = new ArrayNode(JsonNodeFactory.instance);
+          res.add(13).add(5).add(6);
+          resolutions.put("resolutions", res);

Review Comment:
   ```suggestion
             ObjectNode resolutions = JsonUtils.stringToJsonNode("{\"resolutions\": [13, 5, 6]}");
   ```
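
   For reference, a minimal compilable sketch of the suggested approach (an illustration only, not part of the PR; it assumes `org.apache.pinot.spi.utils.JsonUtils` is on the classpath, and note that `stringToJsonNode` returns a `JsonNode` and declares `IOException`, so a cast and exception handling are needed):

   ```java
   import com.fasterxml.jackson.databind.node.ObjectNode;
   import java.io.IOException;
   import org.apache.pinot.spi.utils.JsonUtils;

   // Hypothetical holder class, named here only for the example.
   class H3ConfigSketch {
     // Builds the H3 "resolutions" config from a JSON literal instead of
     // assembling ObjectNode/ArrayNode by hand; the cast is safe because the
     // literal is a JSON object. Parsing errors surface as IOException.
     static ObjectNode h3Resolutions() throws IOException {
       return (ObjectNode) JsonUtils.stringToJsonNode("{\"resolutions\": [13, 5, 6]}");
     }
   }
   ```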



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

