This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 372bcd2840 Add a quickstart for null handling related queries (#14182)
372bcd2840 is described below

commit 372bcd2840f8403d1b76648cce1fbc450cafdea2
Author: Yash Mayya <yash.ma...@gmail.com>
AuthorDate: Thu Oct 10 03:33:36 2024 +0530

    Add a quickstart for null handling related queries (#14182)
---
 .../apache/pinot/tools/NullHandlingQuickstart.java | 107 +++++++++++++++++
 .../clientSalaryNulls_offline_table_config.json    |  18 +++
 .../clientSalaryNulls_schema.json                  |  38 ++++++
 .../batch/clientSalaryNulls/ingestionJobSpec.yaml  | 129 +++++++++++++++++++++
 .../rawdata/clientSalaryNulls_data.avro            | Bin 0 -> 3192 bytes
 5 files changed, 292 insertions(+)

diff --git 
a/pinot-tools/src/main/java/org/apache/pinot/tools/NullHandlingQuickstart.java 
b/pinot-tools/src/main/java/org/apache/pinot/tools/NullHandlingQuickstart.java
new file mode 100644
index 0000000000..7f0a8a727b
--- /dev/null
+++ 
b/pinot-tools/src/main/java/org/apache/pinot/tools/NullHandlingQuickstart.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.tools;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import org.apache.pinot.spi.utils.CommonConstants;
+import org.apache.pinot.tools.admin.PinotAdministrator;
+import org.apache.pinot.tools.admin.command.QuickstartRunner;
+
+
+/**
+ * Quickstart with a table that has some null values in order to be able to 
play around with Pinot's null handling
+ * related features.
+ */
+public class NullHandlingQuickstart extends Quickstart {
+
+  private static final String[] NULL_HANDLING_TABLE_DIRS = new 
String[]{"examples/batch/clientSalaryNulls"};
+
+  @Override
+  public List<String> types() {
+    return Collections.singletonList("NULL_HANDLING");
+  }
+
+  @Override
+  public String[] getDefaultBatchTableDirectories() {
+    return NULL_HANDLING_TABLE_DIRS;
+  }
+
+  @Override
+  public void runSampleQueries(QuickstartRunner runner)
+      throws Exception {
+    printStatus(Quickstart.Color.YELLOW, "***** Null handling quickstart setup 
complete *****");
+
+    Map<String, String> queryOptions = Collections.singletonMap("queryOptions",
+        CommonConstants.Broker.Request.QueryOptionKey.ENABLE_NULL_HANDLING + 
"=true");
+
+    printStatus(Quickstart.Color.YELLOW, "Total number of documents in the 
table");
+    String query = "SELECT COUNT(*) FROM clientSalaryNulls";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, 
prettyPrintResponse(runner.runQuery(query, queryOptions)));
+    printStatus(Quickstart.Color.GREEN, 
"***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Total number of documents in the 
table with null salary values");
+    query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE salary IS NULL";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, 
prettyPrintResponse(runner.runQuery(query, queryOptions)));
+    printStatus(Quickstart.Color.GREEN, 
"***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Total number of documents in the 
table with non-null description");
+    query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE description IS NOT 
NULL";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, 
prettyPrintResponse(runner.runQuery(query, queryOptions)));
+    printStatus(Quickstart.Color.GREEN, 
"***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Minimum salary with null handling 
enabled");
+    query = "SELECT MIN(salary) FROM clientSalaryNulls";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, 
prettyPrintResponse(runner.runQuery(query, queryOptions)));
+    printStatus(Quickstart.Color.GREEN, 
"***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Minimum salary without null handling 
enabled");
+    query = "SELECT MIN(salary) FROM clientSalaryNulls";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, 
prettyPrintResponse(runner.runQuery(query)));
+    printStatus(Quickstart.Color.GREEN, 
"***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Count where salary is less than 
80000");
+    query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE salary < 80000";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, 
prettyPrintResponse(runner.runQuery(query, queryOptions)));
+    printStatus(Quickstart.Color.GREEN, 
"***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Count where salary is less than 
80000 (without null handling enabled)");
+    query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE salary < 80000";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, 
prettyPrintResponse(runner.runQuery(query)));
+    printStatus(Quickstart.Color.GREEN, 
"***************************************************");
+  }
+
+  public static void main(String[] args)
+      throws Exception {
+    List<String> arguments = new ArrayList<>();
+    arguments.addAll(Arrays.asList("QuickStart", "-type", "NULL_HANDLING"));
+    arguments.addAll(Arrays.asList(args));
+    PinotAdministrator.main(arguments.toArray(new String[0]));
+  }
+}
diff --git 
a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_offline_table_config.json
 
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_offline_table_config.json
new file mode 100644
index 0000000000..08a322eb73
--- /dev/null
+++ 
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_offline_table_config.json
@@ -0,0 +1,18 @@
+{
+  "tableName": "clientSalaryNulls",
+  "segmentsConfig" : {
+    "replication" : "1",
+    "schemaName" : "clientSalaryNulls"
+  },
+  "tableIndexConfig" : {
+    "invertedIndexColumns" : [],
+    "loadMode"  : "MMAP"
+  },
+  "tenants" : {
+    "broker":"DefaultTenant",
+    "server":"DefaultTenant"
+  },
+  "tableType":"OFFLINE",
+  "metadata": {}
+}
+
diff --git 
a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_schema.json
 
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_schema.json
new file mode 100644
index 0000000000..c69ae185c9
--- /dev/null
+++ 
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_schema.json
@@ -0,0 +1,38 @@
+{
+  "dimensionFieldSpecs": [
+    {
+      "dataType": "INT",
+      "singleValueField": true,
+      "name": "clientId",
+      "notNull": "true"
+    },
+    {
+      "dataType": "STRING",
+      "singleValueField": true,
+      "name": "city",
+      "notNull": "true"
+    },
+    {
+      "dataType": "STRING",
+      "singleValueField": true,
+      "name": "description",
+      "notNull": "false"
+    },
+    {
+      "dataType": "INT",
+      "singleValueField": true,
+      "name": "salary",
+      "notNull": "false"
+    }
+  ],
+  "dateTimeFieldSpecs": [
+    {
+      "name": "DaysSinceEpoch",
+      "dataType": "INT",
+      "format": "1:DAYS:EPOCH",
+      "granularity": "1:DAYS"
+    }
+  ],
+  "schemaName": "clientSalaryNulls",
+  "enableColumnBasedNullHandling": true
+}
diff --git 
a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/ingestionJobSpec.yaml
 
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/ingestionJobSpec.yaml
new file mode 100644
index 0000000000..4bdd5519fd
--- /dev/null
+++ 
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/ingestionJobSpec.yaml
@@ -0,0 +1,129 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# executionFrameworkSpec: Defines ingestion jobs to be running.
+executionFrameworkSpec:
+
+  # name: execution framework name
+  name: 'standalone'
+
+  # Class to use for segment generation and different push types.
+  segmentGenerationJobRunnerClassName: 
'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentGenerationJobRunner'
+  segmentTarPushJobRunnerClassName: 
'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentTarPushJobRunner'
+  segmentUriPushJobRunnerClassName: 
'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentUriPushJobRunner'
+  segmentMetadataPushJobRunnerClassName: 
'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentMetadataPushJobRunner'
+
+# jobType: Pinot ingestion job type.
+# Supported job types are defined in PinotIngestionJobType class.
+#   'SegmentCreation'
+#   'SegmentTarPush'
+#   'SegmentUriPush'
+#   'SegmentMetadataPush'
+#   'SegmentCreationAndTarPush'
+#   'SegmentCreationAndUriPush'
+#   'SegmentCreationAndMetadataPush'
+jobType: SegmentCreationAndTarPush
+
+# inputDirURI: Root directory of input data, expected to have scheme 
configured in PinotFS.
+inputDirURI: 'examples/batch/clientSalaryNulls/rawdata'
+
+# includeFileNamePattern: include file name pattern, supported glob pattern.
+# Sample usage:
+#   'glob:*.avro' will include all avro files just under the inputDirURI, not 
sub directories;
+#   'glob:**/*.avro' will include all the avro files under inputDirURI 
recursively.
+includeFileNamePattern: 'glob:**/*.avro'
+
+# excludeFileNamePattern: exclude file name pattern, supported glob pattern.
+# Sample usage:
+#   'glob:*.avro' will exclude all avro files just under the inputDirURI, not 
sub directories;
+#   'glob:**/*.avro' will exclude all the avro files under inputDirURI 
recursively.
+# _excludeFileNamePattern: ''
+
+# outputDirURI: Root directory of output segments, expected to have scheme 
configured in PinotFS.
+outputDirURI: 'examples/batch/clientSalaryNulls/segments'
+
+# overwriteOutput: Overwrite output segments if existed.
+overwriteOutput: true
+
+# pinotFSSpecs: defines all related Pinot file systems.
+pinotFSSpecs:
+
+  - # scheme: used to identify a PinotFS.
+    # E.g. local, hdfs, dbfs, etc
+    scheme: file
+
+    # className: Class name used to create the PinotFS instance.
+    # E.g.
+    #   org.apache.pinot.spi.filesystem.LocalPinotFS is used for local 
filesystem
+    #   org.apache.pinot.plugin.filesystem.AzurePinotFS is used for Azure Data 
Lake
+    #   org.apache.pinot.plugin.filesystem.HadoopPinotFS is used for HDFS
+    className: org.apache.pinot.spi.filesystem.LocalPinotFS
+
+# recordReaderSpec: defines all record reader
+recordReaderSpec:
+
+  # dataFormat: Record data format, e.g. 'avro', 'parquet', 'orc', 'csv', 
'json', 'thrift' etc.
+  dataFormat: 'avro'
+
+  # className: Corresponding RecordReader class name.
+  # E.g.
+  #   org.apache.pinot.plugin.inputformat.avro.AvroRecordReader
+  #   org.apache.pinot.plugin.inputformat.csv.CSVRecordReader
+  #   org.apache.pinot.plugin.inputformat.parquet.ParquetRecordReader
+  #   org.apache.pinot.plugin.inputformat.parquet.ParquetNativeRecordReader
+  #   org.apache.pinot.plugin.inputformat.json.JSONRecordReader
+  #   org.apache.pinot.plugin.inputformat.orc.ORCRecordReader
+  #   org.apache.pinot.plugin.inputformat.thrift.ThriftRecordReader
+  className: 'org.apache.pinot.plugin.inputformat.avro.AvroRecordReader'
+
+# tableSpec: defines table name and where to fetch corresponding table config 
and table schema.
+tableSpec:
+
+  # tableName: Table name
+  tableName: 'clientSalaryNulls'
+
+  # schemaURI: defines where to read the table schema, supports PinotFS or 
HTTP.
+  # E.g.
+  #   hdfs://path/to/table_schema.json
+  #   http://localhost:9000/tables/myTable/schema
+  schemaURI: 'http://localhost:9000/tables/clientSalaryNulls/schema'
+
+  # tableConfigURI: defines where to read the table config.
+  # Supports using PinotFS or HTTP.
+  # E.g.
+  #   hdfs://path/to/table_config.json
+  #   http://localhost:9000/tables/myTable
+  # Note that the API to read Pinot table config directly from pinot 
controller contains a JSON wrapper.
+  # The real table config is the object under the field 'OFFLINE'.
+  tableConfigURI: 'http://localhost:9000/tables/clientSalaryNulls'
+
+# pinotClusterSpecs: defines the Pinot Cluster Access Point.
+pinotClusterSpecs:
+  - # controllerURI: used to fetch table/schema information and data push.
+    # E.g. http://localhost:9000
+    controllerURI: 'http://localhost:9000'
+
+# pushJobSpec: defines segment push job related configuration.
+pushJobSpec:
+
+  # pushAttempts: number of attempts for push job, default is 1, which means 
no retry.
+  pushAttempts: 2
+
+  # pushRetryIntervalMillis: retry wait Ms, default to 1 second.
+  pushRetryIntervalMillis: 1000
diff --git 
a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/rawdata/clientSalaryNulls_data.avro
 
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/rawdata/clientSalaryNulls_data.avro
new file mode 100644
index 0000000000..c7843d4738
Binary files /dev/null and 
b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/rawdata/clientSalaryNulls_data.avro
 differ


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to