This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
     new 372bcd2840  Add a quickstart for null handling related queries (#14182)
372bcd2840 is described below

commit 372bcd2840f8403d1b76648cce1fbc450cafdea2
Author: Yash Mayya <yash.ma...@gmail.com>
AuthorDate: Thu Oct 10 03:33:36 2024 +0530

    Add a quickstart for null handling related queries (#14182)
---
 .../apache/pinot/tools/NullHandlingQuickstart.java | 107 +++++++++++++++++
 .../clientSalaryNulls_offline_table_config.json    |  18 +++
 .../clientSalaryNulls_schema.json                  |  38 ++++++
 .../batch/clientSalaryNulls/ingestionJobSpec.yaml  | 129 +++++++++++++++++++++
 .../rawdata/clientSalaryNulls_data.avro            | Bin 0 -> 3192 bytes
 5 files changed, 292 insertions(+)

diff --git a/pinot-tools/src/main/java/org/apache/pinot/tools/NullHandlingQuickstart.java b/pinot-tools/src/main/java/org/apache/pinot/tools/NullHandlingQuickstart.java
new file mode 100644
index 0000000000..7f0a8a727b
--- /dev/null
+++ b/pinot-tools/src/main/java/org/apache/pinot/tools/NullHandlingQuickstart.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.tools;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import org.apache.pinot.spi.utils.CommonConstants;
+import org.apache.pinot.tools.admin.PinotAdministrator;
+import org.apache.pinot.tools.admin.command.QuickstartRunner;
+
+
+/**
+ * Quickstart with a table that has some null values, so that users can play around with Pinot's null handling
+ * features.
+ */
+public class NullHandlingQuickstart extends Quickstart {
+
+  private static final String[] NULL_HANDLING_TABLE_DIRS = new String[]{"examples/batch/clientSalaryNulls"};
+
+  @Override
+  public List<String> types() {
+    return Collections.singletonList("NULL_HANDLING");
+  }
+
+  @Override
+  public String[] getDefaultBatchTableDirectories() {
+    return NULL_HANDLING_TABLE_DIRS;
+  }
+
+  @Override
+  public void runSampleQueries(QuickstartRunner runner)
+      throws Exception {
+    printStatus(Quickstart.Color.YELLOW, "***** Null handling quickstart setup complete *****");
+
+    Map<String, String> queryOptions = Collections.singletonMap("queryOptions",
+        CommonConstants.Broker.Request.QueryOptionKey.ENABLE_NULL_HANDLING + "=true");
+
+    printStatus(Quickstart.Color.YELLOW, "Total number of documents in the table");
+    String query = "SELECT COUNT(*) FROM clientSalaryNulls";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, prettyPrintResponse(runner.runQuery(query, queryOptions)));
+    printStatus(Quickstart.Color.GREEN, "***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Total number of documents in the table with null salary values");
+    query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE salary IS NULL";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, prettyPrintResponse(runner.runQuery(query, queryOptions)));
+    printStatus(Quickstart.Color.GREEN, "***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Total number of documents in the table with non-null description");
+    query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE description IS NOT NULL";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, prettyPrintResponse(runner.runQuery(query, queryOptions)));
+    printStatus(Quickstart.Color.GREEN, "***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Minimum salary with null handling enabled");
+    query = "SELECT MIN(salary) FROM clientSalaryNulls";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, prettyPrintResponse(runner.runQuery(query, queryOptions)));
+    printStatus(Quickstart.Color.GREEN, "***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Minimum salary without null handling enabled");
+    query = "SELECT MIN(salary) FROM clientSalaryNulls";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, prettyPrintResponse(runner.runQuery(query)));
+    printStatus(Quickstart.Color.GREEN, "***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Count where salary is less than 80000");
+    query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE salary < 80000";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, prettyPrintResponse(runner.runQuery(query, queryOptions)));
+    printStatus(Quickstart.Color.GREEN, "***************************************************");
+
+    printStatus(Quickstart.Color.YELLOW, "Count where salary is less than 80000 (without null handling enabled)");
+    query = "SELECT COUNT(*) FROM clientSalaryNulls WHERE salary < 80000";
+    printStatus(Quickstart.Color.CYAN, "Query : " + query);
+    printStatus(Quickstart.Color.YELLOW, prettyPrintResponse(runner.runQuery(query)));
+    printStatus(Quickstart.Color.GREEN,
+        "***************************************************");
+  }
+
+  public static void main(String[] args)
+      throws Exception {
+    List<String> arguments = new ArrayList<>();
+    arguments.addAll(Arrays.asList("QuickStart", "-type", "NULL_HANDLING"));
+    arguments.addAll(Arrays.asList(args));
+    PinotAdministrator.main(arguments.toArray(new String[0]));
+  }
+}
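For context on the two identical MIN(salary) queries above: with the enableNullHandling query option set, rows whose salary is null are skipped by the aggregation, whereas without it Pinot aggregates over the column's stored default null value (typically Integer.MIN_VALUE for an INT dimension), so the two results differ. A minimal sketch for launching the quickstart and observing this yourself, assuming a built Pinot distribution with pinot-tools and its dependencies on the classpath (the wrapper class below is hypothetical and not part of this commit):

    // Hypothetical launcher, not part of this commit: runs the new quickstart end to end so the
    // query pairs above (with and without the enableNullHandling query option) can be compared.
    public class RunNullHandlingQuickstart {
      public static void main(String[] args) throws Exception {
        // Equivalent to running the pinot-admin QuickStart command with "-type NULL_HANDLING",
        // which is exactly what NullHandlingQuickstart.main() assembles before delegating.
        org.apache.pinot.tools.NullHandlingQuickstart.main(new String[0]);
      }
    }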
"***************************************************"); + } + + public static void main(String[] args) + throws Exception { + List<String> arguments = new ArrayList<>(); + arguments.addAll(Arrays.asList("QuickStart", "-type", "NULL_HANDLING")); + arguments.addAll(Arrays.asList(args)); + PinotAdministrator.main(arguments.toArray(new String[0])); + } +} diff --git a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_offline_table_config.json b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_offline_table_config.json new file mode 100644 index 0000000000..08a322eb73 --- /dev/null +++ b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_offline_table_config.json @@ -0,0 +1,18 @@ +{ + "tableName": "clientSalaryNulls", + "segmentsConfig" : { + "replication" : "1", + "schemaName" : "clientSalaryNulls" + }, + "tableIndexConfig" : { + "invertedIndexColumns" : [], + "loadMode" : "MMAP" + }, + "tenants" : { + "broker":"DefaultTenant", + "server":"DefaultTenant" + }, + "tableType":"OFFLINE", + "metadata": {} +} + diff --git a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_schema.json b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_schema.json new file mode 100644 index 0000000000..c69ae185c9 --- /dev/null +++ b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/clientSalaryNulls_schema.json @@ -0,0 +1,38 @@ +{ + "dimensionFieldSpecs": [ + { + "dataType": "INT", + "singleValueField": true, + "name": "clientId", + "notNull": "true" + }, + { + "dataType": "STRING", + "singleValueField": true, + "name": "city", + "notNull": "true" + }, + { + "dataType": "STRING", + "singleValueField": true, + "name": "description", + "notNull": "false" + }, + { + "dataType": "INT", + "singleValueField": true, + "name": "salary", + "notNull": "false" + } + ], + "dateTimeFieldSpecs": [ + { + "name": "DaysSinceEpoch", + "dataType": "INT", + "format": "1:DAYS:EPOCH", + "granularity": "1:DAYS" + } + ], + "schemaName": "clientSalaryNulls", + "enableColumnBasedNullHandling": true +} diff --git a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/ingestionJobSpec.yaml b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/ingestionJobSpec.yaml new file mode 100644 index 0000000000..4bdd5519fd --- /dev/null +++ b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/ingestionJobSpec.yaml @@ -0,0 +1,129 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# executionFrameworkSpec: Defines ingestion jobs to be running. +executionFrameworkSpec: + + # name: execution framework name + name: 'standalone' + + # Class to use for segment generation and different push types. 
+  segmentGenerationJobRunnerClassName: 'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentGenerationJobRunner'
+  segmentTarPushJobRunnerClassName: 'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentTarPushJobRunner'
+  segmentUriPushJobRunnerClassName: 'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentUriPushJobRunner'
+  segmentMetadataPushJobRunnerClassName: 'org.apache.pinot.plugin.ingestion.batch.standalone.SegmentMetadataPushJobRunner'
+
+# jobType: Pinot ingestion job type.
+# Supported job types are defined in PinotIngestionJobType class.
+#   'SegmentCreation'
+#   'SegmentTarPush'
+#   'SegmentUriPush'
+#   'SegmentMetadataPush'
+#   'SegmentCreationAndTarPush'
+#   'SegmentCreationAndUriPush'
+#   'SegmentCreationAndMetadataPush'
+jobType: SegmentCreationAndTarPush
+
+# inputDirURI: Root directory of input data, expected to have scheme configured in PinotFS.
+inputDirURI: 'examples/batch/clientSalaryNulls/rawdata'
+
+# includeFileNamePattern: include file name pattern, supports glob patterns.
+# Sample usage:
+#   'glob:*.avro' will include all avro files just under the inputDirURI, not sub directories;
+#   'glob:**/*.avro' will include all the avro files under inputDirURI recursively.
+includeFileNamePattern: 'glob:**/*.avro'

+# excludeFileNamePattern: exclude file name pattern, supports glob patterns.
+# Sample usage:
+#   'glob:*.avro' will exclude all avro files just under the inputDirURI, not sub directories;
+#   'glob:**/*.avro' will exclude all the avro files under inputDirURI recursively.
+# _excludeFileNamePattern: ''
+
+# outputDirURI: Root directory of output segments, expected to have scheme configured in PinotFS.
+outputDirURI: 'examples/batch/clientSalaryNulls/segments'
+
+# overwriteOutput: Overwrite output segments if they exist.
+overwriteOutput: true
+
+# pinotFSSpecs: defines all related Pinot file systems.
+pinotFSSpecs:
+
+  - # scheme: used to identify a PinotFS.
+    # E.g. local, hdfs, dbfs, etc.
+    scheme: file
+
+    # className: Class name used to create the PinotFS instance.
+    # E.g.
+    #   org.apache.pinot.spi.filesystem.LocalPinotFS is used for local filesystem
+    #   org.apache.pinot.plugin.filesystem.AzurePinotFS is used for Azure Data Lake
+    #   org.apache.pinot.plugin.filesystem.HadoopPinotFS is used for HDFS
+    className: org.apache.pinot.spi.filesystem.LocalPinotFS
+
+# recordReaderSpec: defines all record readers.
+recordReaderSpec:
+
+  # dataFormat: Record data format, e.g. 'avro', 'parquet', 'orc', 'csv', 'json', 'thrift' etc.
+  dataFormat: 'avro'
+
+  # className: Corresponding RecordReader class name.
+  # E.g.
+  #   org.apache.pinot.plugin.inputformat.avro.AvroRecordReader
+  #   org.apache.pinot.plugin.inputformat.csv.CSVRecordReader
+  #   org.apache.pinot.plugin.inputformat.parquet.ParquetRecordReader
+  #   org.apache.pinot.plugin.inputformat.parquet.ParquetNativeRecordReader
+  #   org.apache.pinot.plugin.inputformat.json.JSONRecordReader
+  #   org.apache.pinot.plugin.inputformat.orc.ORCRecordReader
+  #   org.apache.pinot.plugin.inputformat.thrift.ThriftRecordReader
+  className: 'org.apache.pinot.plugin.inputformat.avro.AvroRecordReader'
+
+# tableSpec: defines table name and where to fetch corresponding table config and table schema.
+tableSpec:
+
+  # tableName: Table name
+  tableName: 'clientSalaryNulls'
+
+  # schemaURI: defines where to read the table schema, supports PinotFS or HTTP.
+  # E.g.
+  #   hdfs://path/to/table_schema.json
+  #   http://localhost:9000/tables/myTable/schema
+  schemaURI: 'http://localhost:9000/tables/clientSalaryNulls/schema'
+
+  # tableConfigURI: defines where to read the table config.
+  # Supports using PinotFS or HTTP.
+  # E.g.
+  #   hdfs://path/to/table_config.json
+  #   http://localhost:9000/tables/myTable
+  # Note that the API to read Pinot table config directly from pinot controller contains a JSON wrapper.
+  # The real table config is the object under the field 'OFFLINE'.
+  tableConfigURI: 'http://localhost:9000/tables/clientSalaryNulls'
+
+# pinotClusterSpecs: defines the Pinot Cluster Access Point.
+pinotClusterSpecs:
+  - # controllerURI: used to fetch table/schema information and data push.
+    # E.g. http://localhost:9000
+    controllerURI: 'http://localhost:9000'
+
+# pushJobSpec: defines segment push job related configuration.
+pushJobSpec:
+
+  # pushAttempts: number of attempts for push job, default is 1, which means no retry.
+  pushAttempts: 2
+
+  # pushRetryIntervalMillis: retry wait in milliseconds, defaults to 1 second.
+  pushRetryIntervalMillis: 1000
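For reference, the quickstart wires this ingestion up automatically, but the job spec above can also be run on its own against an already running cluster (controller at localhost:9000, as configured in pinotClusterSpecs). A rough sketch using the same PinotAdministrator entry point the quickstart delegates to; the helper class is hypothetical and the spec file path is an assumption about where the resource sits on disk (the same can be done from the shell via the pinot-admin LaunchDataIngestionJob command):

    // Hypothetical helper, not part of this commit: launches the standalone batch ingestion job
    // described by ingestionJobSpec.yaml through the pinot-admin entry point.
    public class LaunchClientSalaryNullsIngestion {
      public static void main(String[] args) throws Exception {
        org.apache.pinot.tools.admin.PinotAdministrator.main(new String[]{
            "LaunchDataIngestionJob",
            "-jobSpecFile", "pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/ingestionJobSpec.yaml"
        });
      }
    }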
diff --git a/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/rawdata/clientSalaryNulls_data.avro b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/rawdata/clientSalaryNulls_data.avro
new file mode 100644
index 0000000000..c7843d4738
Binary files /dev/null and b/pinot-tools/src/main/resources/examples/batch/clientSalaryNulls/rawdata/clientSalaryNulls_data.avro differ

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org