This is an automated email from the ASF dual-hosted git repository.

xiangfu0 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 11545db16de Fix GROUP BY on column with RAW forward index + dictionary 
(#18500)
11545db16de is described below

commit 11545db16de09215d43cd76268200cab12b4f8f0
Author: Xiang Fu <[email protected]>
AuthorDate: Thu May 14 13:59:07 2026 -0700

    Fix GROUP BY on column with RAW forward index + dictionary (#18500)
    
    A column configured with EncodingType.RAW + dictionaryIndex has a
    dictionary file on disk but a non-dict-encoded forward index. Inside
    NoDictionaryMultiColumnGroupKeyGenerator, the per-column branch gated on
    ColumnContext.getDictionary() != null and then called
    BlockValSet.getDictionaryIdsSV() — which routes to
    ForwardIndexReader.readDictIds and throws UnsupportedOperationException
    on a RAW forward index.
    
    DefaultGroupByExecutor already routes such columns to the no-dict
    GroupByExecutor by checking forwardIndex.isDictionaryEncoded() in
    addition to the dict presence. NoDictionaryMultiColumnGroupKeyGenerator
    must apply the same check per column when choosing between the dict-id
    path and the on-the-fly-dictionary path, so a single RAW+dict column
    in a multi-column GROUP BY does not crash the query.
    
    The dictionary itself remains exposed on ColumnContext and
    ProjectionBlockValSet for callers that legitimately need value↔id
    lookups on a RAW+dict column (e.g. inverted-index based filters that
    already hold dict IDs); the high-level operator alone decides whether
    the dict-id read path is usable.
    
    Add a multi-column GROUP BY regression test in
    RawForwardIndexWithDictionaryTest covering the exact
    NoDictionaryMultiColumnGroupKeyGenerator path from the original stack
    trace.
---
 .../NoDictionaryMultiColumnGroupKeyGenerator.java  | 23 ++++++++++++++++++++
 .../custom/RawForwardIndexWithDictionaryTest.java  | 25 ++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/groupby/NoDictionaryMultiColumnGroupKeyGenerator.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/groupby/NoDictionaryMultiColumnGroupKeyGenerator.java
index 26857e2dd25..8f588b49d9e 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/groupby/NoDictionaryMultiColumnGroupKeyGenerator.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/groupby/NoDictionaryMultiColumnGroupKeyGenerator.java
@@ -32,7 +32,9 @@ import org.apache.pinot.core.operator.ColumnContext;
 import org.apache.pinot.core.operator.blocks.ValueBlock;
 import org.apache.pinot.core.query.aggregation.groupby.utils.ValueToIdMap;
 import 
org.apache.pinot.core.query.aggregation.groupby.utils.ValueToIdMapFactory;
+import org.apache.pinot.segment.spi.datasource.DataSource;
 import org.apache.pinot.segment.spi.index.reader.Dictionary;
+import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
 import org.apache.pinot.spi.data.FieldSpec.DataType;
 import org.apache.pinot.spi.utils.ByteArray;
 import org.apache.pinot.spi.utils.FixedIntArray;
@@ -78,7 +80,14 @@ public class NoDictionaryMultiColumnGroupKeyGenerator 
implements GroupKeyGenerat
       ExpressionContext groupByExpression = groupByExpressions[i];
       ColumnContext columnContext = 
projectOperator.getResultColumnContext(groupByExpression);
       _storedTypes[i] = columnContext.getDataType().getStoredType();
+      // Only take the dict-id path when the column has a dictionary AND its 
forward index is dict-encoded.
+      // A column can have a dictionary alongside a RAW forward index (e.g. 
dict + inverted/range), in which case
+      // BlockValSet#getDictionaryIdsSV would route to 
ForwardIndexReader#readDictIds and throw on the raw forward
+      // index. Fall back to an on-the-fly dictionary on raw values instead.
       Dictionary dictionary = _nullHandlingEnabled ? null : 
columnContext.getDictionary();
+      if (dictionary != null && !hasDictEncodedForwardIndex(columnContext)) {
+        dictionary = null;
+      }
       if (dictionary != null) {
         _dictionaries[i] = dictionary;
       } else {
@@ -428,6 +437,20 @@ public class NoDictionaryMultiColumnGroupKeyGenerator 
implements GroupKeyGenerat
     return new GroupKeyIterator();
   }
 
+  /**
+   * Returns {@code true} if the column has a dict-encoded forward index, i.e. 
{@link BlockValSet#getDictionaryIdsSV}
+   * is callable. A column referenced by a transform (rather than directly) 
has no underlying {@link DataSource}; in
+   * that case the transform builds its own dictionary on the fly, so the 
dict-id path is always usable.
+   */
+  private static boolean hasDictEncodedForwardIndex(ColumnContext 
columnContext) {
+    DataSource dataSource = columnContext.getDataSource();
+    if (dataSource == null) {
+      return true;
+    }
+    ForwardIndexReader<?> forwardIndex = dataSource.getForwardIndex();
+    return forwardIndex == null || forwardIndex.isDictionaryEncoded();
+  }
+
   /**
    * Helper method to get or create group-id for a group key.
    *
diff --git 
a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/RawForwardIndexWithDictionaryTest.java
 
b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/RawForwardIndexWithDictionaryTest.java
index eebde67c7f8..2fa08e27adf 100644
--- 
a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/RawForwardIndexWithDictionaryTest.java
+++ 
b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/RawForwardIndexWithDictionaryTest.java
@@ -517,6 +517,31 @@ public class RawForwardIndexWithDictionaryTest extends 
CustomDataQueryClusterInt
     assertEquals(rawRows, dictRows, "DISTINCT rows must match between 
dictionary-only and raw+dictionary columns");
   }
 
+  /**
+   * Multi-column GROUP BY that mixes a dict-encoded column with a 
RAW+dictionary column. This forces the executor
+   * onto the {@code NoDictionaryMultiColumnGroupKeyGenerator} path. The 
per-column branch there must check the
+   * forward-index encoding in addition to {@code ColumnContext#getDictionary} 
— otherwise it keeps the dictionary
+   * for any column that has a dict file and calls {@code 
BlockValSet#getDictionaryIdsSV()} on it, which routes to
+   * {@code ForwardIndexReader#readDictIds} and throws {@code 
UnsupportedOperationException} on a RAW forward index.
+   */
+  @Test(dataProvider = "useBothQueryEngines")
+  public void 
testMultiColumnGroupByWithRawDictColumnReturnsSameResults(boolean 
useMultiStageQueryEngine)
+      throws Exception {
+    setUseMultiStageQueryEngine(useMultiStageQueryEngine);
+    JsonNode dictRows = postQuery(
+        String.format("SELECT %s, %s, COUNT(*) FROM %s GROUP BY %s, %s ORDER 
BY %s, %s",
+            DICT_DIMENSION, DICT_INT_DIMENSION, getTableName(),
+            DICT_DIMENSION, DICT_INT_DIMENSION, DICT_DIMENSION, 
DICT_INT_DIMENSION))
+        .get("resultTable").get("rows");
+    JsonNode rawRows = postQuery(
+        String.format("SELECT %s, %s, COUNT(*) FROM %s GROUP BY %s, %s ORDER 
BY %s, %s",
+            RAW_DICT_DIMENSION, RAW_DICT_INT_DIMENSION, getTableName(),
+            RAW_DICT_DIMENSION, RAW_DICT_INT_DIMENSION, RAW_DICT_DIMENSION, 
RAW_DICT_INT_DIMENSION))
+        .get("resultTable").get("rows");
+    assertEquals(rawRows, dictRows,
+        "Multi-column GROUP BY rows must match between dictionary-only and 
raw+dictionary columns");
+  }
+
   @Test(dataProvider = "useBothQueryEngines")
   public void 
testAggregationWithGroupByOnRawDictColumnReturnsSameResults(boolean 
useMultiStageQueryEngine)
       throws Exception {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to