stefanvodita commented on code in PR #12184: URL: https://github.com/apache/lucene/pull/12184#discussion_r1131382134
########## lucene/facet/src/java/org/apache/lucene/facet/taxonomy/ExpressionFacets.java: ########## @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet.taxonomy; + +import java.io.IOException; +import java.text.ParseException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.lucene.expressions.Bindings; +import org.apache.lucene.expressions.SimpleBindings; +import org.apache.lucene.expressions.js.JavascriptCompiler; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.search.ConjunctionUtils; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.DoubleValues; +import org.apache.lucene.search.DoubleValuesSource; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.util.FixedBitSet; + +/** + * Facet by a provided expression string. Variables in the expression string are expected to be of + * the form: {@code val_<binding_name>_[sum|max]}, where {@code <binding_name>} must be a valid + * reference in the provided {@link Bindings}. + * + * <p>nocommit: This is a simple demo and is incomplete and not tested. + */ +public class ExpressionFacets extends FloatTaxonomyFacets { + private static final Pattern RE = Pattern.compile("(?=(val_.+_(max|sum)))"); + private static final Map<String, AssociationAggregationFunction> F_MAP = new HashMap<>(); + + static { + F_MAP.put("sum", AssociationAggregationFunction.SUM); + F_MAP.put("max", AssociationAggregationFunction.MAX); + } + + public ExpressionFacets( + String indexFieldName, + TaxonomyReader taxoReader, + FacetsConfig config, + FacetsCollector facetsCollector, + String expression, + Bindings bindings) + throws IOException, ParseException { + super(indexFieldName, taxoReader, AssociationAggregationFunction.SUM, config); + Set<FieldAggregation> aggregations = parseAggregations(expression); + SimpleBindings aggregationBindings = new SimpleBindings(); + aggregate(expression, bindings, aggregationBindings, aggregations, facetsCollector); + } + + /** + * Identify the component aggregations needed by the expression. String parsing is not my strong + * suit, so I'm sure this is clunky at best and buggy at worst. + */ + private static Set<FieldAggregation> parseAggregations(String expression) { + Set<FieldAggregation> result = new HashSet<>(); + Matcher m = RE.matcher(expression); + while (m.find()) { + int start = m.start(); + int sumPos = expression.indexOf("sum", start); + if (sumPos == -1) { + sumPos = Integer.MAX_VALUE; + } + int maxPos = expression.indexOf("max", start); + if (maxPos == -1) { + maxPos = Integer.MAX_VALUE; + } + int end = Math.min(sumPos, maxPos); + if (end == Integer.MAX_VALUE) { + throw new IllegalArgumentException("Invalid syntax"); + } + end += 3; + String component = expression.substring(start, end); + String[] tokens = component.split("_"); + if (tokens.length < 3 || "val".equals(tokens[0]) == false) { + throw new IllegalArgumentException("Invalid syntax"); + } + AssociationAggregationFunction func = + F_MAP.get(tokens[tokens.length - 1].toLowerCase(Locale.ROOT)); + String ref; + if (tokens.length > 3) { + StringBuilder sb = new StringBuilder(); + for (int i = 1; i < tokens.length - 1; i++) { + sb.append(tokens[i]); + if (i < tokens.length - 2) { + sb.append("_"); + } + } + ref = sb.toString(); + } else { + ref = tokens[1]; + } + result.add(new FieldAggregation(component, ref, func)); + } + + return result; + } + + private void aggregate( + String expression, + Bindings bindings, + SimpleBindings aggregationBindings, + Set<FieldAggregation> aggregations, + FacetsCollector facetsCollector) + throws IOException, ParseException { + // Compute component aggregations: + for (FieldAggregation fa : aggregations) { Review Comment: I’ve [worked a bit with the example code](https://github.com/stefanvodita/lucene/commit/0a9ae3f8de36801c06b07879f2d924bab28ac9b6) to see how these ideas could play out. I’ll go through them in the same order. 1. As anticipated, it’s straight-forward to use the Bindings object as a global cache of aggregations. A strong objection to this is that the Bindings object would now contain values keyed by docid and keyed by ordinal. I don’t think this is a problem. When a variable is referenced in an expression it’s not ambiguous whether it is keyed by docid or ordinal. Another reason to keep field bindings separate from aggregation bindings is that aggregations make sense within the context of a particular query. 2. The user can define DVSs that have a built-in condition. With this approach, it’s the user’s responsibility to provide conditional DVSs that make sense. Another approach is to use conditions in the expressions - the ternary operator is supported. 3. I’m dropping this idea. Having considered it some more, I don’t see any benefit. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org