Re: [PR] Improve PointRangeQuery's "inverse" optimization. [lucene]

2025-03-13 Thread via GitHub


jpountz commented on PR #14353:
URL: https://github.com/apache/lucene/pull/14353#issuecomment-2721270141

   Nightly benchmarks don't show a change since their points are not clustered 
by doc ID so they don't call `visit(DocIdSetIterator)`.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14333:
URL: https://github.com/apache/lucene/pull/14333#discussion_r1993755452


##
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java:
##
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** TODO make it a more memory efficient structure */
+class Trie {
+
+  static final int SIGN_NO_CHILDREN = 0x00;
+  static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01;
+  static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02;
+  static final int SIGN_MULTI_CHILDREN = 0x03;
+
+  record Output(long fp, boolean hasTerms, BytesRef floorData) {}
+
+  private enum Status {
+UNSAVED,
+SAVED,
+DESTROYED
+  }
+
+  private static class Node {
+private final int label;
+private final LinkedList children;
+private Output output;
+private long fp = -1;
+
+Node(int label, Output output, LinkedList children) {
+  this.label = label;
+  this.output = output;
+  this.children = children;
+}
+  }
+
+  private Status status = Status.UNSAVED;
+  final Node root = new Node(0, null, new LinkedList<>());
+
+  Trie(BytesRef k, Output v) {
+if (k.length == 0) {
+  root.output = v;
+  return;
+}
+Node parent = root;
+for (int i = 0; i < k.length; i++) {
+  int b = k.bytes[i + k.offset] & 0xFF;
+  Output output = i == k.length - 1 ? v : null;
+  Node node = new Node(b, output, new LinkedList<>());
+  parent.children.add(node);
+  parent = node;
+}
+  }
+
+  void putAll(Trie trie) {
+if (status != Status.UNSAVED || trie.status != Status.UNSAVED) {
+  throw new IllegalStateException("tries should be unsaved");
+}
+trie.status = Status.DESTROYED;
+putAll(this.root, trie.root);
+  }
+
+  private static void putAll(Node n, Node add) {
+assert n.label == add.label;
+if (add.output != null) {
+  n.output = add.output;

Review Comment:
   Actually I intended to make it work like `java.util.Map#putAll`, simply 
overwriting the original output if any. This makes tests easier.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14333:
URL: https://github.com/apache/lucene/pull/14333#discussion_r1993758687


##
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java:
##
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** TODO make it a more memory efficient structure */
+class Trie {
+
+  static final int SIGN_NO_CHILDREN = 0x00;
+  static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01;
+  static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02;
+  static final int SIGN_MULTI_CHILDREN = 0x03;
+
+  record Output(long fp, boolean hasTerms, BytesRef floorData) {}
+
+  private enum Status {
+UNSAVED,
+SAVED,
+DESTROYED
+  }
+
+  private static class Node {
+private final int label;
+private final LinkedList children;
+private Output output;
+private long fp = -1;
+
+Node(int label, Output output, LinkedList children) {
+  this.label = label;
+  this.output = output;
+  this.children = children;
+}
+  }
+
+  private Status status = Status.UNSAVED;
+  final Node root = new Node(0, null, new LinkedList<>());
+
+  Trie(BytesRef k, Output v) {
+if (k.length == 0) {
+  root.output = v;
+  return;
+}
+Node parent = root;
+for (int i = 0; i < k.length; i++) {
+  int b = k.bytes[i + k.offset] & 0xFF;
+  Output output = i == k.length - 1 ? v : null;
+  Node node = new Node(b, output, new LinkedList<>());
+  parent.children.add(node);
+  parent = node;
+}
+  }
+
+  void putAll(Trie trie) {
+if (status != Status.UNSAVED || trie.status != Status.UNSAVED) {
+  throw new IllegalStateException("tries should be unsaved");
+}
+trie.status = Status.DESTROYED;
+putAll(this.root, trie.root);
+  }
+
+  private static void putAll(Node n, Node add) {
+assert n.label == add.label;
+if (add.output != null) {
+  n.output = add.output;
+}
+ListIterator iter = n.children.listIterator();
+// TODO we can do more efficient if there is no intersection, block tree 
always do that

Review Comment:
   I believe it is O(M + N), as each element is only visited once.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14333:
URL: https://github.com/apache/lucene/pull/14333#discussion_r1993751047


##
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java:
##
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** TODO make it a more memory efficient structure */
+class Trie {
+
+  static final int SIGN_NO_CHILDREN = 0x00;
+  static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01;
+  static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02;
+  static final int SIGN_MULTI_CHILDREN = 0x03;
+
+  record Output(long fp, boolean hasTerms, BytesRef floorData) {}
+
+  private enum Status {
+UNSAVED,
+SAVED,
+DESTROYED
+  }
+
+  private static class Node {
+private final int label;
+private final LinkedList children;

Review Comment:
   > LinkedList has highish RAM overhead ... is this only used during building, 
not at search time?
   
   Yes, it is only used during building. I added a todo to make the trie more 
memory-efficient.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14333:
URL: https://github.com/apache/lucene/pull/14333#discussion_r1993776438


##
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java:
##
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** TODO make it a more memory efficient structure */
+class Trie {
+
+  static final int SIGN_NO_CHILDREN = 0x00;
+  static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01;
+  static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02;
+  static final int SIGN_MULTI_CHILDREN = 0x03;
+
+  record Output(long fp, boolean hasTerms, BytesRef floorData) {}
+
+  private enum Status {
+UNSAVED,
+SAVED,
+DESTROYED
+  }
+
+  private static class Node {
+private final int label;
+private final LinkedList children;
+private Output output;
+private long fp = -1;
+
+Node(int label, Output output, LinkedList children) {
+  this.label = label;
+  this.output = output;
+  this.children = children;
+}
+  }
+
+  private Status status = Status.UNSAVED;
+  final Node root = new Node(0, null, new LinkedList<>());
+
+  Trie(BytesRef k, Output v) {
+if (k.length == 0) {
+  root.output = v;
+  return;
+}
+Node parent = root;
+for (int i = 0; i < k.length; i++) {
+  int b = k.bytes[i + k.offset] & 0xFF;
+  Output output = i == k.length - 1 ? v : null;
+  Node node = new Node(b, output, new LinkedList<>());
+  parent.children.add(node);
+  parent = node;
+}
+  }
+
+  void putAll(Trie trie) {
+if (status != Status.UNSAVED || trie.status != Status.UNSAVED) {
+  throw new IllegalStateException("tries should be unsaved");
+}
+trie.status = Status.DESTROYED;
+putAll(this.root, trie.root);
+  }
+
+  private static void putAll(Node n, Node add) {
+assert n.label == add.label;
+if (add.output != null) {
+  n.output = add.output;
+}
+ListIterator iter = n.children.listIterator();
+// TODO we can do more efficient if there is no intersection, block tree 
always do that
+outer:
+for (Node addChild : add.children) {
+  while (iter.hasNext()) {
+Node nChild = iter.next();
+if (nChild.label == addChild.label) {
+  putAll(nChild, addChild);
+  continue outer;
+}
+if (nChild.label > addChild.label) {
+  iter.previous(); // move back
+  iter.add(addChild);
+  continue outer;
+}
+  }
+  iter.add(addChild);
+}
+  }
+
+  Output getEmptyOutput() {
+return root.output;
+  }
+
+  void forEach(BiConsumer consumer) {
+if (root.output != null) {
+  consumer.accept(new BytesRef(), root.output);
+}
+intersect(root.children, new BytesRefBuilder(), consumer);
+  }
+
+  private void intersect(
+  List nodes, BytesRefBuilder key, BiConsumer 
consumer) {
+for (Node node : nodes) {
+  key.append((byte) node.label);
+  if (node.output != null) consumer.accept(key.toBytesRef(), node.output);
+  intersect(node.children, key, consumer);
+  key.setLength(key.length() - 1);
+}
+  }
+
+  void save(DataOutput meta, IndexOutput index) throws IOException {
+if (status != Status.UNSAVED) {
+  throw new IllegalStateException("only unsaved trie can be saved");
+}
+status = Status.SAVED;

Review Comment:
   > is the act of saving (this method) destructive?
   
   The trie can be saved again if we reset all nodes' fp to -1. We do not 
actually need that now, so I kept it simple.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe

Re: [PR] knn search - add tests to perform exact search when filtering does not return enough results [lucene]

2025-03-13 Thread via GitHub


carlosdelest commented on code in PR #14274:
URL: https://github.com/apache/lucene/pull/14274#discussion_r1993167987


##
lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java:
##
@@ -646,6 +654,24 @@ public void testRandomWithFilter() throws IOException {
   searcher.search(
   getThrowingKnnVectorQuery("field", 
randomVector(dimension), 1, filter4),
   numDocs));
+
+  // Test a filter with cost slightly more than k, and check we use 
exact search as k
+  // results are not retrieved from approximate search
+  Query filter5 = IntPoint.newRangeQuery("tag", lower, lower + 11);
+  results =
+  searcher.search(
+  getKnnVectorQuery("field", randomVector(dimension), 10, 
filter5), numDocs);
+  assertEquals(10, results.totalHits.value());
+  assertEquals(results.totalHits.value(), results.scoreDocs.length);
+  expectThrows(
+  UnsupportedOperationException.class,
+  () ->
+  searcher.search(
+  getCappedResultsThrowingKnnVectorQuery(
+  "field", randomVector(dimension), 10, filter5, 5),
+  numDocs));
+  assertEquals(10, results.totalHits.value());
+  assertEquals(results.totalHits.value(), results.scoreDocs.length);

Review Comment:
   I did! I checked all the subclasses with 5K iterations multiple times.
   
   The test is forcing a number of results to be returned via the capped 
results query, so it will always return a number of results that will result in 
doing an exact query. Why do you think it will be flaky?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14333:
URL: https://github.com/apache/lucene/pull/14333#discussion_r1993786103


##
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java:
##
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** TODO make it a more memory efficient structure */
+class Trie {
+
+  static final int SIGN_NO_CHILDREN = 0x00;
+  static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01;
+  static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02;
+  static final int SIGN_MULTI_CHILDREN = 0x03;
+
+  record Output(long fp, boolean hasTerms, BytesRef floorData) {}
+
+  private enum Status {
+UNSAVED,
+SAVED,
+DESTROYED
+  }
+
+  private static class Node {
+private final int label;
+private final LinkedList children;
+private Output output;
+private long fp = -1;
+
+Node(int label, Output output, LinkedList children) {
+  this.label = label;
+  this.output = output;
+  this.children = children;
+}
+  }
+
+  private Status status = Status.UNSAVED;
+  final Node root = new Node(0, null, new LinkedList<>());
+
+  Trie(BytesRef k, Output v) {
+if (k.length == 0) {
+  root.output = v;
+  return;
+}
+Node parent = root;
+for (int i = 0; i < k.length; i++) {
+  int b = k.bytes[i + k.offset] & 0xFF;
+  Output output = i == k.length - 1 ? v : null;
+  Node node = new Node(b, output, new LinkedList<>());
+  parent.children.add(node);
+  parent = node;
+}
+  }
+
+  void putAll(Trie trie) {
+if (status != Status.UNSAVED || trie.status != Status.UNSAVED) {
+  throw new IllegalStateException("tries should be unsaved");
+}
+trie.status = Status.DESTROYED;
+putAll(this.root, trie.root);
+  }
+
+  private static void putAll(Node n, Node add) {
+assert n.label == add.label;
+if (add.output != null) {
+  n.output = add.output;
+}
+ListIterator iter = n.children.listIterator();
+// TODO we can do more efficient if there is no intersection, block tree 
always do that
+outer:
+for (Node addChild : add.children) {
+  while (iter.hasNext()) {
+Node nChild = iter.next();
+if (nChild.label == addChild.label) {
+  putAll(nChild, addChild);
+  continue outer;
+}
+if (nChild.label > addChild.label) {
+  iter.previous(); // move back
+  iter.add(addChild);
+  continue outer;
+}
+  }
+  iter.add(addChild);
+}
+  }
+
+  Output getEmptyOutput() {
+return root.output;
+  }
+
+  void forEach(BiConsumer consumer) {
+if (root.output != null) {
+  consumer.accept(new BytesRef(), root.output);
+}
+intersect(root.children, new BytesRefBuilder(), consumer);
+  }
+
+  private void intersect(
+  List nodes, BytesRefBuilder key, BiConsumer 
consumer) {
+for (Node node : nodes) {
+  key.append((byte) node.label);
+  if (node.output != null) consumer.accept(key.toBytesRef(), node.output);
+  intersect(node.children, key, consumer);
+  key.setLength(key.length() - 1);
+}
+  }
+
+  void save(DataOutput meta, IndexOutput index) throws IOException {
+if (status != Status.UNSAVED) {
+  throw new IllegalStateException("only unsaved trie can be saved");
+}
+status = Status.SAVED;
+meta.writeVLong(index.getFilePointer());
+saveNodes(index);
+meta.writeVLong(root.fp);
+index.writeLong(0L); // additional 8 bytes for over-reading
+meta.writeVLong(index.getFilePointer());

Review Comment:
   This writes the end file pointer of the trie index so a slice / 
randomAccessSlice can be pulled on reading side.



-- 
This is an automated mess

Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]

2025-03-13 Thread via GitHub


dweiss commented on PR #14350:
URL: https://github.com/apache/lucene/pull/14350#issuecomment-2720479164

   > @dweiss understands this one the best, he implemented it.
   
   ... 15 years ago in LUCENE-3832. Thanks for putting so much trust in my 
memory. I'll take a look.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] Disable the query cache by default. [lucene]

2025-03-13 Thread via GitHub


jpountz commented on code in PR #14187:
URL: https://github.com/apache/lucene/pull/14187#discussion_r1993476471


##
lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java:
##
@@ -77,7 +77,8 @@
 public class IndexSearcher {
 
   static int maxClauseCount = 1024;
-  private static QueryCache DEFAULT_QUERY_CACHE;
+  // Caching is disabled by default.
+  private static QueryCache DEFAULT_QUERY_CACHE = null;
   private static QueryCachingPolicy DEFAULT_CACHING_POLICY = new 
UsageTrackingQueryCachingPolicy();

Review Comment:
   Good question, I wondered about this too, and for `DEFAULT_CACHING_POLICY` 
as well. I agree that setting them in a centralized place where searchers get 
instantiated is a better practice. These defaults are still quite convenient 
for tests though, which don't have such a central place where searchers get 
created (not all tests rely on `LuceneTestCase#newSearcher`).
   
   In general, I'm in favor of this, would looking into it in a follow-up work 
for you?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14333:
URL: https://github.com/apache/lucene/pull/14333#discussion_r1993802895


##
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java:
##
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** TODO make it a more memory efficient structure */
+class Trie {
+
+  static final int SIGN_NO_CHILDREN = 0x00;
+  static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01;
+  static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02;
+  static final int SIGN_MULTI_CHILDREN = 0x03;
+
+  record Output(long fp, boolean hasTerms, BytesRef floorData) {}
+
+  private enum Status {
+UNSAVED,
+SAVED,
+DESTROYED
+  }
+
+  private static class Node {
+private final int label;
+private final LinkedList children;
+private Output output;
+private long fp = -1;
+
+Node(int label, Output output, LinkedList children) {
+  this.label = label;
+  this.output = output;
+  this.children = children;
+}
+  }
+
+  private Status status = Status.UNSAVED;
+  final Node root = new Node(0, null, new LinkedList<>());
+
+  Trie(BytesRef k, Output v) {
+if (k.length == 0) {
+  root.output = v;
+  return;
+}
+Node parent = root;
+for (int i = 0; i < k.length; i++) {
+  int b = k.bytes[i + k.offset] & 0xFF;
+  Output output = i == k.length - 1 ? v : null;
+  Node node = new Node(b, output, new LinkedList<>());
+  parent.children.add(node);
+  parent = node;
+}
+  }
+
+  void putAll(Trie trie) {
+if (status != Status.UNSAVED || trie.status != Status.UNSAVED) {
+  throw new IllegalStateException("tries should be unsaved");
+}
+trie.status = Status.DESTROYED;
+putAll(this.root, trie.root);
+  }
+
+  private static void putAll(Node n, Node add) {
+assert n.label == add.label;
+if (add.output != null) {
+  n.output = add.output;
+}
+ListIterator iter = n.children.listIterator();
+// TODO we can do more efficient if there is no intersection, block tree 
always do that
+outer:
+for (Node addChild : add.children) {
+  while (iter.hasNext()) {
+Node nChild = iter.next();
+if (nChild.label == addChild.label) {
+  putAll(nChild, addChild);

Review Comment:
   > but it does mean the length of your terms index prefixes must be under the 
stack depth recursion limit.
   
   +1, we should avoid this recursive implementation. I think we only have a 
limit of 65535 in BytesRefHash; I do not want to add a stricter limitation 
here.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14333:
URL: https://github.com/apache/lucene/pull/14333#discussion_r1993812719


##
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java:
##
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** TODO make it a more memory efficient structure */
+class Trie {
+
+  static final int SIGN_NO_CHILDREN = 0x00;
+  static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01;
+  static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02;
+  static final int SIGN_MULTI_CHILDREN = 0x03;
+
+  record Output(long fp, boolean hasTerms, BytesRef floorData) {}
+
+  private enum Status {
+UNSAVED,
+SAVED,
+DESTROYED
+  }
+
+  private static class Node {
+private final int label;
+private final LinkedList children;
+private Output output;
+private long fp = -1;
+
+Node(int label, Output output, LinkedList children) {
+  this.label = label;
+  this.output = output;
+  this.children = children;
+}
+  }
+
+  private Status status = Status.UNSAVED;
+  final Node root = new Node(0, null, new LinkedList<>());
+
+  Trie(BytesRef k, Output v) {
+if (k.length == 0) {
+  root.output = v;
+  return;
+}
+Node parent = root;
+for (int i = 0; i < k.length; i++) {
+  int b = k.bytes[i + k.offset] & 0xFF;
+  Output output = i == k.length - 1 ? v : null;
+  Node node = new Node(b, output, new LinkedList<>());
+  parent.children.add(node);
+  parent = node;
+}
+  }
+
+  void putAll(Trie trie) {
+if (status != Status.UNSAVED || trie.status != Status.UNSAVED) {
+  throw new IllegalStateException("tries should be unsaved");
+}
+trie.status = Status.DESTROYED;
+putAll(this.root, trie.root);
+  }
+
+  private static void putAll(Node n, Node add) {
+assert n.label == add.label;
+if (add.output != null) {
+  n.output = add.output;
+}
+ListIterator iter = n.children.listIterator();
+// TODO we can do more efficient if there is no intersection, block tree 
always do that
+outer:
+for (Node addChild : add.children) {
+  while (iter.hasNext()) {
+Node nChild = iter.next();
+if (nChild.label == addChild.label) {
+  putAll(nChild, addChild);
+  continue outer;
+}
+if (nChild.label > addChild.label) {
+  iter.previous(); // move back
+  iter.add(addChild);
+  continue outer;
+}
+  }
+  iter.add(addChild);
+}
+  }
+
+  Output getEmptyOutput() {
+return root.output;
+  }
+
+  void forEach(BiConsumer consumer) {
+if (root.output != null) {
+  consumer.accept(new BytesRef(), root.output);
+}
+intersect(root.children, new BytesRefBuilder(), consumer);
+  }
+
+  private void intersect(
+  List nodes, BytesRefBuilder key, BiConsumer 
consumer) {
+for (Node node : nodes) {
+  key.append((byte) node.label);
+  if (node.output != null) consumer.accept(key.toBytesRef(), node.output);
+  intersect(node.children, key, consumer);
+  key.setLength(key.length() - 1);
+}
+  }
+
+  void save(DataOutput meta, IndexOutput index) throws IOException {
+if (status != Status.UNSAVED) {
+  throw new IllegalStateException("only unsaved trie can be saved");
+}
+status = Status.SAVED;
+meta.writeVLong(index.getFilePointer());
+saveNodes(index);
+meta.writeVLong(root.fp);
+index.writeLong(0L); // additional 8 bytes for over-reading
+meta.writeVLong(index.getFilePointer());
+  }
+
+  void saveNodes(IndexOutput index) throws IOException {
+final long startFP = index.getFilePointer();
+Deque stack = new ArrayDeque<>();
+stack.p

Re: [PR] deduplicate standard BKDConfig records [lucene]

2025-03-13 Thread via GitHub


iverase merged PR #14338:
URL: https://github.com/apache/lucene/pull/14338


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



[PR] Improve DenseConjunctionBulkScorer's sparse fallback. [lucene]

2025-03-13 Thread via GitHub


jpountz opened a new pull request, #14354:
URL: https://github.com/apache/lucene/pull/14354

   `DenseConjunctionBulkScorer` has a fallback to sparse evaluation mode when 
the intersection of the clauses evaluated so far becomes sparse. Currently, 
this sparse evaluation mode clears bits in the window if they don't match new 
clauses.
   
   This change proposes to treat the bitset like any clause and evaluate the 
conjunction in the traditional way, collecting matching docs one by one.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



[PR] Fetch PR branch to fix changelog workflow [lucene]

2025-03-13 Thread via GitHub


stefanvodita opened a new pull request, #14355:
URL: https://github.com/apache/lucene/pull/14355

   I've been debugging this and I think without the `ref`, our diff was 
effectively between a commit and itself. Now we're actually diffing the PR 
against the base merge commit.
   
   Addresses #13898
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] Add venv to rat gitignore [lucene]

2025-03-13 Thread via GitHub


rmuir merged PR #14346:
URL: https://github.com/apache/lucene/pull/14346


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] Case-insensitive TermInSetQuery Implementation (Proof of Concept) [lucene]

2025-03-13 Thread via GitHub


jpountz commented on PR #14349:
URL: https://github.com/apache/lucene/pull/14349#issuecomment-2721204272

   Implementing it as a query that rewrites to the proper query based on the 
list of terms makes sense to me. You may want to move this query to the 
lucene-queries module since it doesn't really qualify as a "core" query.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [I] Create a bot to check if there is a CHANGES entry for new PRs [lucene]

2025-03-13 Thread via GitHub


stefanvodita commented on issue #13898:
URL: https://github.com/apache/lucene/issues/13898#issuecomment-2721556890

   @pseudo-nymous, I've attempted a fix in #14355.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] Fetch PR branch to fix changelog workflow [lucene]

2025-03-13 Thread via GitHub


stefanvodita merged PR #14355:
URL: https://github.com/apache/lucene/pull/14355


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14333:
URL: https://github.com/apache/lucene/pull/14333#discussion_r1993741308


##
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java:
##
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** TODO make it a more memory efficient structure */
+class Trie {
+
+  static final int SIGN_NO_CHILDREN = 0x00;
+  static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01;
+  static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02;
+  static final int SIGN_MULTI_CHILDREN = 0x03;
+
+  record Output(long fp, boolean hasTerms, BytesRef floorData) {}
+
+  private enum Status {
+UNSAVED,
+SAVED,
+DESTROYED
+  }
+
+  private static class Node {
+private final int label;
+private final LinkedList children;
+private Output output;
+private long fp = -1;
+
+Node(int label, Output output, LinkedList children) {
+  this.label = label;
+  this.output = output;
+  this.children = children;
+}
+  }
+
+  private Status status = Status.UNSAVED;
+  final Node root = new Node(0, null, new LinkedList<>());
+
+  Trie(BytesRef k, Output v) {
+if (k.length == 0) {
+  root.output = v;
+  return;
+}
+Node parent = root;
+for (int i = 0; i < k.length; i++) {
+  int b = k.bytes[i + k.offset] & 0xFF;
+  Output output = i == k.length - 1 ? v : null;
+  Node node = new Node(b, output, new LinkedList<>());
+  parent.children.add(node);
+  parent = node;
+}
+  }
+
+  void putAll(Trie trie) {
+if (status != Status.UNSAVED || trie.status != Status.UNSAVED) {
+  throw new IllegalStateException("tries should be unsaved");
+}
+trie.status = Status.DESTROYED;
+putAll(this.root, trie.root);
+  }
+
+  private static void putAll(Node n, Node add) {
+assert n.label == add.label;
+if (add.output != null) {
+  n.output = add.output;
+}
+ListIterator iter = n.children.listIterator();
+// TODO we can do more efficient if there is no intersection, block tree 
always do that
+outer:
+for (Node addChild : add.children) {
+  while (iter.hasNext()) {
+Node nChild = iter.next();
+if (nChild.label == addChild.label) {
+  putAll(nChild, addChild);
+  continue outer;
+}
+if (nChild.label > addChild.label) {
+  iter.previous(); // move back
+  iter.add(addChild);
+  continue outer;
+}
+  }
+  iter.add(addChild);
+}
+  }
+
+  Output getEmptyOutput() {
+return root.output;
+  }
+
+  void forEach(BiConsumer consumer) {
+if (root.output != null) {
+  consumer.accept(new BytesRef(), root.output);
+}
+intersect(root.children, new BytesRefBuilder(), consumer);
+  }
+
+  private void intersect(
+  List nodes, BytesRefBuilder key, BiConsumer 
consumer) {
+for (Node node : nodes) {
+  key.append((byte) node.label);
+  if (node.output != null) consumer.accept(key.toBytesRef(), node.output);
+  intersect(node.children, key, consumer);
+  key.setLength(key.length() - 1);
+}
+  }
+
+  void save(DataOutput meta, IndexOutput index) throws IOException {
+if (status != Status.UNSAVED) {
+  throw new IllegalStateException("only unsaved trie can be saved");
+}
+status = Status.SAVED;
+meta.writeVLong(index.getFilePointer());
+saveNodes(index);
+meta.writeVLong(root.fp);
+index.writeLong(0L); // additional 8 bytes for over-reading
+meta.writeVLong(index.getFilePointer());
+  }
+
+  void saveNodes(IndexOutput index) throws IOException {
+final long startFP = index.getFilePointer();
+Deque stack = new ArrayDeque<>();
+stack.p

Re: [PR] Improve PointRangeQuery's "inverse" optimization. [lucene]

2025-03-13 Thread via GitHub


jpountz merged PR #14353:
URL: https://github.com/apache/lucene/pull/14353


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [I] [DISCUSS] Could we have a different ANN algorithm for Learned Sparse Vectors? [lucene]

2025-03-13 Thread via GitHub


chishui commented on issue #13675:
URL: https://github.com/apache/lucene/issues/13675#issuecomment-2717251375

   > I have recently been interested in this direction and plan on spending non 
trivial amount of time on this over the next few weeks. Assuming we haven't 
started dev on this, I am assigning it to myself.
   > 
   > What I have been hacking on is impact sorted prioritisation of docIDs 
during the ranking phase (especially for custom rankers), so that would 
probably be the first thing that comes out of this ticket.
   
   @atris may I know if there's any progress on this?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]

2025-03-13 Thread via GitHub


dweiss commented on PR #14350:
URL: https://github.com/apache/lucene/pull/14350#issuecomment-2720681661

   Crap, you're right. Didn't think of it.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14333:
URL: https://github.com/apache/lucene/pull/14333#discussion_r1993768037


##
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java:
##
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** TODO make it a more memory efficient structure */
+class Trie {
+
+  static final int SIGN_NO_CHILDREN = 0x00;
+  static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01;
+  static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02;
+  static final int SIGN_MULTI_CHILDREN = 0x03;
+
+  record Output(long fp, boolean hasTerms, BytesRef floorData) {}
+
+  private enum Status {
+UNSAVED,
+SAVED,
+DESTROYED
+  }
+
+  private static class Node {
+private final int label;
+private final LinkedList children;
+private Output output;
+private long fp = -1;
+
+Node(int label, Output output, LinkedList children) {
+  this.label = label;
+  this.output = output;
+  this.children = children;
+}
+  }
+
+  private Status status = Status.UNSAVED;
+  final Node root = new Node(0, null, new LinkedList<>());
+
+  Trie(BytesRef k, Output v) {

Review Comment:
   I like it!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]

2025-03-13 Thread via GitHub


dweiss commented on PR #14350:
URL: https://github.com/apache/lucene/pull/14350#issuecomment-2720601955

   I think this will work just fine in most cases and is a rather inexpensive 
way to implement this case-insensitive matching, but this comes at the cost of 
the output automaton that may not be minimal. Consider this example:
   ```
   List terms = new ArrayList<>(List.of(
   newBytesRef("abc"),
   newBytesRef("aBC")));
   Collections.sort(terms);
   Automaton a = build(terms, false, false);
   ```
   which produces:
   
![image](https://github.com/user-attachments/assets/5d89a382-0fe9-4b42-bdcc-a67cb2b90ef5)
   
   However, when you naively expand just the transitions for each letter 
variant, you get this:
   
![image](https://github.com/user-attachments/assets/2c55bbc5-1c16-43c9-b148-2effbd6b1efb)
   which clearly isn't minimal (and doesn't pass checkMinimized).
   
   I think the absolute worst case is for the automaton to double the number 
of transitions - the number of states remains the same. So it's not like it's 
going to expand uncontrollably... But it's no longer minimal. Perhaps this is 
acceptable, given the constrained worst case?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] Improve PointRangeQuery's "inverse" optimization. [lucene]

2025-03-13 Thread via GitHub


jpountz commented on PR #14353:
URL: https://github.com/apache/lucene/pull/14353#issuecomment-2721927266

   Thanks @iverase !


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] Improve PointRangeQuery's "inverse" optimization. [lucene]

2025-03-13 Thread via GitHub


jpountz commented on PR #14353:
URL: https://github.com/apache/lucene/pull/14353#issuecomment-2721296892

   Maybe I spoke too soon, there seems to be a tiny speedup:
   
   ```
   TaskQPS baseline  StdDevQPS 
my_modified_version  StdDevPct diff p-value
OrStopWords   33.22  (8.1%)   32.38  
(8.4%)   -2.5% ( -17% -   15%) 0.436
   AndStopWords   31.39  (7.1%)   30.61  
(8.0%)   -2.5% ( -16% -   13%) 0.403
   Or3Terms  167.25  (4.6%)  164.49  
(5.1%)   -1.6% ( -10% -8%) 0.386
 Or2Terms2StopWords  160.66  (4.6%)  158.63  
(5.1%)   -1.3% ( -10% -8%) 0.508
  And3Terms  172.11  (4.5%)  169.98  
(5.5%)   -1.2% ( -10% -9%) 0.529
 OrHighRare  276.96  (3.4%)  273.78  
(4.8%)   -1.1% (  -9% -7%) 0.482
 TermDTSort  504.48  (4.5%)  498.78  
(2.2%)   -1.1% (  -7% -5%) 0.414
And2Terms2StopWords  162.45  (4.4%)  160.63  
(5.1%)   -1.1% ( -10% -8%) 0.552
 FilteredAndHighMed  131.89  (3.2%)  130.68  
(3.0%)   -0.9% (  -6% -5%) 0.449
   AndMedOrHighHigh   66.77  (1.1%)   66.25  
(2.0%)   -0.8% (  -3% -2%) 0.225
FilteredOrStopWords   47.15  (1.9%)   46.85  
(1.9%)   -0.6% (  -4% -3%) 0.395
  CombinedOrHighMed   72.43  (1.9%)   72.11  
(1.9%)   -0.5% (  -4% -3%) 0.544
 OrMany   18.98  (4.5%)   18.90  
(4.7%)   -0.4% (  -9% -9%) 0.809
 FilteredOrHighHigh   68.36  (1.5%)   68.12  
(1.7%)   -0.4% (  -3% -2%) 0.580
   DismaxOrHighHigh  119.08  (1.6%)  118.76  
(2.3%)   -0.3% (  -4% -3%) 0.728
 AndHighMed  131.53  (1.0%)  131.23  
(1.9%)   -0.2% (  -3% -2%) 0.695
  TermTitleSort  142.85  (1.8%)  142.61  
(1.9%)   -0.2% (  -3% -3%) 0.816
  TermDayOfYearSort  700.35  (2.7%)  699.76  
(2.5%)   -0.1% (  -5% -5%) 0.933
 CombinedOrHighHigh   18.83  (1.8%)   18.82  
(2.0%)   -0.0% (  -3% -3%) 0.952
  FilteredAnd3Terms  190.17  (2.3%)  190.10  
(1.7%)   -0.0% (  -4% -4%) 0.962
   CombinedTerm   30.97  (1.8%)   30.97  
(2.1%)   -0.0% (  -3% -3%) 0.980
   FilteredAndStopWords   55.97  (1.7%)   55.98  
(1.1%)0.0% (  -2% -2%) 0.971
   Term  467.45  (2.5%)  467.58  
(2.8%)0.0% (  -5% -5%) 0.979
 DismaxTerm  548.26  (3.5%)  548.60  
(2.3%)0.1% (  -5% -6%) 0.957
CountFilteredOrHighHigh  106.27  (1.0%)  106.35  
(0.9%)0.1% (  -1% -1%) 0.820
 OrHighHigh   53.31  (3.3%)   53.36  
(3.9%)0.1% (  -6% -7%) 0.950
AndHighHigh   44.76  (1.2%)   44.81  
(2.2%)0.1% (  -3% -3%) 0.867
FilteredAndHighHigh   70.50  (1.5%)   70.60  
(0.9%)0.2% (  -2% -2%) 0.755
  OrHighMed  200.31  (2.6%)  200.63  
(3.2%)0.2% (  -5% -6%) 0.889
   CountAndHighHigh  302.08  (1.9%)  302.58  
(1.7%)0.2% (  -3% -3%) 0.813
   PKLookup  282.14  (1.6%)  282.66  
(2.2%)0.2% (  -3% -4%) 0.805
   FilteredOr3Terms  164.11  (2.0%)  164.45  
(1.6%)0.2% (  -3% -3%) 0.774
CountOrMany   29.53  (2.8%)   29.59  
(2.2%)0.2% (  -4% -5%) 0.836
 Phrase   15.27  (4.4%)   15.30  
(4.9%)0.2% (  -8% -9%) 0.909
DismaxOrHighMed  172.63  (1.8%)  173.02  
(1.8%)0.2% (  -3% -3%) 0.750
 FilteredOrMany   17.46  (2.0%)   17.50  
(1.6%)0.2% (  -3% -3%) 0.738
   FilteredTerm  161.09  (1.5%)  161.50  
(1.0%)0.3% (  -2% -2%) 0.621
  FilteredOrHighMed  155.61  (1.4%)  156.02  
(1.3%)0.3% (  -2% -2%) 0.614
 CountFilteredOrHighMed  117.03  (0.8%)  117.35  
(0.8%)0.3% (  -1% -1%) 0.379
  TermMonthSort 3472.10  (3.6%) 3482.72  
(3.7%)0.3% (  -6% -7%) 0.830
 FilteredPhrase   32.86  (2.9%)   32.96  
(1.5%)0.3

Re: [PR] Speed up scoring conjunctions a bit. [lucene]

2025-03-13 Thread via GitHub


jpountz merged PR #14345:
URL: https://github.com/apache/lucene/pull/14345


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



[PR] Improve PointRangeQuery's "inverse" optimization. [lucene]

2025-03-13 Thread via GitHub


jpountz opened a new pull request, #14353:
URL: https://github.com/apache/lucene/pull/14353

   When a `PointRangeQuery` matches for more than 50% of points and the field is 
dense and single-valued, it internally computes the set of docs that don't 
match the query, which should be faster since fewer docs match the query than 
not.
   
   This change improves this optimization to take advantage of the 
`FixedBitSet#or(DocIdSetIterator)` optimization (which internally calls 
`DocIdSetIterator#intoBitSet`).
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] PointInSetQuery use reverse collection to improve performance [lucene]

2025-03-13 Thread via GitHub


msfroh commented on PR #14352:
URL: https://github.com/apache/lucene/pull/14352#issuecomment-2722703844

   > Another new thought is that I believe SinglePointVisitor is unnecessary, 
as processing each point separately will traverse the bkd tree multiple times. 
MergePointVisitor will only traverse the bkd tree once. Therefore, in most 
cases, SinglePointVisitor will result in slower performance. As long as 
MergePointVisitor supports both single and multiple dimensions, reverse 
collection optimization can be used.
   
   Right now, `MergePointVisitor` only supports 1-dimension, since it relies on 
the fact that it can traverse the tree from left to right at the same time as 
it's traversing the points from left to right (like a merge sort).
   
   I think it might be possible to do an N-dimensional approach like a quick 
sort, though.
   
   Suppose you have a list of N-dimensional points. At the root of the tree, (I 
think) you always have two children. If you take the upper bound of the right 
child / lower bound of the left child as your pivot, you can do an O(n) 
reordering of the list into points that potentially match on the left and 
right. (If there's a common value shared between right and left, you would have 
to push those values both ways.) As you go down the tree, you're focused on a 
sublist of the original list and can reorder that sublist based on the next 
level's pivot.
   
   The values shared between left and right might mess things up, though, since 
you can't safely reorder them on the left without potentially messing up the 
right. Hmm... might still be doable, though.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]

2025-03-13 Thread via GitHub


msfroh commented on PR #14350:
URL: https://github.com/apache/lucene/pull/14350#issuecomment-2722105723

   My thinking is that a query that uses this should lowercase, dedupe, and 
sort the input before feeding it into `StringsToAutomaton`. That would handle 
@dweiss's example (i.e. that input is "invalid", or at least exact duplicates 
wouldn't add any new states, I think, since the full string exists as a prior 
prefix). 
   
   Would a check with `Character.isLowerCase()` on each input codepoint for the 
case-insensitive case be sufficient to reject that kind of input across all 
valid Unicode strings?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] Address completion fields testing gap and truly allow loading FST off heap [lucene]

2025-03-13 Thread via GitHub


github-actions[bot] commented on PR #14270:
URL: https://github.com/apache/lucene/pull/14270#issuecomment-2722992549

   This PR has not had activity in the past 2 weeks, labeling it as stale. If 
the PR is waiting for review, notify the d...@lucene.apache.org list. Thank you 
for your contribution!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] Add posTagFormat parameter for OpenNLPPOSFilter [lucene]

2025-03-13 Thread via GitHub


github-actions[bot] commented on PR #14194:
URL: https://github.com/apache/lucene/pull/14194#issuecomment-2722992638

   This PR has not had activity in the past 2 weeks, labeling it as stale. If 
the PR is waiting for review, notify the d...@lucene.apache.org list. Thank you 
for your contribution!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] Make Lucene better at skipping long runs of matches. [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14312:
URL: https://github.com/apache/lucene/pull/14312#discussion_r1987237995


##
lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java:
##
@@ -117,6 +107,65 @@ private static int advance(FixedBitSet set, int i) {
 }
   }
 
+  private int scoreWindow(
+  LeafCollector collector, Bits acceptDocs, List 
iterators, int min, int max)
+  throws IOException {
+
+// Advance all iterators to the first doc that is greater than or equal to 
min. This is
+// important as this is the only place where we can take advantage of a 
large gap between
+// consecutive matches in any clause.
+for (DocIdSetIterator iterator : iterators) {
+  if (iterator.docID() >= min) {
+min = iterator.docID();
+  } else {
+min = iterator.advance(min);
+  }
+}
+if (min >= max) {
+  return min;
+}
+
+int bitsetWindowMax = (int) Math.min(max, (long) min + WINDOW_SIZE);
+
+if (acceptDocs == null) {
+  int minDocIDRunEnd = max;
+  for (DocIdSetIterator iterator : iterators) {
+if (iterator.docID() > min) {
+  minDocIDRunEnd = min;
+  break;
+} else {
+  minDocIDRunEnd = Math.min(minDocIDRunEnd, iterator.docIDRunEnd());
+}
+  }
+
+  if (minDocIDRunEnd >= bitsetWindowMax) {

Review Comment:
   > I would like to avoid collecting tiny windows of doc IDs at once, so that 
collectors can feel free to apply logic that has some overhead in 
LeafCollector#collect(DocIdStream) (e.g. 
https://github.com/apache/lucene/pull/14273/files#diff-05525bb5769d4251279bcd9c76d259f8eb451a16075af357e69bd98890c3db5bR257).
   
   Good point, thanks!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]

2025-03-13 Thread via GitHub


rmuir commented on PR #14350:
URL: https://github.com/apache/lucene/pull/14350#issuecomment-2722901888

   > Would a check with `Character.isLowerCase()` on each input codepoint for 
the case-insensitive case be sufficient to reject that kind of input across all 
valid Unicode strings?
   
   I don't think so for Greek. I would step back from that and try to get 
matching working with simple Character.toLowerCase/toUpperCase first? If the 
user provides data with a certain order/casing as you suggest, will they always 
get a DFA?
   
   I'm less concerned about it being minimal, let's start with deterministic. I 
don't think we should do this if it will explode (e.g. NFA). And union of 
strings really wants to do that, if handled the naive way, that's why there is 
a special algorithm for it.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]

2025-03-13 Thread via GitHub


msfroh commented on PR #14350:
URL: https://github.com/apache/lucene/pull/14350#issuecomment-2722961519

   To the best of my understanding from reading the through the code while 
sketching this PR, I believe it would produce a minimal DFA if every character 
in a set of alternatives in the input strings have the same canonical 
representation. (The existing implementation already throws if input is not 
sorted BytesRefs.)
   
   That is, if you input `cap, cat, cats, cob`, it will generate the minimal 
DFA. If you input `CAP, CAT, CATS, COB`, you'll end up with the same minimal 
DFA (albeit with the transitions added in the opposite order, which I think is 
fine). But if you input `CAP, CATS, cat, cob`, you'll end up with a NFA.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] PointInSetQuery clips segments by lower and upper [lucene]

2025-03-13 Thread via GitHub


hanbj commented on PR #14268:
URL: https://github.com/apache/lucene/pull/14268#issuecomment-2720535117

   @stefanvodita Thank you for the review. Unit tests have been added.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] knn search - add tests to perform exact search when filtering does not return enough results [lucene]

2025-03-13 Thread via GitHub


benwtrent commented on code in PR #14274:
URL: https://github.com/apache/lucene/pull/14274#discussion_r1993097312


##
lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java:
##
@@ -646,6 +654,24 @@ public void testRandomWithFilter() throws IOException {
   searcher.search(
   getThrowingKnnVectorQuery("field", 
randomVector(dimension), 1, filter4),
   numDocs));
+
+  // Test a filter with cost slightly more than k, and check we use 
exact search as k
+  // results are not retrieved from approximate search
+  Query filter5 = IntPoint.newRangeQuery("tag", lower, lower + 11);
+  results =
+  searcher.search(
+  getKnnVectorQuery("field", randomVector(dimension), 10, 
filter5), numDocs);
+  assertEquals(10, results.totalHits.value());
+  assertEquals(results.totalHits.value(), results.scoreDocs.length);
+  expectThrows(
+  UnsupportedOperationException.class,
+  () ->
+  searcher.search(
+  getCappedResultsThrowingKnnVectorQuery(
+  "field", randomVector(dimension), 10, filter5, 5),
+  numDocs));
+  assertEquals(10, results.totalHits.value());
+  assertEquals(results.totalHits.value(), results.scoreDocs.length);

Review Comment:
   I suspect this is flaky, have you ran this many times locally to detect any 
weird edges?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


mikemccand commented on PR #14333:
URL: https://github.com/apache/lucene/pull/14333#issuecomment-2720657669

   This is an exciting change @gf2121!  Smaller, simpler, and faster!?  I'll 
try to review soon.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] PointInSetQuery clips segments by lower and upper [lucene]

2025-03-13 Thread via GitHub


iverase commented on PR #14268:
URL: https://github.com/apache/lucene/pull/14268#issuecomment-2720590130

   >When creating a PointInSetQuery object, the data in the packedPoints 
parameter is returned in order, so the maximum and minimum values can be 
determined when iterating over packedPoints.
   
   I don't think this is in the BKD tree contract; it feels like you are taking 
advantage of an implementation detail?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]

2025-03-13 Thread via GitHub


rmuir commented on PR #14350:
URL: https://github.com/apache/lucene/pull/14350#issuecomment-2720652612

   Bigger downside: that example isn't deterministic either.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



[PR] PointInSetQuery use reverse collection to improve performance [lucene]

2025-03-13 Thread via GitHub


hanbj opened a new pull request, #14352:
URL: https://github.com/apache/lucene/pull/14352

   ### Description
   
   Performance issues with the terms number field were encountered in production 
environments: high query time and very high CPU usage.
   
   Through analysis and localization, it was found that the main time 
consumption is in the build_store stage. The main task of build_store is to 
collect document IDs and fill bitset in memory. When there are a large number 
of document IDs, it takes up a lot of CPU.
   
   After reading the code and optimizing it, the performance of the terms number 
field is improved by 5-10 times in the low-cardinality case.
   
   1. values.getDocCount() == reader.maxDoc() : ensures that each document 
contains this field.
   2. values.getDocCount() == values.size() : ensures that each document has 
only one value for this field.
   3. cost() > reader.maxDoc() / 2 : ensures that more than half of the 
documents are matched by the points to be queried.
   
   Because when each document has only one value for this field, the document 
id in the bkd tree will not be duplicated. There will be no collection of 
duplicate document IDs. Therefore, first determine the number of documents that 
match all the points to be queried. If it exceeds half, use reverse collection 
to reduce the number of collected document ids.
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]

2025-03-13 Thread via GitHub


dweiss commented on PR #14350:
URL: https://github.com/apache/lucene/pull/14350#issuecomment-2720790704

   I also don't think you can make it deterministic in any trivial way.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


-
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org



Re: [PR] A specialized Trie for Block Tree Index [lucene]

2025-03-13 Thread via GitHub


gf2121 commented on code in PR #14333:
URL: https://github.com/apache/lucene/pull/14333#discussion_r1994852913


##
lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java:
##
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90.blocktree;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.function.BiConsumer;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/** TODO make it a more memory efficient structure */
+class Trie {
+
+  static final int SIGN_NO_CHILDREN = 0x00;
+  static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01;
+  static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02;
+  static final int SIGN_MULTI_CHILDREN = 0x03;
+
+  record Output(long fp, boolean hasTerms, BytesRef floorData) {}
+
+  private enum Status {
+UNSAVED,
+SAVED,
+DESTROYED
+  }
+
+  private static class Node {
+private final int label;
+private final LinkedList children;
+private Output output;
+private long fp = -1;
+
+Node(int label, Output output, LinkedList children) {
+  this.label = label;
+  this.output = output;
+  this.children = children;
+}
+  }
+
+  private Status status = Status.UNSAVED;
+  final Node root = new Node(0, null, new LinkedList<>());
+
+  Trie(BytesRef k, Output v) {
+if (k.length == 0) {
+  root.output = v;
+  return;
+}
+Node parent = root;
+for (int i = 0; i < k.length; i++) {
+  int b = k.bytes[i + k.offset] & 0xFF;
+  Output output = i == k.length - 1 ? v : null;
+  Node node = new Node(b, output, new LinkedList<>());
+  parent.children.add(node);
+  parent = node;
+}
+  }
+
+  void putAll(Trie trie) {
+if (status != Status.UNSAVED || trie.status != Status.UNSAVED) {
+  throw new IllegalStateException("tries should be unsaved");
+}
+trie.status = Status.DESTROYED;
+putAll(this.root, trie.root);
+  }
+
+  private static void putAll(Node n, Node add) {
+assert n.label == add.label;
+if (add.output != null) {
+  n.output = add.output;
+}
+ListIterator iter = n.children.listIterator();
+// TODO we can do more efficient if there is no intersection, block tree 
always do that
+outer:
+for (Node addChild : add.children) {
+  while (iter.hasNext()) {
+Node nChild = iter.next();
+if (nChild.label == addChild.label) {
+  putAll(nChild, addChild);
+  continue outer;
+}
+if (nChild.label > addChild.label) {
+  iter.previous(); // move back
+  iter.add(addChild);
+  continue outer;
+}
+  }
+  iter.add(addChild);
+}
+  }
+
+  Output getEmptyOutput() {
+return root.output;
+  }
+
+  void forEach(BiConsumer consumer) {
+if (root.output != null) {
+  consumer.accept(new BytesRef(), root.output);
+}
+intersect(root.children, new BytesRefBuilder(), consumer);
+  }
+
+  private void intersect(
+  List nodes, BytesRefBuilder key, BiConsumer 
consumer) {
+for (Node node : nodes) {
+  key.append((byte) node.label);
+  if (node.output != null) consumer.accept(key.toBytesRef(), node.output);
+  intersect(node.children, key, consumer);
+  key.setLength(key.length() - 1);
+}
+  }
+
+  void save(DataOutput meta, IndexOutput index) throws IOException {
+if (status != Status.UNSAVED) {
+  throw new IllegalStateException("only unsaved trie can be saved");
+}
+status = Status.SAVED;
+meta.writeVLong(index.getFilePointer());
+saveNodes(index);
+meta.writeVLong(root.fp);
+index.writeLong(0L); // additional 8 bytes for over-reading
+meta.writeVLong(index.getFilePointer());
+  }
+
+  void saveNodes(IndexOutput index) throws IOException {
+final long startFP = index.getFilePointer();
+Deque stack = new ArrayDeque<>();
+stack.p