Re: [PR] Improve PointRangeQuery's "inverse" optimization. [lucene]
jpountz commented on PR #14353: URL: https://github.com/apache/lucene/pull/14353#issuecomment-2721270141 Nightly benchmarks don't show a change since their points are not clustered by doc ID so they don't call `visit(DocIdSetIterator)`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
gf2121 commented on code in PR #14333: URL: https://github.com/apache/lucene/pull/14333#discussion_r1993755452 ## lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java: ## @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** TODO make it a more memory efficient structure */ +class Trie { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { +UNSAVED, +SAVED, +DESTROYED + } + + private static class Node { +private final int label; +private final LinkedList children; +private Output output; +private long fp = -1; + +Node(int label, Output output, LinkedList children) { + this.label = label; + this.output = output; + this.children = children; +} + } + + private Status status = Status.UNSAVED; + final Node root = new Node(0, null, new LinkedList<>()); + + Trie(BytesRef k, Output v) { +if (k.length == 0) { + root.output = v; + return; +} +Node parent = root; +for (int i = 0; i < k.length; i++) { + int b = k.bytes[i + k.offset] & 0xFF; + Output output = i == k.length - 1 ? 
v : null; + Node node = new Node(b, output, new LinkedList<>()); + parent.children.add(node); + parent = node; +} + } + + void putAll(Trie trie) { +if (status != Status.UNSAVED || trie.status != Status.UNSAVED) { + throw new IllegalStateException("tries should be unsaved"); +} +trie.status = Status.DESTROYED; +putAll(this.root, trie.root); + } + + private static void putAll(Node n, Node add) { +assert n.label == add.label; +if (add.output != null) { + n.output = add.output; Review Comment: Actually I intended to make it work like `java.util.Map#putAll`, simply overwrite the origin output if any. This makes tests easier. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
gf2121 commented on code in PR #14333: URL: https://github.com/apache/lucene/pull/14333#discussion_r1993758687 ## lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java: ## @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** TODO make it a more memory efficient structure */ +class Trie { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { +UNSAVED, +SAVED, +DESTROYED + } + + private static class Node { +private final int label; +private final LinkedList children; +private Output output; +private long fp = -1; + +Node(int label, Output output, LinkedList children) { + this.label = label; + this.output = output; + this.children = children; +} + } + + private Status status = Status.UNSAVED; + final Node root = new Node(0, null, new LinkedList<>()); + + Trie(BytesRef k, Output v) { +if (k.length == 0) { + root.output = v; + return; +} +Node parent = root; +for (int i = 0; i < k.length; i++) { + int b = k.bytes[i + k.offset] & 0xFF; + Output output = i == k.length - 1 ? 
v : null; + Node node = new Node(b, output, new LinkedList<>()); + parent.children.add(node); + parent = node; +} + } + + void putAll(Trie trie) { +if (status != Status.UNSAVED || trie.status != Status.UNSAVED) { + throw new IllegalStateException("tries should be unsaved"); +} +trie.status = Status.DESTROYED; +putAll(this.root, trie.root); + } + + private static void putAll(Node n, Node add) { +assert n.label == add.label; +if (add.output != null) { + n.output = add.output; +} +ListIterator iter = n.children.listIterator(); +// TODO we can do more efficient if there is no intersection, block tree always do that Review Comment: I believe it is a O(M + N) as each element only visited once. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
gf2121 commented on code in PR #14333: URL: https://github.com/apache/lucene/pull/14333#discussion_r1993751047 ## lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java: ## @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** TODO make it a more memory efficient structure */ +class Trie { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { +UNSAVED, +SAVED, +DESTROYED + } + + private static class Node { +private final int label; +private final LinkedList children; Review Comment: > LinkedList has highish RAM overhead ... is this only used during building, not at search time? Yes, it is only used during building. I added a todo to make the trie more memory-efficient. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
gf2121 commented on code in PR #14333: URL: https://github.com/apache/lucene/pull/14333#discussion_r1993776438 ## lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java: ## @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** TODO make it a more memory efficient structure */ +class Trie { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { +UNSAVED, +SAVED, +DESTROYED + } + + private static class Node { +private final int label; +private final LinkedList children; +private Output output; +private long fp = -1; + +Node(int label, Output output, LinkedList children) { + this.label = label; + this.output = output; + this.children = children; +} + } + + private Status status = Status.UNSAVED; + final Node root = new Node(0, null, new LinkedList<>()); + + Trie(BytesRef k, Output v) { +if (k.length == 0) { + root.output = v; + return; +} +Node parent = root; +for (int i = 0; i < k.length; i++) { + int b = k.bytes[i + k.offset] & 0xFF; + Output output = i == k.length - 1 ? 
v : null; + Node node = new Node(b, output, new LinkedList<>()); + parent.children.add(node); + parent = node; +} + } + + void putAll(Trie trie) { +if (status != Status.UNSAVED || trie.status != Status.UNSAVED) { + throw new IllegalStateException("tries should be unsaved"); +} +trie.status = Status.DESTROYED; +putAll(this.root, trie.root); + } + + private static void putAll(Node n, Node add) { +assert n.label == add.label; +if (add.output != null) { + n.output = add.output; +} +ListIterator iter = n.children.listIterator(); +// TODO we can do more efficient if there is no intersection, block tree always do that +outer: +for (Node addChild : add.children) { + while (iter.hasNext()) { +Node nChild = iter.next(); +if (nChild.label == addChild.label) { + putAll(nChild, addChild); + continue outer; +} +if (nChild.label > addChild.label) { + iter.previous(); // move back + iter.add(addChild); + continue outer; +} + } + iter.add(addChild); +} + } + + Output getEmptyOutput() { +return root.output; + } + + void forEach(BiConsumer consumer) { +if (root.output != null) { + consumer.accept(new BytesRef(), root.output); +} +intersect(root.children, new BytesRefBuilder(), consumer); + } + + private void intersect( + List nodes, BytesRefBuilder key, BiConsumer consumer) { +for (Node node : nodes) { + key.append((byte) node.label); + if (node.output != null) consumer.accept(key.toBytesRef(), node.output); + intersect(node.children, key, consumer); + key.setLength(key.length() - 1); +} + } + + void save(DataOutput meta, IndexOutput index) throws IOException { +if (status != Status.UNSAVED) { + throw new IllegalStateException("only unsaved trie can be saved"); +} +status = Status.SAVED; Review Comment: > is the act of saving (this method) destructive? The trie can be saved again if we reset all nodes' fp to -1. We do not actually need that now so i kept it simple. -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] knn search - add tests to perform exact search when filtering does not return enough results [lucene]
carlosdelest commented on code in PR #14274: URL: https://github.com/apache/lucene/pull/14274#discussion_r1993167987 ## lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java: ## @@ -646,6 +654,24 @@ public void testRandomWithFilter() throws IOException { searcher.search( getThrowingKnnVectorQuery("field", randomVector(dimension), 1, filter4), numDocs)); + + // Test a filter with cost slightly more than k, and check we use exact search as k + // results are not retrieved from approximate search + Query filter5 = IntPoint.newRangeQuery("tag", lower, lower + 11); + results = + searcher.search( + getKnnVectorQuery("field", randomVector(dimension), 10, filter5), numDocs); + assertEquals(10, results.totalHits.value()); + assertEquals(results.totalHits.value(), results.scoreDocs.length); + expectThrows( + UnsupportedOperationException.class, + () -> + searcher.search( + getCappedResultsThrowingKnnVectorQuery( + "field", randomVector(dimension), 10, filter5, 5), + numDocs)); + assertEquals(10, results.totalHits.value()); + assertEquals(results.totalHits.value(), results.scoreDocs.length); Review Comment: I did! I checked all the subclasses with 5K iterations multiple times. The test is forcing a number of results to be returned via the capped results query, so it will always return a number of results that will result in doing an exact query. Why do you think it will be flaky? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
gf2121 commented on code in PR #14333: URL: https://github.com/apache/lucene/pull/14333#discussion_r1993786103 ## lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java: ## @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** TODO make it a more memory efficient structure */ +class Trie { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { +UNSAVED, +SAVED, +DESTROYED + } + + private static class Node { +private final int label; +private final LinkedList children; +private Output output; +private long fp = -1; + +Node(int label, Output output, LinkedList children) { + this.label = label; + this.output = output; + this.children = children; +} + } + + private Status status = Status.UNSAVED; + final Node root = new Node(0, null, new LinkedList<>()); + + Trie(BytesRef k, Output v) { +if (k.length == 0) { + root.output = v; + return; +} +Node parent = root; +for (int i = 0; i < k.length; i++) { + int b = k.bytes[i + k.offset] & 0xFF; + Output output = i == k.length - 1 ? 
v : null; + Node node = new Node(b, output, new LinkedList<>()); + parent.children.add(node); + parent = node; +} + } + + void putAll(Trie trie) { +if (status != Status.UNSAVED || trie.status != Status.UNSAVED) { + throw new IllegalStateException("tries should be unsaved"); +} +trie.status = Status.DESTROYED; +putAll(this.root, trie.root); + } + + private static void putAll(Node n, Node add) { +assert n.label == add.label; +if (add.output != null) { + n.output = add.output; +} +ListIterator iter = n.children.listIterator(); +// TODO we can do more efficient if there is no intersection, block tree always do that +outer: +for (Node addChild : add.children) { + while (iter.hasNext()) { +Node nChild = iter.next(); +if (nChild.label == addChild.label) { + putAll(nChild, addChild); + continue outer; +} +if (nChild.label > addChild.label) { + iter.previous(); // move back + iter.add(addChild); + continue outer; +} + } + iter.add(addChild); +} + } + + Output getEmptyOutput() { +return root.output; + } + + void forEach(BiConsumer consumer) { +if (root.output != null) { + consumer.accept(new BytesRef(), root.output); +} +intersect(root.children, new BytesRefBuilder(), consumer); + } + + private void intersect( + List nodes, BytesRefBuilder key, BiConsumer consumer) { +for (Node node : nodes) { + key.append((byte) node.label); + if (node.output != null) consumer.accept(key.toBytesRef(), node.output); + intersect(node.children, key, consumer); + key.setLength(key.length() - 1); +} + } + + void save(DataOutput meta, IndexOutput index) throws IOException { +if (status != Status.UNSAVED) { + throw new IllegalStateException("only unsaved trie can be saved"); +} +status = Status.SAVED; +meta.writeVLong(index.getFilePointer()); +saveNodes(index); +meta.writeVLong(root.fp); +index.writeLong(0L); // additional 8 bytes for over-reading +meta.writeVLong(index.getFilePointer()); Review Comment: This writes the end file pointer of the trie index so a slice / randomAccessSlice can be 
pulled on the reading side. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]
dweiss commented on PR #14350: URL: https://github.com/apache/lucene/pull/14350#issuecomment-2720479164 > @dweiss understands this one the best, he implemented it. ... 15 years ago in LUCENE-3832. Thanks for putting so much trust in my memory. I'll take a look. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Disable the query cache by default. [lucene]
jpountz commented on code in PR #14187: URL: https://github.com/apache/lucene/pull/14187#discussion_r1993476471 ## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ## @@ -77,7 +77,8 @@ public class IndexSearcher { static int maxClauseCount = 1024; - private static QueryCache DEFAULT_QUERY_CACHE; + // Caching is disabled by default. + private static QueryCache DEFAULT_QUERY_CACHE = null; private static QueryCachingPolicy DEFAULT_CACHING_POLICY = new UsageTrackingQueryCachingPolicy(); Review Comment: Good question, I wondered about this too, and for `DEFAULT_CACHING_POLICY` as well. I agree that setting them in a centralized place where searchers get instantiated is a better practice. These defaults are still quite convenient for tests though, which don't have such a central place where searchers get created (not all tests rely on `LuceneTestCase#newSearcher`). In general, I'm in favor of this, would looking into it in a follow-up work for you? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
gf2121 commented on code in PR #14333: URL: https://github.com/apache/lucene/pull/14333#discussion_r1993802895 ## lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java: ## @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** TODO make it a more memory efficient structure */ +class Trie { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { +UNSAVED, +SAVED, +DESTROYED + } + + private static class Node { +private final int label; +private final LinkedList children; +private Output output; +private long fp = -1; + +Node(int label, Output output, LinkedList children) { + this.label = label; + this.output = output; + this.children = children; +} + } + + private Status status = Status.UNSAVED; + final Node root = new Node(0, null, new LinkedList<>()); + + Trie(BytesRef k, Output v) { +if (k.length == 0) { + root.output = v; + return; +} +Node parent = root; +for (int i = 0; i < k.length; i++) { + int b = k.bytes[i + k.offset] & 0xFF; + Output output = i == k.length - 1 ? 
v : null; + Node node = new Node(b, output, new LinkedList<>()); + parent.children.add(node); + parent = node; +} + } + + void putAll(Trie trie) { +if (status != Status.UNSAVED || trie.status != Status.UNSAVED) { + throw new IllegalStateException("tries should be unsaved"); +} +trie.status = Status.DESTROYED; +putAll(this.root, trie.root); + } + + private static void putAll(Node n, Node add) { +assert n.label == add.label; +if (add.output != null) { + n.output = add.output; +} +ListIterator iter = n.children.listIterator(); +// TODO we can do more efficient if there is no intersection, block tree always do that +outer: +for (Node addChild : add.children) { + while (iter.hasNext()) { +Node nChild = iter.next(); +if (nChild.label == addChild.label) { + putAll(nChild, addChild); Review Comment: > but it does mean the length of your terms index prefixes must be under the stack depth recursion limit. +1, we should avoid this recursive implementation. I think we only have limit of 65535 in BytesRefHash, i do not want to add more strict limitation here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
gf2121 commented on code in PR #14333: URL: https://github.com/apache/lucene/pull/14333#discussion_r1993812719 ## lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java: ## @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** TODO make it a more memory efficient structure */ +class Trie { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { +UNSAVED, +SAVED, +DESTROYED + } + + private static class Node { +private final int label; +private final LinkedList children; +private Output output; +private long fp = -1; + +Node(int label, Output output, LinkedList children) { + this.label = label; + this.output = output; + this.children = children; +} + } + + private Status status = Status.UNSAVED; + final Node root = new Node(0, null, new LinkedList<>()); + + Trie(BytesRef k, Output v) { +if (k.length == 0) { + root.output = v; + return; +} +Node parent = root; +for (int i = 0; i < k.length; i++) { + int b = k.bytes[i + k.offset] & 0xFF; + Output output = i == k.length - 1 ? 
v : null; + Node node = new Node(b, output, new LinkedList<>()); + parent.children.add(node); + parent = node; +} + } + + void putAll(Trie trie) { +if (status != Status.UNSAVED || trie.status != Status.UNSAVED) { + throw new IllegalStateException("tries should be unsaved"); +} +trie.status = Status.DESTROYED; +putAll(this.root, trie.root); + } + + private static void putAll(Node n, Node add) { +assert n.label == add.label; +if (add.output != null) { + n.output = add.output; +} +ListIterator iter = n.children.listIterator(); +// TODO we can do more efficient if there is no intersection, block tree always do that +outer: +for (Node addChild : add.children) { + while (iter.hasNext()) { +Node nChild = iter.next(); +if (nChild.label == addChild.label) { + putAll(nChild, addChild); + continue outer; +} +if (nChild.label > addChild.label) { + iter.previous(); // move back + iter.add(addChild); + continue outer; +} + } + iter.add(addChild); +} + } + + Output getEmptyOutput() { +return root.output; + } + + void forEach(BiConsumer consumer) { +if (root.output != null) { + consumer.accept(new BytesRef(), root.output); +} +intersect(root.children, new BytesRefBuilder(), consumer); + } + + private void intersect( + List nodes, BytesRefBuilder key, BiConsumer consumer) { +for (Node node : nodes) { + key.append((byte) node.label); + if (node.output != null) consumer.accept(key.toBytesRef(), node.output); + intersect(node.children, key, consumer); + key.setLength(key.length() - 1); +} + } + + void save(DataOutput meta, IndexOutput index) throws IOException { +if (status != Status.UNSAVED) { + throw new IllegalStateException("only unsaved trie can be saved"); +} +status = Status.SAVED; +meta.writeVLong(index.getFilePointer()); +saveNodes(index); +meta.writeVLong(root.fp); +index.writeLong(0L); // additional 8 bytes for over-reading +meta.writeVLong(index.getFilePointer()); + } + + void saveNodes(IndexOutput index) throws IOException { +final long startFP = index.getFilePointer(); 
+Deque stack = new ArrayDeque<>(); +stack.p
Re: [PR] deduplicate standard BKDConfig records [lucene]
iverase merged PR #14338: URL: https://github.com/apache/lucene/pull/14338 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
[PR] Improve DenseConjunctionBulkScorer's sparse fallback. [lucene]
jpountz opened a new pull request, #14354: URL: https://github.com/apache/lucene/pull/14354 `DenseConjunctionBulkScorer` has a fallback to sparse evaluation mode when the intersection of the clauses evaluated so far becomes sparse. Currently, this sparse evaluation mode clears bits in the window if they don't match new clauses. This change proposes to treat the bitset like any clause and evaluate the conjunction in the traditional way, collecting matching docs one by one. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
[PR] Fetch PR branch to fix changelog workflow [lucene]
stefanvodita opened a new pull request, #14355: URL: https://github.com/apache/lucene/pull/14355 I've been debugging this and I think without the `ref`, our diff was effectively between a commit and itself. Now we're actually diffing the PR against the base merge commit. Addresses #13898 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add venv to rat gitignore [lucene]
rmuir merged PR #14346: URL: https://github.com/apache/lucene/pull/14346 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Case-insensitive TermInSetQuery Implementation (Proof of Concept) [lucene]
jpountz commented on PR #14349: URL: https://github.com/apache/lucene/pull/14349#issuecomment-2721204272 Implementing it as a query that rewrites to the proper query based on the list of terms makes sense to me. You may want to move this query to the lucene-queries module since it doesn't really qualify as a "core" query. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [I] Create a bot to check if there is a CHANGES entry for new PRs [lucene]
stefanvodita commented on issue #13898: URL: https://github.com/apache/lucene/issues/13898#issuecomment-2721556890 @pseudo-nymous, I've attempted a fix in #14355. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Fetch PR branch to fix changelog workflow [lucene]
stefanvodita merged PR #14355: URL: https://github.com/apache/lucene/pull/14355 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
gf2121 commented on code in PR #14333: URL: https://github.com/apache/lucene/pull/14333#discussion_r1993741308 ## lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java: ## @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** TODO make it a more memory efficient structure */ +class Trie { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { +UNSAVED, +SAVED, +DESTROYED + } + + private static class Node { +private final int label; +private final LinkedList children; +private Output output; +private long fp = -1; + +Node(int label, Output output, LinkedList children) { + this.label = label; + this.output = output; + this.children = children; +} + } + + private Status status = Status.UNSAVED; + final Node root = new Node(0, null, new LinkedList<>()); + + Trie(BytesRef k, Output v) { +if (k.length == 0) { + root.output = v; + return; +} +Node parent = root; +for (int i = 0; i < k.length; i++) { + int b = k.bytes[i + k.offset] & 0xFF; + Output output = i == k.length - 1 ? 
v : null; + Node node = new Node(b, output, new LinkedList<>()); + parent.children.add(node); + parent = node; +} + } + + void putAll(Trie trie) { +if (status != Status.UNSAVED || trie.status != Status.UNSAVED) { + throw new IllegalStateException("tries should be unsaved"); +} +trie.status = Status.DESTROYED; +putAll(this.root, trie.root); + } + + private static void putAll(Node n, Node add) { +assert n.label == add.label; +if (add.output != null) { + n.output = add.output; +} +ListIterator iter = n.children.listIterator(); +// TODO we can do more efficient if there is no intersection, block tree always do that +outer: +for (Node addChild : add.children) { + while (iter.hasNext()) { +Node nChild = iter.next(); +if (nChild.label == addChild.label) { + putAll(nChild, addChild); + continue outer; +} +if (nChild.label > addChild.label) { + iter.previous(); // move back + iter.add(addChild); + continue outer; +} + } + iter.add(addChild); +} + } + + Output getEmptyOutput() { +return root.output; + } + + void forEach(BiConsumer consumer) { +if (root.output != null) { + consumer.accept(new BytesRef(), root.output); +} +intersect(root.children, new BytesRefBuilder(), consumer); + } + + private void intersect( + List nodes, BytesRefBuilder key, BiConsumer consumer) { +for (Node node : nodes) { + key.append((byte) node.label); + if (node.output != null) consumer.accept(key.toBytesRef(), node.output); + intersect(node.children, key, consumer); + key.setLength(key.length() - 1); +} + } + + void save(DataOutput meta, IndexOutput index) throws IOException { +if (status != Status.UNSAVED) { + throw new IllegalStateException("only unsaved trie can be saved"); +} +status = Status.SAVED; +meta.writeVLong(index.getFilePointer()); +saveNodes(index); +meta.writeVLong(root.fp); +index.writeLong(0L); // additional 8 bytes for over-reading +meta.writeVLong(index.getFilePointer()); + } + + void saveNodes(IndexOutput index) throws IOException { +final long startFP = index.getFilePointer(); 
+Deque stack = new ArrayDeque<>(); +stack.p
Re: [PR] Improve PointRangeQuery's "inverse" optimization. [lucene]
jpountz merged PR #14353: URL: https://github.com/apache/lucene/pull/14353 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [I] [DISCUSS] Could we have a different ANN algorithm for Learned Sparse Vectors? [lucene]
chishui commented on issue #13675: URL: https://github.com/apache/lucene/issues/13675#issuecomment-2717251375 > I have recently been interested in this direction and plan on spending non trivial amount of time on this over the next few weeks. Assuming we haven't started dev on this, I am assigning it to myself. > > What I have been hacking on is impact sorted prioritisation of docIDs during the ranking phase (especially for custom rankers), so that would probably be the first thing that comes out of this ticket. @atris may I know if there's any progress on this? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]
dweiss commented on PR #14350: URL: https://github.com/apache/lucene/pull/14350#issuecomment-2720681661 Crap, you're right. Didn't think of it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
gf2121 commented on code in PR #14333: URL: https://github.com/apache/lucene/pull/14333#discussion_r1993768037 ## lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java: ## @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** TODO make it a more memory efficient structure */ +class Trie { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { +UNSAVED, +SAVED, +DESTROYED + } + + private static class Node { +private final int label; +private final LinkedList children; +private Output output; +private long fp = -1; + +Node(int label, Output output, LinkedList children) { + this.label = label; + this.output = output; + this.children = children; +} + } + + private Status status = Status.UNSAVED; + final Node root = new Node(0, null, new LinkedList<>()); + + Trie(BytesRef k, Output v) { Review Comment: I like it! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]
dweiss commented on PR #14350: URL: https://github.com/apache/lucene/pull/14350#issuecomment-2720601955 I think this will work just fine in most cases and is a rather inexpensive way to implement this case-insensitive matching, but this comes at the cost of the output automaton that may not be minimal. Consider this example: ``` List terms = new ArrayList<>(List.of( newBytesRef("abc"), newBytesRef("aBC"))); Collections.sort(terms); Automaton a = build(terms, false, false); ``` which produces:  However, when you naively expand just the transitions for each letter variant, you get this:  which clearly isn't minimal (and doesn't pass checkMinimized). I think the absolutely worst case is for the automaton to double the number of transitions - the number of states remains the same. So it's not like it's going to expand uncontrollably... But it's no longer minimal. Perhaps this is acceptable, given the constrained worst case? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Improve PointRangeQuery's "inverse" optimization. [lucene]
jpountz commented on PR #14353: URL: https://github.com/apache/lucene/pull/14353#issuecomment-2721927266 Thanks @iverase ! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Improve PointRangeQuery's "inverse" optimization. [lucene]
jpountz commented on PR #14353: URL: https://github.com/apache/lucene/pull/14353#issuecomment-2721296892 Maybe I spoke too soon, there seems to be a tiny speedup: ``` TaskQPS baseline StdDevQPS my_modified_version StdDevPct diff p-value OrStopWords 33.22 (8.1%) 32.38 (8.4%) -2.5% ( -17% - 15%) 0.436 AndStopWords 31.39 (7.1%) 30.61 (8.0%) -2.5% ( -16% - 13%) 0.403 Or3Terms 167.25 (4.6%) 164.49 (5.1%) -1.6% ( -10% -8%) 0.386 Or2Terms2StopWords 160.66 (4.6%) 158.63 (5.1%) -1.3% ( -10% -8%) 0.508 And3Terms 172.11 (4.5%) 169.98 (5.5%) -1.2% ( -10% -9%) 0.529 OrHighRare 276.96 (3.4%) 273.78 (4.8%) -1.1% ( -9% -7%) 0.482 TermDTSort 504.48 (4.5%) 498.78 (2.2%) -1.1% ( -7% -5%) 0.414 And2Terms2StopWords 162.45 (4.4%) 160.63 (5.1%) -1.1% ( -10% -8%) 0.552 FilteredAndHighMed 131.89 (3.2%) 130.68 (3.0%) -0.9% ( -6% -5%) 0.449 AndMedOrHighHigh 66.77 (1.1%) 66.25 (2.0%) -0.8% ( -3% -2%) 0.225 FilteredOrStopWords 47.15 (1.9%) 46.85 (1.9%) -0.6% ( -4% -3%) 0.395 CombinedOrHighMed 72.43 (1.9%) 72.11 (1.9%) -0.5% ( -4% -3%) 0.544 OrMany 18.98 (4.5%) 18.90 (4.7%) -0.4% ( -9% -9%) 0.809 FilteredOrHighHigh 68.36 (1.5%) 68.12 (1.7%) -0.4% ( -3% -2%) 0.580 DismaxOrHighHigh 119.08 (1.6%) 118.76 (2.3%) -0.3% ( -4% -3%) 0.728 AndHighMed 131.53 (1.0%) 131.23 (1.9%) -0.2% ( -3% -2%) 0.695 TermTitleSort 142.85 (1.8%) 142.61 (1.9%) -0.2% ( -3% -3%) 0.816 TermDayOfYearSort 700.35 (2.7%) 699.76 (2.5%) -0.1% ( -5% -5%) 0.933 CombinedOrHighHigh 18.83 (1.8%) 18.82 (2.0%) -0.0% ( -3% -3%) 0.952 FilteredAnd3Terms 190.17 (2.3%) 190.10 (1.7%) -0.0% ( -4% -4%) 0.962 CombinedTerm 30.97 (1.8%) 30.97 (2.1%) -0.0% ( -3% -3%) 0.980 FilteredAndStopWords 55.97 (1.7%) 55.98 (1.1%)0.0% ( -2% -2%) 0.971 Term 467.45 (2.5%) 467.58 (2.8%)0.0% ( -5% -5%) 0.979 DismaxTerm 548.26 (3.5%) 548.60 (2.3%)0.1% ( -5% -6%) 0.957 CountFilteredOrHighHigh 106.27 (1.0%) 106.35 (0.9%)0.1% ( -1% -1%) 0.820 OrHighHigh 53.31 (3.3%) 53.36 (3.9%)0.1% ( -6% -7%) 0.950 AndHighHigh 44.76 (1.2%) 44.81 (2.2%)0.1% ( -3% -3%) 0.867 
FilteredAndHighHigh 70.50 (1.5%) 70.60 (0.9%)0.2% ( -2% -2%) 0.755 OrHighMed 200.31 (2.6%) 200.63 (3.2%)0.2% ( -5% -6%) 0.889 CountAndHighHigh 302.08 (1.9%) 302.58 (1.7%)0.2% ( -3% -3%) 0.813 PKLookup 282.14 (1.6%) 282.66 (2.2%)0.2% ( -3% -4%) 0.805 FilteredOr3Terms 164.11 (2.0%) 164.45 (1.6%)0.2% ( -3% -3%) 0.774 CountOrMany 29.53 (2.8%) 29.59 (2.2%)0.2% ( -4% -5%) 0.836 Phrase 15.27 (4.4%) 15.30 (4.9%)0.2% ( -8% -9%) 0.909 DismaxOrHighMed 172.63 (1.8%) 173.02 (1.8%)0.2% ( -3% -3%) 0.750 FilteredOrMany 17.46 (2.0%) 17.50 (1.6%)0.2% ( -3% -3%) 0.738 FilteredTerm 161.09 (1.5%) 161.50 (1.0%)0.3% ( -2% -2%) 0.621 FilteredOrHighMed 155.61 (1.4%) 156.02 (1.3%)0.3% ( -2% -2%) 0.614 CountFilteredOrHighMed 117.03 (0.8%) 117.35 (0.8%)0.3% ( -1% -1%) 0.379 TermMonthSort 3472.10 (3.6%) 3482.72 (3.7%)0.3% ( -6% -7%) 0.830 FilteredPhrase 32.86 (2.9%) 32.96 (1.5%)0.3
Re: [PR] Speed up scoring conjunctions a bit. [lucene]
jpountz merged PR #14345: URL: https://github.com/apache/lucene/pull/14345 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
[PR] Improve PointRangeQuery's "inverse" optimization. [lucene]
jpountz opened a new pull request, #14353: URL: https://github.com/apache/lucene/pull/14353 When a `PointRangeQuery` matches for than 50% of points and the field is dense and single-valued, it internally computes the set of docs that don't match the query, which should be faster since fewer docs match the query than not. This change improves this optimization to take advantage of the `FixedBitSet#or(DocIdSetIterator)` optimization (which internally calls `DocIdSetIterator#intoBitSet`). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] PointInSetQuery use reverse collection to improve performance [lucene]
msfroh commented on PR #14352: URL: https://github.com/apache/lucene/pull/14352#issuecomment-2722703844 > Another new thought is that I believe SinglePointVisitor is unnecessary, as processing each point separately will traverse the bkd tree multiple times. MergePointVisitor will only traverse the bkd tree once. Therefore, in most cases, SinglePointVisitor will result in slower performance. As long as MergePointVisitor supports both single and multiple dimensions, reverse collection optimization can be used. Right now, `MergePointVisitor` only supports 1-dimension, since it relies on the fact that it can traverse the tree from left to right at the same time as it's traversing the points from left to right (like a merge sort). I think it might be possible to do an N-dimensional approach like a quick sort, though. Suppose you have a list of N-dimensional points. At the root of the tree, (I think) you always have two children. If you take the upper bound of the right child / lower bound of the left child as your pivot, you can do an O(n) reordering of the list into points that potentially match on the left and right. (If there's a common value shared between right and left, you would have to push those values both ways.) As you go down the tree, you're focused on a sublist of the original list and can reorder that sublist based on the next level's pivot. The values shared between left and right might mess things up, though, since you can't safely reorder them on the left without potentially messing up the right. Hmm... might still be doable, though. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]
msfroh commented on PR #14350: URL: https://github.com/apache/lucene/pull/14350#issuecomment-2722105723 My thinking is that a query that uses this should lowercase, dedupe, and sort the input before feeding it into `StringsToAutomaton`. That would handle @dweiss's example (i.e. that input is "invalid", or at least exact duplicates wouldn't add any new states, I think, since the full string exists as a prior prefix). Would a check with `Character.isLowerCase()` on each input codepoint for the case-insensitive case be sufficient to reject that kind of input across all valid Unicode strings? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Address completion fields testing gap and truly allow loading FST off heap [lucene]
github-actions[bot] commented on PR #14270: URL: https://github.com/apache/lucene/pull/14270#issuecomment-2722992549 This PR has not had activity in the past 2 weeks, labeling it as stale. If the PR is waiting for review, notify the d...@lucene.apache.org list. Thank you for your contribution! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add posTagFormat parameter for OpenNLPPOSFilter [lucene]
github-actions[bot] commented on PR #14194: URL: https://github.com/apache/lucene/pull/14194#issuecomment-2722992638 This PR has not had activity in the past 2 weeks, labeling it as stale. If the PR is waiting for review, notify the d...@lucene.apache.org list. Thank you for your contribution! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Make Lucene better at skipping long runs of matches. [lucene]
gf2121 commented on code in PR #14312: URL: https://github.com/apache/lucene/pull/14312#discussion_r1987237995 ## lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java: ## @@ -117,6 +107,65 @@ private static int advance(FixedBitSet set, int i) { } } + private int scoreWindow( + LeafCollector collector, Bits acceptDocs, List iterators, int min, int max) + throws IOException { + +// Advance all iterators to the first doc that is greater than or equal to min. This is +// important as this is the only place where we can take advantage of a large gap between +// consecutive matches in any clause. +for (DocIdSetIterator iterator : iterators) { + if (iterator.docID() >= min) { +min = iterator.docID(); + } else { +min = iterator.advance(min); + } +} +if (min >= max) { + return min; +} + +int bitsetWindowMax = (int) Math.min(max, (long) min + WINDOW_SIZE); + +if (acceptDocs == null) { + int minDocIDRunEnd = max; + for (DocIdSetIterator iterator : iterators) { +if (iterator.docID() > min) { + minDocIDRunEnd = min; + break; +} else { + minDocIDRunEnd = Math.min(minDocIDRunEnd, iterator.docIDRunEnd()); +} + } + + if (minDocIDRunEnd >= bitsetWindowMax) { Review Comment: > I would like to avoid collecting tiny windows of doc IDs at once, so that collectors can feel free to apply logic that has some overhead in LeafCollector#collect(DocIdStream) (e.g. https://github.com/apache/lucene/pull/14273/files#diff-05525bb5769d4251279bcd9c76d259f8eb451a16075af357e69bd98890c3db5bR257). Good point, thanks! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]
rmuir commented on PR #14350: URL: https://github.com/apache/lucene/pull/14350#issuecomment-2722901888 > Would a check with `Character.isLowerCase()` on each input codepoint for the case-insensitive case be sufficient to reject that kind of input across all valid Unicode strings? I don't think so for Greek. I would step back from that and try to get matching working with simple Character.toLowerCase/toUpperCase first? If the user provides data with a certain order/casing as you suggest, will they always get a DFA? I'm less concerned about it being minimal, let's start with deterministic. I don't think we should do this if it will explode (e.g. NFA). And union of strings really wants to do that, if handled the naive way, that's why there is a special algorithm for it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]
msfroh commented on PR #14350: URL: https://github.com/apache/lucene/pull/14350#issuecomment-2722961519 To the best of my understanding from reading the through the code while sketching this PR, I believe it would produce a minimal DFA if every character in a set of alternatives in the input strings have the same canonical representation. (The existing implementation already throws if input is not sorted BytesRefs.) That is, if you input `cap, cat, cats, cob`, it will generate the minimal DFA. If you input `CAP, CAT, CATS, COB`, you'll end up with the same minimal DFA (albeit with the transitions added in the opposite order, which I think is fine). But if you input `CAP, CATS, cat, cob`, you'll end up with a NFA. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] PointInSetQuery clips segments by lower and upper [lucene]
hanbj commented on PR #14268: URL: https://github.com/apache/lucene/pull/14268#issuecomment-2720535117 @stefanvodita Thank you for the review. Unit testing has been added -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] knn search - add tests to perform exact search when filtering does not return enough results [lucene]
benwtrent commented on code in PR #14274: URL: https://github.com/apache/lucene/pull/14274#discussion_r1993097312 ## lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java: ## @@ -646,6 +654,24 @@ public void testRandomWithFilter() throws IOException { searcher.search( getThrowingKnnVectorQuery("field", randomVector(dimension), 1, filter4), numDocs)); + + // Test a filter with cost slightly more than k, and check we use exact search as k + // results are not retrieved from approximate search + Query filter5 = IntPoint.newRangeQuery("tag", lower, lower + 11); + results = + searcher.search( + getKnnVectorQuery("field", randomVector(dimension), 10, filter5), numDocs); + assertEquals(10, results.totalHits.value()); + assertEquals(results.totalHits.value(), results.scoreDocs.length); + expectThrows( + UnsupportedOperationException.class, + () -> + searcher.search( + getCappedResultsThrowingKnnVectorQuery( + "field", randomVector(dimension), 10, filter5, 5), + numDocs)); + assertEquals(10, results.totalHits.value()); + assertEquals(results.totalHits.value(), results.scoreDocs.length); Review Comment: I suspect this is flaky, have you ran this many times locally to detect any weird edges? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
mikemccand commented on PR #14333: URL: https://github.com/apache/lucene/pull/14333#issuecomment-2720657669 This is an exciting change @gf2121! Smaller, simpler, and faster!? I'll try to review soon. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] PointInSetQuery clips segments by lower and upper [lucene]
iverase commented on PR #14268: URL: https://github.com/apache/lucene/pull/14268#issuecomment-2720590130 >When creating a PointInSetQuery object, the data in the packedPoints parameter is returned in order, so the maximum and minimum values can be determined when iterating over packedPoints. I don't think this is in the BKD tree contract, it feels like you are taking advantage of an implementation detail? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]
rmuir commented on PR #14350: URL: https://github.com/apache/lucene/pull/14350#issuecomment-2720652612 Bigger downside: that example isn't deterministic either. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
[PR] PointInSetQuery use reverse collection to improve performance [lucene]
hanbj opened a new pull request, #14352: URL: https://github.com/apache/lucene/pull/14352 ### Description Performance issues with terms number field encountered in production environments: high query time and very high CPU usage. Through analysis and localization, it was found that the main time consumption is in the build_store stage. The main task of build_store is to collect document IDs and fill bitset in memory. When there are a large number of document IDs, it takes up a lot of CPU. After reading the code and optimizing it, the performance of terms number field is improved by 5-10 times in low cardinality case. 1. values.getDocCount() == reader.maxDoc() : ensures that each document contains this field. 2. values.getDocCount() == values.size() : ensures that each document has only one value for this field. 3. cost() > reader.maxDoc() / 2 : ensure that more than half of the documents matched by all the points to be queried. Because when each document has only one value for this field, the document id in the bkd tree will not be duplicated. There will be no collection of duplicate document IDs. Therefore, first determine the number of documents that match all the points to be queried. If it exceeds half, use reverse collection to reduce the number of collected document ids. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] [DRAFT] Case-insensitive matching over union of strings [lucene]
dweiss commented on PR #14350: URL: https://github.com/apache/lucene/pull/14350#issuecomment-2720790704 I also don't think you can make it deterministic in any trivial way. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] A specialized Trie for Block Tree Index [lucene]
gf2121 commented on code in PR #14333: URL: https://github.com/apache/lucene/pull/14333#discussion_r1994852913 ## lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Trie.java: ## @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** TODO make it a more memory efficient structure */ +class Trie { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILDREN_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILDREN_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { +UNSAVED, +SAVED, +DESTROYED + } + + private static class Node { +private final int label; +private final LinkedList children; +private Output output; +private long fp = -1; + +Node(int label, Output output, LinkedList children) { + this.label = label; + this.output = output; + this.children = children; +} + } + + private Status status = Status.UNSAVED; + final Node root = new Node(0, null, new LinkedList<>()); + + Trie(BytesRef k, Output v) { +if (k.length == 0) { + root.output = v; + return; +} +Node parent = root; +for (int i = 0; i < k.length; i++) { + int b = k.bytes[i + k.offset] & 0xFF; + Output output = i == k.length - 1 ? 
v : null; + Node node = new Node(b, output, new LinkedList<>()); + parent.children.add(node); + parent = node; +} + } + + void putAll(Trie trie) { +if (status != Status.UNSAVED || trie.status != Status.UNSAVED) { + throw new IllegalStateException("tries should be unsaved"); +} +trie.status = Status.DESTROYED; +putAll(this.root, trie.root); + } + + private static void putAll(Node n, Node add) { +assert n.label == add.label; +if (add.output != null) { + n.output = add.output; +} +ListIterator iter = n.children.listIterator(); +// TODO we can do more efficient if there is no intersection, block tree always do that +outer: +for (Node addChild : add.children) { + while (iter.hasNext()) { +Node nChild = iter.next(); +if (nChild.label == addChild.label) { + putAll(nChild, addChild); + continue outer; +} +if (nChild.label > addChild.label) { + iter.previous(); // move back + iter.add(addChild); + continue outer; +} + } + iter.add(addChild); +} + } + + Output getEmptyOutput() { +return root.output; + } + + void forEach(BiConsumer consumer) { +if (root.output != null) { + consumer.accept(new BytesRef(), root.output); +} +intersect(root.children, new BytesRefBuilder(), consumer); + } + + private void intersect( + List nodes, BytesRefBuilder key, BiConsumer consumer) { +for (Node node : nodes) { + key.append((byte) node.label); + if (node.output != null) consumer.accept(key.toBytesRef(), node.output); + intersect(node.children, key, consumer); + key.setLength(key.length() - 1); +} + } + + void save(DataOutput meta, IndexOutput index) throws IOException { +if (status != Status.UNSAVED) { + throw new IllegalStateException("only unsaved trie can be saved"); +} +status = Status.SAVED; +meta.writeVLong(index.getFilePointer()); +saveNodes(index); +meta.writeVLong(root.fp); +index.writeLong(0L); // additional 8 bytes for over-reading +meta.writeVLong(index.getFilePointer()); + } + + void saveNodes(IndexOutput index) throws IOException { +final long startFP = index.getFilePointer(); 
+Deque stack = new ArrayDeque<>(); +stack.p