Re: [PR] Avoid modify merge state in per field mergers [lucene]
jpountz commented on code in PR #13208: URL: https://github.com/apache/lucene/pull/13208#discussion_r1537183041 ## lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java: ## @@ -31,40 +31,18 @@ import org.apache.lucene.index.MergeState; import org.apache.lucene.index.Terms; -/** - * Utility class to update the {@link MergeState} instance to be restricted to a set of fields. - * - * Warning: the input {@linkplain MergeState} instance will be updated when calling {@link - * #apply(Collection)}. - * - * It should be called within a {@code try {...} finally {...}} block to make sure that the - * mergeState instance is restored to its original state: - * - * - * PerFieldMergeState pfMergeState = new PerFieldMergeState(mergeState); - * try { - * doSomething(pfMergeState.apply(fields)); - * ... - * } finally { - * pfMergeState.reset(); - * } - * - */ +/** Utility class to update the {@link MergeState} instance to be restricted to a set of fields. */ final class PerFieldMergeState { private final MergeState in; - private final FieldInfos orgMergeFieldInfos; - private final FieldInfos[] orgFieldInfos; - private final FieldsProducer[] orgFieldsProducers; + private final MergeState copy; Review Comment: Should we dynamically allocate this copy in `apply()` so that each caller gets an independent instance, and we're less subject to a similar kind of bug in the future due to multiple threads trying to modify this `copy` at the same time? ## lucene/core/src/java/org/apache/lucene/index/MergeState.java: ## @@ -23,13 +23,7 @@ import java.util.Locale; import java.util.concurrent.Executor; import java.util.concurrent.TimeUnit; -import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.NormsProducer; -import org.apache.lucene.codecs.PointsReader; -import org.apache.lucene.codecs.StoredFieldsReader; -import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.codecs.*; Review Comment: We prefer to avoid star imports. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
jpountz commented on code in PR #13205: URL: https://github.com/apache/lucene/pull/13205#discussion_r1537197013 ## lucene/core/src/java/org/apache/lucene/store/IOContext.java: ## @@ -62,91 +55,36 @@ public enum Context { public static final IOContext LOAD = new IOContext(false, true); - public IOContext() { -this(false, false); - } - - public IOContext(FlushInfo flushInfo) { -assert flushInfo != null; -this.context = Context.FLUSH; -this.mergeInfo = null; -this.readOnce = false; -this.load = false; -this.flushInfo = flushInfo; - } - - public IOContext(Context context) { -this(context, null); + @SuppressWarnings("incomplete-switch") + public IOContext { +switch (context) { + case MERGE -> Objects.requireNonNull( + mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> Objects.requireNonNull( + flushInfo, "flushInfo must not be null if context is FLUSH"); +} } private IOContext(boolean readOnce, boolean load) { -this.context = Context.READ; -this.mergeInfo = null; -this.readOnce = readOnce; -this.load = load; -this.flushInfo = null; - } - - public IOContext(MergeInfo mergeInfo) { -this(Context.MERGE, mergeInfo); +this(Context.READ, null, null, readOnce, load); } - private IOContext(Context context, MergeInfo mergeInfo) { -assert context != Context.MERGE || mergeInfo != null -: "MergeInfo must not be null if context is MERGE"; -assert context != Context.FLUSH : "Use IOContext(FlushInfo) to create a FLUSH IOContext"; -this.context = context; -this.readOnce = false; -this.load = false; -this.mergeInfo = mergeInfo; -this.flushInfo = null; + private IOContext(Context context) { +this(context, null, null, false, false); } - /** - * This constructor is used to initialize a {@link IOContext} instance with a new value for the - * readOnce variable. - * - * @param ctxt {@link IOContext} object whose information is used to create the new instance - * except the readOnce variable. - * @param readOnce The new {@link IOContext} object will use this value for readOnce. - */ - public IOContext(IOContext ctxt, boolean readOnce) { -this.context = ctxt.context; -this.mergeInfo = ctxt.mergeInfo; -this.flushInfo = ctxt.flushInfo; -this.readOnce = readOnce; -this.load = false; - } - - @Override - public int hashCode() { -return Objects.hash(context, flushInfo, mergeInfo, readOnce, load); + /** Creates an IOContext for flushing. */ + public IOContext(FlushInfo flushInfo) { +this(Context.FLUSH, null, flushInfo, false, false); } - @Override - public boolean equals(Object obj) { -if (this == obj) return true; -if (obj == null) return false; -if (getClass() != obj.getClass()) return false; -IOContext other = (IOContext) obj; -if (context != other.context) return false; -if (!Objects.equals(flushInfo, other.flushInfo)) return false; -if (!Objects.equals(mergeInfo, other.mergeInfo)) return false; -if (readOnce != other.readOnce) return false; -if (load != other.load) return false; -return true; + /** Creates an IOContext for merging. */ + public IOContext(MergeInfo mergeInfo) { +this(Context.MERGE, mergeInfo, null, false, false); } - @Override - public String toString() { -return "IOContext [context=" -+ context -+ ", mergeInfo=" -+ mergeInfo -+ ", flushInfo=" -+ flushInfo -+ ", readOnce=" -+ readOnce -+ "]"; + /** Return a copy of this IOContext with {@link #readOnce} set to {@code true}. */ + public IOContext toReadOnce() { +return new IOContext(context, mergeInfo, flushInfo, true, load); Review Comment: Should this set `load` to `false`? The current copy constructor that sets `readOnce` to true also forces `load` to be false. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Reduce some unnecessary ArrayUtil#grow calls [lucene]
stefanvodita commented on PR #13171: URL: https://github.com/apache/lucene/pull/13171#issuecomment-2017455119 @easyice - I'm probably missing something obvious, but I thought `ArrayUtil.grow` already made the same check. Can you explain why this PR helps? https://github.com/apache/lucene/blob/04f335ad79cae019d1a552e41e0029d061d8e3d9/lucene/core/src/java/org/apache/lucene/util/ArrayUtil.java#L430-L435 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add Romanian stopwords with s&t with comma [lucene]
stefanvodita commented on code in PR #12172: URL: https://github.com/apache/lucene/pull/12172#discussion_r1537211363 ## lucene/analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt: ## @@ -190,27 +207,34 @@ sale sau său se +și şi sînt sîntem +sînteți Review Comment: If we have `sînt`/`sîntem`/`sînteți` shouldn't we have `sânt`/`sântem`/`sânteți` as well? Maybe that's a separate PR. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Fix TestTaxonomyFacetValueSource.testRandom [lucene]
stefanvodita commented on PR #13198: URL: https://github.com/apache/lucene/pull/13198#issuecomment-2017533598 Aren't we changing the random number generation when we add the merge policy, so we're no longer producing a failing case by chance? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
uschindler commented on code in PR #13205: URL: https://github.com/apache/lucene/pull/13205#discussion_r1537285277 ## lucene/core/src/java/org/apache/lucene/store/IOContext.java: ## @@ -62,91 +55,36 @@ public enum Context { public static final IOContext LOAD = new IOContext(false, true); - public IOContext() { -this(false, false); - } - - public IOContext(FlushInfo flushInfo) { -assert flushInfo != null; -this.context = Context.FLUSH; -this.mergeInfo = null; -this.readOnce = false; -this.load = false; -this.flushInfo = flushInfo; - } - - public IOContext(Context context) { -this(context, null); + @SuppressWarnings("incomplete-switch") + public IOContext { +switch (context) { + case MERGE -> Objects.requireNonNull( + mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> Objects.requireNonNull( + flushInfo, "flushInfo must not be null if context is FLUSH"); +} } private IOContext(boolean readOnce, boolean load) { -this.context = Context.READ; -this.mergeInfo = null; -this.readOnce = readOnce; -this.load = load; -this.flushInfo = null; - } - - public IOContext(MergeInfo mergeInfo) { -this(Context.MERGE, mergeInfo); +this(Context.READ, null, null, readOnce, load); } - private IOContext(Context context, MergeInfo mergeInfo) { -assert context != Context.MERGE || mergeInfo != null -: "MergeInfo must not be null if context is MERGE"; -assert context != Context.FLUSH : "Use IOContext(FlushInfo) to create a FLUSH IOContext"; -this.context = context; -this.readOnce = false; -this.load = false; -this.mergeInfo = mergeInfo; -this.flushInfo = null; + private IOContext(Context context) { +this(context, null, null, false, false); } - /** - * This constructor is used to initialize a {@link IOContext} instance with a new value for the - * readOnce variable. - * - * @param ctxt {@link IOContext} object whose information is used to create the new instance - * except the readOnce variable. - * @param readOnce The new {@link IOContext} object will use this value for readOnce. - */ - public IOContext(IOContext ctxt, boolean readOnce) { -this.context = ctxt.context; -this.mergeInfo = ctxt.mergeInfo; -this.flushInfo = ctxt.flushInfo; -this.readOnce = readOnce; -this.load = false; - } - - @Override - public int hashCode() { -return Objects.hash(context, flushInfo, mergeInfo, readOnce, load); + /** Creates an IOContext for flushing. */ + public IOContext(FlushInfo flushInfo) { +this(Context.FLUSH, null, flushInfo, false, false); } - @Override - public boolean equals(Object obj) { -if (this == obj) return true; -if (obj == null) return false; -if (getClass() != obj.getClass()) return false; -IOContext other = (IOContext) obj; -if (context != other.context) return false; -if (!Objects.equals(flushInfo, other.flushInfo)) return false; -if (!Objects.equals(mergeInfo, other.mergeInfo)) return false; -if (readOnce != other.readOnce) return false; -if (load != other.load) return false; -return true; + /** Creates an IOContext for merging. */ + public IOContext(MergeInfo mergeInfo) { +this(Context.MERGE, mergeInfo, null, false, false); } - @Override - public String toString() { -return "IOContext [context=" -+ context -+ ", mergeInfo=" -+ mergeInfo -+ ", flushInfo=" -+ flushInfo -+ ", readOnce=" -+ readOnce -+ "]"; + /** Return a copy of this IOContext with {@link #readOnce} set to {@code true}. */ + public IOContext toReadOnce() { +return new IOContext(context, mergeInfo, flushInfo, true, load); Review Comment: I missed to mention this in the issue description: This seems to be a bug in current code, because the copy-ctor did not do what its documentation said. The code is only used in [VariableGapTermsIndexReader.java](https://github.com/apache/lucene/pull/13205/files#diff-6cc5c40198a0a7cb2746909210a4ad045cb71a970c4d7fa023dbb02079eb33c7) at the moment. I don't know if ever `load=true` there, so changing this here as a bugfix is possibly a noop. Unless theres a real reason to enforce this, we must document it. What do you think? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
uschindler commented on code in PR #13205: URL: https://github.com/apache/lucene/pull/13205#discussion_r1537287589 ## lucene/core/src/java/org/apache/lucene/store/IOContext.java: ## @@ -62,91 +55,36 @@ public enum Context { public static final IOContext LOAD = new IOContext(false, true); - public IOContext() { -this(false, false); - } - - public IOContext(FlushInfo flushInfo) { -assert flushInfo != null; -this.context = Context.FLUSH; -this.mergeInfo = null; -this.readOnce = false; -this.load = false; -this.flushInfo = flushInfo; - } - - public IOContext(Context context) { -this(context, null); + @SuppressWarnings("incomplete-switch") + public IOContext { +switch (context) { + case MERGE -> Objects.requireNonNull( + mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> Objects.requireNonNull( + flushInfo, "flushInfo must not be null if context is FLUSH"); +} } private IOContext(boolean readOnce, boolean load) { -this.context = Context.READ; -this.mergeInfo = null; -this.readOnce = readOnce; -this.load = load; -this.flushInfo = null; - } - - public IOContext(MergeInfo mergeInfo) { -this(Context.MERGE, mergeInfo); +this(Context.READ, null, null, readOnce, load); } - private IOContext(Context context, MergeInfo mergeInfo) { -assert context != Context.MERGE || mergeInfo != null -: "MergeInfo must not be null if context is MERGE"; -assert context != Context.FLUSH : "Use IOContext(FlushInfo) to create a FLUSH IOContext"; -this.context = context; -this.readOnce = false; -this.load = false; -this.mergeInfo = mergeInfo; -this.flushInfo = null; + private IOContext(Context context) { +this(context, null, null, false, false); } - /** - * This constructor is used to initialize a {@link IOContext} instance with a new value for the - * readOnce variable. - * - * @param ctxt {@link IOContext} object whose information is used to create the new instance - * except the readOnce variable. - * @param readOnce The new {@link IOContext} object will use this value for readOnce. - */ - public IOContext(IOContext ctxt, boolean readOnce) { -this.context = ctxt.context; -this.mergeInfo = ctxt.mergeInfo; -this.flushInfo = ctxt.flushInfo; -this.readOnce = readOnce; -this.load = false; - } - - @Override - public int hashCode() { -return Objects.hash(context, flushInfo, mergeInfo, readOnce, load); + /** Creates an IOContext for flushing. */ + public IOContext(FlushInfo flushInfo) { +this(Context.FLUSH, null, flushInfo, false, false); } - @Override - public boolean equals(Object obj) { -if (this == obj) return true; -if (obj == null) return false; -if (getClass() != obj.getClass()) return false; -IOContext other = (IOContext) obj; -if (context != other.context) return false; -if (!Objects.equals(flushInfo, other.flushInfo)) return false; -if (!Objects.equals(mergeInfo, other.mergeInfo)) return false; -if (readOnce != other.readOnce) return false; -if (load != other.load) return false; -return true; + /** Creates an IOContext for merging. */ + public IOContext(MergeInfo mergeInfo) { +this(Context.MERGE, mergeInfo, null, false, false); } - @Override - public String toString() { -return "IOContext [context=" -+ context -+ ", mergeInfo=" -+ mergeInfo -+ ", flushInfo=" -+ flushInfo -+ ", readOnce=" -+ readOnce -+ "]"; + /** Return a copy of this IOContext with {@link #readOnce} set to {@code true}. */ + public IOContext toReadOnce() { +return new IOContext(context, mergeInfo, flushInfo, true, load); Review Comment: Actually it would be better to add "modifier" methods like toReadOnce for all booleans? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
jpountz commented on code in PR #13205: URL: https://github.com/apache/lucene/pull/13205#discussion_r1537311661 ## lucene/core/src/java/org/apache/lucene/store/IOContext.java: ## @@ -62,91 +55,36 @@ public enum Context { public static final IOContext LOAD = new IOContext(false, true); - public IOContext() { -this(false, false); - } - - public IOContext(FlushInfo flushInfo) { -assert flushInfo != null; -this.context = Context.FLUSH; -this.mergeInfo = null; -this.readOnce = false; -this.load = false; -this.flushInfo = flushInfo; - } - - public IOContext(Context context) { -this(context, null); + @SuppressWarnings("incomplete-switch") + public IOContext { +switch (context) { + case MERGE -> Objects.requireNonNull( + mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> Objects.requireNonNull( + flushInfo, "flushInfo must not be null if context is FLUSH"); +} } private IOContext(boolean readOnce, boolean load) { -this.context = Context.READ; -this.mergeInfo = null; -this.readOnce = readOnce; -this.load = load; -this.flushInfo = null; - } - - public IOContext(MergeInfo mergeInfo) { -this(Context.MERGE, mergeInfo); +this(Context.READ, null, null, readOnce, load); } - private IOContext(Context context, MergeInfo mergeInfo) { -assert context != Context.MERGE || mergeInfo != null -: "MergeInfo must not be null if context is MERGE"; -assert context != Context.FLUSH : "Use IOContext(FlushInfo) to create a FLUSH IOContext"; -this.context = context; -this.readOnce = false; -this.load = false; -this.mergeInfo = mergeInfo; -this.flushInfo = null; + private IOContext(Context context) { +this(context, null, null, false, false); } - /** - * This constructor is used to initialize a {@link IOContext} instance with a new value for the - * readOnce variable. - * - * @param ctxt {@link IOContext} object whose information is used to create the new instance - * except the readOnce variable. - * @param readOnce The new {@link IOContext} object will use this value for readOnce. - */ - public IOContext(IOContext ctxt, boolean readOnce) { -this.context = ctxt.context; -this.mergeInfo = ctxt.mergeInfo; -this.flushInfo = ctxt.flushInfo; -this.readOnce = readOnce; -this.load = false; - } - - @Override - public int hashCode() { -return Objects.hash(context, flushInfo, mergeInfo, readOnce, load); + /** Creates an IOContext for flushing. */ + public IOContext(FlushInfo flushInfo) { +this(Context.FLUSH, null, flushInfo, false, false); } - @Override - public boolean equals(Object obj) { -if (this == obj) return true; -if (obj == null) return false; -if (getClass() != obj.getClass()) return false; -IOContext other = (IOContext) obj; -if (context != other.context) return false; -if (!Objects.equals(flushInfo, other.flushInfo)) return false; -if (!Objects.equals(mergeInfo, other.mergeInfo)) return false; -if (readOnce != other.readOnce) return false; -if (load != other.load) return false; -return true; + /** Creates an IOContext for merging. */ + public IOContext(MergeInfo mergeInfo) { +this(Context.MERGE, mergeInfo, null, false, false); } - @Override - public String toString() { -return "IOContext [context=" -+ context -+ ", mergeInfo=" -+ mergeInfo -+ ", flushInfo=" -+ flushInfo -+ ", readOnce=" -+ readOnce -+ "]"; + /** Return a copy of this IOContext with {@link #readOnce} set to {@code true}. */ + public IOContext toReadOnce() { +return new IOContext(context, mergeInfo, flushInfo, true, load); Review Comment: IMO this is a bug in the doc rather than in the code. `load` imlies that the `IndexInput` will be hammered with calls to `seek`/`read`, possibly via the `RandomAccessInput` abstraction, so it's incompatible with `readOnce`. I hope we can refactor all these booleans to an enum as a next step, something like `access = READ_ONCE | READ | LOAD | RANDOM` similarly to the various pre-built `IOContext`s we expose. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
uschindler commented on code in PR #13205: URL: https://github.com/apache/lucene/pull/13205#discussion_r1537345317 ## lucene/core/src/java/org/apache/lucene/store/IOContext.java: ## @@ -62,91 +55,36 @@ public enum Context { public static final IOContext LOAD = new IOContext(false, true); - public IOContext() { -this(false, false); - } - - public IOContext(FlushInfo flushInfo) { -assert flushInfo != null; -this.context = Context.FLUSH; -this.mergeInfo = null; -this.readOnce = false; -this.load = false; -this.flushInfo = flushInfo; - } - - public IOContext(Context context) { -this(context, null); + @SuppressWarnings("incomplete-switch") + public IOContext { +switch (context) { + case MERGE -> Objects.requireNonNull( + mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> Objects.requireNonNull( + flushInfo, "flushInfo must not be null if context is FLUSH"); +} } private IOContext(boolean readOnce, boolean load) { -this.context = Context.READ; -this.mergeInfo = null; -this.readOnce = readOnce; -this.load = load; -this.flushInfo = null; - } - - public IOContext(MergeInfo mergeInfo) { -this(Context.MERGE, mergeInfo); +this(Context.READ, null, null, readOnce, load); } - private IOContext(Context context, MergeInfo mergeInfo) { -assert context != Context.MERGE || mergeInfo != null -: "MergeInfo must not be null if context is MERGE"; -assert context != Context.FLUSH : "Use IOContext(FlushInfo) to create a FLUSH IOContext"; -this.context = context; -this.readOnce = false; -this.load = false; -this.mergeInfo = mergeInfo; -this.flushInfo = null; + private IOContext(Context context) { +this(context, null, null, false, false); } - /** - * This constructor is used to initialize a {@link IOContext} instance with a new value for the - * readOnce variable. - * - * @param ctxt {@link IOContext} object whose information is used to create the new instance - * except the readOnce variable. - * @param readOnce The new {@link IOContext} object will use this value for readOnce. - */ - public IOContext(IOContext ctxt, boolean readOnce) { -this.context = ctxt.context; -this.mergeInfo = ctxt.mergeInfo; -this.flushInfo = ctxt.flushInfo; -this.readOnce = readOnce; -this.load = false; - } - - @Override - public int hashCode() { -return Objects.hash(context, flushInfo, mergeInfo, readOnce, load); + /** Creates an IOContext for flushing. */ + public IOContext(FlushInfo flushInfo) { +this(Context.FLUSH, null, flushInfo, false, false); } - @Override - public boolean equals(Object obj) { -if (this == obj) return true; -if (obj == null) return false; -if (getClass() != obj.getClass()) return false; -IOContext other = (IOContext) obj; -if (context != other.context) return false; -if (!Objects.equals(flushInfo, other.flushInfo)) return false; -if (!Objects.equals(mergeInfo, other.mergeInfo)) return false; -if (readOnce != other.readOnce) return false; -if (load != other.load) return false; -return true; + /** Creates an IOContext for merging. */ + public IOContext(MergeInfo mergeInfo) { +this(Context.MERGE, mergeInfo, null, false, false); } - @Override - public String toString() { -return "IOContext [context=" -+ context -+ ", mergeInfo=" -+ mergeInfo -+ ", flushInfo=" -+ flushInfo -+ ", readOnce=" -+ readOnce -+ "]"; + /** Return a copy of this IOContext with {@link #readOnce} set to {@code true}. */ + public IOContext toReadOnce() { +return new IOContext(context, mergeInfo, flushInfo, true, load); Review Comment: OK, will change this and the documentation. Actually so load=true and readOnce=true should be mutually exclusive. I will add a check to the default ctor. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Fix TestTaxonomyFacetValueSource.testRandom [lucene]
dweiss commented on PR #13198: URL: https://github.com/apache/lucene/pull/13198#issuecomment-2017652016 Whenever you touch the random number generator, it'll change anything down from there. Reiterate/Beast your tests to find a new offending seed (or improve the probability your change fixes the problem): https://github.com/apache/lucene/blob/main/help/tests.txt#L89-L123 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Speed up writeGroupVInts [lucene]
jpountz commented on code in PR #13203: URL: https://github.com/apache/lucene/pull/13203#discussion_r1537354439 ## lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java: ## @@ -111,4 +112,40 @@ public static int readGroupVInt( pos += 1 + n4Minus1; return (int) (pos - posStart); } + + private static int numBytes(int v) { +// | 1 to return 1 when v = 0 +return Integer.BYTES - (Integer.numberOfLeadingZeros(v | 1) >> 3); + } + + public static void writeGroupVInts(DataOutput out, byte[] scratch, long[] values, int limit) Review Comment: Can you add some minimal javadocs? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
uschindler commented on code in PR #13205: URL: https://github.com/apache/lucene/pull/13205#discussion_r1537360487 ## lucene/core/src/java/org/apache/lucene/store/IOContext.java: ## @@ -62,91 +55,36 @@ public enum Context { public static final IOContext LOAD = new IOContext(false, true); - public IOContext() { -this(false, false); - } - - public IOContext(FlushInfo flushInfo) { -assert flushInfo != null; -this.context = Context.FLUSH; -this.mergeInfo = null; -this.readOnce = false; -this.load = false; -this.flushInfo = flushInfo; - } - - public IOContext(Context context) { -this(context, null); + @SuppressWarnings("incomplete-switch") + public IOContext { +switch (context) { + case MERGE -> Objects.requireNonNull( + mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> Objects.requireNonNull( + flushInfo, "flushInfo must not be null if context is FLUSH"); +} } private IOContext(boolean readOnce, boolean load) { -this.context = Context.READ; -this.mergeInfo = null; -this.readOnce = readOnce; -this.load = load; -this.flushInfo = null; - } - - public IOContext(MergeInfo mergeInfo) { -this(Context.MERGE, mergeInfo); +this(Context.READ, null, null, readOnce, load); } - private IOContext(Context context, MergeInfo mergeInfo) { -assert context != Context.MERGE || mergeInfo != null -: "MergeInfo must not be null if context is MERGE"; -assert context != Context.FLUSH : "Use IOContext(FlushInfo) to create a FLUSH IOContext"; -this.context = context; -this.readOnce = false; -this.load = false; -this.mergeInfo = mergeInfo; -this.flushInfo = null; + private IOContext(Context context) { +this(context, null, null, false, false); } - /** - * This constructor is used to initialize a {@link IOContext} instance with a new value for the - * readOnce variable. - * - * @param ctxt {@link IOContext} object whose information is used to create the new instance - * except the readOnce variable. - * @param readOnce The new {@link IOContext} object will use this value for readOnce. - */ - public IOContext(IOContext ctxt, boolean readOnce) { -this.context = ctxt.context; -this.mergeInfo = ctxt.mergeInfo; -this.flushInfo = ctxt.flushInfo; -this.readOnce = readOnce; -this.load = false; - } - - @Override - public int hashCode() { -return Objects.hash(context, flushInfo, mergeInfo, readOnce, load); + /** Creates an IOContext for flushing. */ + public IOContext(FlushInfo flushInfo) { +this(Context.FLUSH, null, flushInfo, false, false); } - @Override - public boolean equals(Object obj) { -if (this == obj) return true; -if (obj == null) return false; -if (getClass() != obj.getClass()) return false; -IOContext other = (IOContext) obj; -if (context != other.context) return false; -if (!Objects.equals(flushInfo, other.flushInfo)) return false; -if (!Objects.equals(mergeInfo, other.mergeInfo)) return false; -if (readOnce != other.readOnce) return false; -if (load != other.load) return false; -return true; + /** Creates an IOContext for merging. */ + public IOContext(MergeInfo mergeInfo) { +this(Context.MERGE, mergeInfo, null, false, false); } - @Override - public String toString() { -return "IOContext [context=" -+ context -+ ", mergeInfo=" -+ mergeInfo -+ ", flushInfo=" -+ flushInfo -+ ", readOnce=" -+ readOnce -+ "]"; + /** Return a copy of this IOContext with {@link #readOnce} set to {@code true}. */ + public IOContext toReadOnce() { +return new IOContext(context, mergeInfo, flushInfo, true, load); Review Comment: I changed this in c2d5f083e5d7ee08fe6cabd9fa2782b5bc7de226. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
uschindler commented on code in PR #13205: URL: https://github.com/apache/lucene/pull/13205#discussion_r1537361582 ## lucene/core/src/java/org/apache/lucene/store/IOContext.java: ## @@ -62,91 +55,36 @@ public enum Context { public static final IOContext LOAD = new IOContext(false, true); - public IOContext() { -this(false, false); - } - - public IOContext(FlushInfo flushInfo) { -assert flushInfo != null; -this.context = Context.FLUSH; -this.mergeInfo = null; -this.readOnce = false; -this.load = false; -this.flushInfo = flushInfo; - } - - public IOContext(Context context) { -this(context, null); + @SuppressWarnings("incomplete-switch") + public IOContext { +switch (context) { + case MERGE -> Objects.requireNonNull( + mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> Objects.requireNonNull( + flushInfo, "flushInfo must not be null if context is FLUSH"); +} } private IOContext(boolean readOnce, boolean load) { -this.context = Context.READ; -this.mergeInfo = null; -this.readOnce = readOnce; -this.load = load; -this.flushInfo = null; - } - - public IOContext(MergeInfo mergeInfo) { -this(Context.MERGE, mergeInfo); +this(Context.READ, null, null, readOnce, load); } - private IOContext(Context context, MergeInfo mergeInfo) { -assert context != Context.MERGE || mergeInfo != null -: "MergeInfo must not be null if context is MERGE"; -assert context != Context.FLUSH : "Use IOContext(FlushInfo) to create a FLUSH IOContext"; -this.context = context; -this.readOnce = false; -this.load = false; -this.mergeInfo = mergeInfo; -this.flushInfo = null; + private IOContext(Context context) { +this(context, null, null, false, false); } - /** - * This constructor is used to initialize a {@link IOContext} instance with a new value for the - * readOnce variable. - * - * @param ctxt {@link IOContext} object whose information is used to create the new instance - * except the readOnce variable. - * @param readOnce The new {@link IOContext} object will use this value for readOnce. - */ - public IOContext(IOContext ctxt, boolean readOnce) { -this.context = ctxt.context; -this.mergeInfo = ctxt.mergeInfo; -this.flushInfo = ctxt.flushInfo; -this.readOnce = readOnce; -this.load = false; - } - - @Override - public int hashCode() { -return Objects.hash(context, flushInfo, mergeInfo, readOnce, load); + /** Creates an IOContext for flushing. */ + public IOContext(FlushInfo flushInfo) { +this(Context.FLUSH, null, flushInfo, false, false); } - @Override - public boolean equals(Object obj) { -if (this == obj) return true; -if (obj == null) return false; -if (getClass() != obj.getClass()) return false; -IOContext other = (IOContext) obj; -if (context != other.context) return false; -if (!Objects.equals(flushInfo, other.flushInfo)) return false; -if (!Objects.equals(mergeInfo, other.mergeInfo)) return false; -if (readOnce != other.readOnce) return false; -if (load != other.load) return false; -return true; + /** Creates an IOContext for merging. */ + public IOContext(MergeInfo mergeInfo) { +this(Context.MERGE, mergeInfo, null, false, false); } - @Override - public String toString() { -return "IOContext [context=" -+ context -+ ", mergeInfo=" -+ mergeInfo -+ ", flushInfo=" -+ flushInfo -+ ", readOnce=" -+ readOnce -+ "]"; + /** Return a copy of this IOContext with {@link #readOnce} set to {@code true}. */ + public IOContext toReadOnce() { +return new IOContext(context, mergeInfo, flushInfo, true, load); Review Comment: We should open an issue about this. Maybe the new enum should more follow the MADV flags like Robert suggests. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
uschindler commented on code in PR #13205: URL: https://github.com/apache/lucene/pull/13205#discussion_r1537361582 ## lucene/core/src/java/org/apache/lucene/store/IOContext.java: ## @@ -62,91 +55,36 @@ public enum Context { public static final IOContext LOAD = new IOContext(false, true); - public IOContext() { -this(false, false); - } - - public IOContext(FlushInfo flushInfo) { -assert flushInfo != null; -this.context = Context.FLUSH; -this.mergeInfo = null; -this.readOnce = false; -this.load = false; -this.flushInfo = flushInfo; - } - - public IOContext(Context context) { -this(context, null); + @SuppressWarnings("incomplete-switch") + public IOContext { +switch (context) { + case MERGE -> Objects.requireNonNull( + mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> Objects.requireNonNull( + flushInfo, "flushInfo must not be null if context is FLUSH"); +} } private IOContext(boolean readOnce, boolean load) { -this.context = Context.READ; -this.mergeInfo = null; -this.readOnce = readOnce; -this.load = load; -this.flushInfo = null; - } - - public IOContext(MergeInfo mergeInfo) { -this(Context.MERGE, mergeInfo); +this(Context.READ, null, null, readOnce, load); } - private IOContext(Context context, MergeInfo mergeInfo) { -assert context != Context.MERGE || mergeInfo != null -: "MergeInfo must not be null if context is MERGE"; -assert context != Context.FLUSH : "Use IOContext(FlushInfo) to create a FLUSH IOContext"; -this.context = context; -this.readOnce = false; -this.load = false; -this.mergeInfo = mergeInfo; -this.flushInfo = null; + private IOContext(Context context) { +this(context, null, null, false, false); } - /** - * This constructor is used to initialize a {@link IOContext} instance with a new value for the - * readOnce variable. - * - * @param ctxt {@link IOContext} object whose information is used to create the new instance - * except the readOnce variable. - * @param readOnce The new {@link IOContext} object will use this value for readOnce. - */ - public IOContext(IOContext ctxt, boolean readOnce) { -this.context = ctxt.context; -this.mergeInfo = ctxt.mergeInfo; -this.flushInfo = ctxt.flushInfo; -this.readOnce = readOnce; -this.load = false; - } - - @Override - public int hashCode() { -return Objects.hash(context, flushInfo, mergeInfo, readOnce, load); + /** Creates an IOContext for flushing. */ + public IOContext(FlushInfo flushInfo) { +this(Context.FLUSH, null, flushInfo, false, false); } - @Override - public boolean equals(Object obj) { -if (this == obj) return true; -if (obj == null) return false; -if (getClass() != obj.getClass()) return false; -IOContext other = (IOContext) obj; -if (context != other.context) return false; -if (!Objects.equals(flushInfo, other.flushInfo)) return false; -if (!Objects.equals(mergeInfo, other.mergeInfo)) return false; -if (readOnce != other.readOnce) return false; -if (load != other.load) return false; -return true; + /** Creates an IOContext for merging. */ + public IOContext(MergeInfo mergeInfo) { +this(Context.MERGE, mergeInfo, null, false, false); } - @Override - public String toString() { -return "IOContext [context=" -+ context -+ ", mergeInfo=" -+ mergeInfo -+ ", flushInfo=" -+ flushInfo -+ ", readOnce=" -+ readOnce -+ "]"; + /** Return a copy of this IOContext with {@link #readOnce} set to {@code true}. */ + public IOContext toReadOnce() { +return new IOContext(context, mergeInfo, flushInfo, true, load); Review Comment: > IMO this is a bug in the doc rather than in the code. `load` imlies that the `IndexInput` will be hammered with calls to `seek`/`read`, possibly via the `RandomAccessInput` abstraction, so it's incompatible with `readOnce`. I hope we can refactor all these booleans to an enum as a next step, something like `access = READ_ONCE | READ | LOAD | RANDOM` similarly to the various pre-built `IOContext`s we expose. We should open an issue about this. Maybe the new enum should more follow the MADV flags like Robert suggests. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add Romanian stopwords with s&t with comma [lucene]
strainu commented on code in PR #12172: URL: https://github.com/apache/lucene/pull/12172#discussion_r1537383981 ## lucene/analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt: ## @@ -190,27 +207,34 @@ sale sau său se +și şi sînt sîntem +sînteți Review Comment: No, `sânt` & co are not correct forms, the post-1993 forms are `sunt`/`suntem`/`sunteți`. See https://dexonline.ro/definitie/s%C3%A2ntem vs https://dexonline.ro/definitie/suntem -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Subtract deleted file size from the cache size of NRTCachingDirectory. [lucene]
uschindler commented on PR #13206: URL: https://github.com/apache/lucene/pull/13206#issuecomment-2017708203 Hi, > There is a weakness in the fix if the file is concurrently deleted in another thread while being closed I don't think that adding synchronization for that is really necessary. Actually NRTCachingDir would use a larger cache for a short amount of time. I prefer to fix NRTCachingDirectory to have an overall better tracking of files. Of course in real filesystems, the size of deleted, but still open files is still counted, but that's hard to implement here without refactoring the whole class to use some "inode" like structure, decoupled from filename. I wonder why this bug was not seen before? This looks like a serious issue. Maybe it was introduced when we switched to ByteBufferDirectory. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Subtract deleted file size from the cache size of NRTCachingDirectory. [lucene]
uschindler commented on PR #13206: URL: https://github.com/apache/lucene/pull/13206#issuecomment-2017717168 Actually the delete while it gets written to should never appear in Lucene. The bigger problem is when the file is still open in an NRTReader and gets deleted. I am not sure how we should handle that case? It may be serious depending on size of cache, because servers like Elasticserach or Solr often have files open longer times for reading and IndexWriter deletes them. Maybe let's think about this a bit more and maybe there's an easy way how to deal with it. Maybe all IndexInputs created by the directory should decrement when they close if the file is "scheduled for delete"? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add Romanian stopwords with s&t with comma [lucene]
stefanvodita commented on code in PR #12172: URL: https://github.com/apache/lucene/pull/12172#discussion_r1537416729 ## lucene/analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt: ## @@ -190,27 +207,34 @@ sale sau său se +și şi sînt sîntem +sînteți Review Comment: I think your point is the `î` spellings for "a fi" were used officially at some point, while the `â` spellings never were. It's tricky to talk about what is correct in a language though. Have people written text using the `â` spellings? Of course, switching `î` and `â` is a common pattern, which is "correct" for other words. Should we handle stemming for the `â` spellings then? I don't know, maybe it depends on how frequent the `â` spelling is in practice. I don't feel strongly about this one way or the other. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add Romanian stopwords with s&t with comma [lucene]
strainu commented on code in PR #12172: URL: https://github.com/apache/lucene/pull/12172#discussion_r1537424935 ## lucene/analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt: ## @@ -190,27 +207,34 @@ sale sau său se +și şi sînt sîntem +sînteți Review Comment: I don't know enough about how Lucene is used at large to understand whether there would be a difference between a search for `noi suntem romani` vs `noi suntem români` vs `noi santem romani` vs `noi sântem romani` vs `noi suntem români` (`romani` means `Romans`, `români` means `Romanians`). I would recommend we leave this for another PR though, as the current one only tries to solve the technical problems related to S comma-below vs cedilla-below. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add Romanian stopwords with s&t with comma [lucene]
strainu commented on code in PR #12172: URL: https://github.com/apache/lucene/pull/12172#discussion_r1537424935 ## lucene/analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt: ## @@ -190,27 +207,34 @@ sale sau său se +și şi sînt sîntem +sînteți Review Comment: I don't know enough about how Lucene is used at large to understand whether there would be a difference between a search for `noi suntem romani` vs `noi suntem români` vs `noi santem romani` vs `noi sântem romani` vs `noi sântem români` vs `noi suntem români` (`romani` means `Romans`, `români` means `Romanians`). I would recommend we leave this for another PR though, as the current one only tries to solve the technical problems related to S comma-below vs cedilla-below. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add timeout support to AbstractKnnVectorQuery [lucene]
kaivalnp commented on code in PR #13202: URL: https://github.com/apache/lucene/pull/13202#discussion_r1537438862 ## lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java: ## @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.QueryTimeout; +import org.apache.lucene.search.knn.KnnCollectorManager; + +/** A {@link KnnCollectorManager} that collects results with a timeout. */ +public class TimeLimitingKnnCollectorManager implements KnnCollectorManager { + private final KnnCollectorManager delegate; + private final QueryTimeout queryTimeout; + + public TimeLimitingKnnCollectorManager(KnnCollectorManager delegate, QueryTimeout timeout) { +this.delegate = delegate; +this.queryTimeout = timeout; + } + + /** Get the {@link QueryTimeout} for terminating graph searches. */ + public QueryTimeout getQueryTimeout() { +return queryTimeout; + } + + @Override + public KnnCollector newCollector(int visitedLimit, LeafReaderContext context) throws IOException { +KnnCollector collector = delegate.newCollector(visitedLimit, context); +if (queryTimeout == null) { + return collector; +} +return new KnnCollector() { + @Override + public boolean earlyTerminated() { +return queryTimeout.shouldExit() || collector.earlyTerminated(); + } + + @Override + public void incVisitedCount(int count) { +collector.incVisitedCount(count); + } + + @Override + public long visitedCount() { +return collector.visitedCount(); + } + + @Override + public long visitLimit() { +return collector.visitLimit(); + } + + @Override + public int k() { +return collector.k(); + } + + @Override + public boolean collect(int docId, float similarity) { +return collector.collect(docId, similarity); + } + + @Override + public float minCompetitiveSimilarity() { +return collector.minCompetitiveSimilarity(); + } + + @Override + public TopDocs topDocs() { +if (queryTimeout.shouldExit()) { + // Quickly exit + return TopDocsCollector.EMPTY_TOPDOCS; +} Review Comment: > callers can use `TotalHits.Relation` to figure out if search terminated early. This information is internal to `AbstractKnnVectorQuery` -- even if we pass some `TotalHits.Relation`, the rewritten `DocAndScoreQuery` treats all results as "complete" (no distinction for the relation [here](https://github.com/apache/lucene/blob/04f335ad79cae019d1a552e41e0029d061d8e3d9/lucene/core/src/java/org/apache/lucene/search/AbstractKnnVectorQuery.java#L239-L254), and does not pass it on to `IndexSearcher#search`) Though I agree, we could pass those results right now and handle them properly in a follow up issue. I've added a commit to do this -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Made the UnifiedHighlighter's hasUnrecognizedQuery function processes FunctionQuery the same way as MatchAllDocsQuery and MatchNoDocsQuery queries for performance reasons. [lucene]
romseygeek commented on code in PR #13165: URL: https://github.com/apache/lucene/pull/13165#discussion_r1537485874 ## lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java: ## @@ -1130,7 +1134,16 @@ public boolean acceptField(String field) { @Override public void visitLeaf(Query query) { if (MultiTermHighlighting.canExtractAutomataFromLeafQuery(query) == false) { - if (!(query instanceof MatchAllDocsQuery || query instanceof MatchNoDocsQuery)) { + boolean no_effect_query = false; Review Comment: `MatchAllDocsQuery` is already final, but the other two aren't. Maybe this would be better done as a new protected method, `isIgnorableQuery(Query q)` with a default implementation of `instanceof` checks, and then if somebody has their own extension (or another query implementation that will do weird things here) they can override the method? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] upgrade snowball to 26db1ab9adbf437f37a6facd3ee2aad1da9eba03 [lucene]
uschindler commented on code in PR #13209: URL: https://github.com/apache/lucene/pull/13209#discussion_r1537494919 ## lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java: ## @@ -70,6 +70,11 @@ public SnowballFilter(TokenStream input, SnowballStemmer stemmer) { public SnowballFilter(TokenStream in, String name) { super(in); Objects.requireNonNull(name, "name"); +// it was called "German2" for eons, but snowball folded it into "German" and deleted "German2" +// for now, don't annoy our users... +if (name.equals("German2")) { Review Comment: It's ok to delegate German2 to German. ## lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java: ## @@ -59,7 +59,13 @@ public class SnowballPorterFilterFactory extends TokenFilterFactory implements R /** Creates a new SnowballPorterFilterFactory */ public SnowballPorterFilterFactory(Map args) { super(args); -language = get(args, "language", "English"); +String lang = get(args, "language", "English"); +// it was called "German2" for eons, but snowball folded it into "German" and deleted "German2" Review Comment: This code is not required as the filter itself renames, but looks still fine to change early. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
[I] Replace boolean flags on MergeContext with an enum [lucene]
jpountz opened a new issue, #13211: URL: https://github.com/apache/lucene/issues/13211 ### Description `MergeContext` has a few boolean flags: `readOnce`, `load`, `randomAccess`. But some combinations of these flags don't make sense, e.g. something can't be `readOnce` and `randomAccess` at the same time. So it would be natural to identify combinations that make sense and represent them as an `enum`. Separately @rmuir suggested having a more natural translation between `IOContext` and `(m|f)advise`. So maybe this enum could be modeled after the allowed values for `posix_fadvise`: `NOREUSE` (read-once), `RANDOM`, etc. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
jpountz commented on code in PR #13205: URL: https://github.com/apache/lucene/pull/13205#discussion_r1537555897 ## lucene/core/src/java/org/apache/lucene/store/IOContext.java: ## @@ -62,91 +55,36 @@ public enum Context { public static final IOContext LOAD = new IOContext(false, true); - public IOContext() { -this(false, false); - } - - public IOContext(FlushInfo flushInfo) { -assert flushInfo != null; -this.context = Context.FLUSH; -this.mergeInfo = null; -this.readOnce = false; -this.load = false; -this.flushInfo = flushInfo; - } - - public IOContext(Context context) { -this(context, null); + @SuppressWarnings("incomplete-switch") + public IOContext { +switch (context) { + case MERGE -> Objects.requireNonNull( + mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> Objects.requireNonNull( + flushInfo, "flushInfo must not be null if context is FLUSH"); +} } private IOContext(boolean readOnce, boolean load) { -this.context = Context.READ; -this.mergeInfo = null; -this.readOnce = readOnce; -this.load = load; -this.flushInfo = null; - } - - public IOContext(MergeInfo mergeInfo) { -this(Context.MERGE, mergeInfo); +this(Context.READ, null, null, readOnce, load); } - private IOContext(Context context, MergeInfo mergeInfo) { -assert context != Context.MERGE || mergeInfo != null -: "MergeInfo must not be null if context is MERGE"; -assert context != Context.FLUSH : "Use IOContext(FlushInfo) to create a FLUSH IOContext"; -this.context = context; -this.readOnce = false; -this.load = false; -this.mergeInfo = mergeInfo; -this.flushInfo = null; + private IOContext(Context context) { +this(context, null, null, false, false); } - /** - * This constructor is used to initialize a {@link IOContext} instance with a new value for the - * readOnce variable. - * - * @param ctxt {@link IOContext} object whose information is used to create the new instance - * except the readOnce variable. - * @param readOnce The new {@link IOContext} object will use this value for readOnce. - */ - public IOContext(IOContext ctxt, boolean readOnce) { -this.context = ctxt.context; -this.mergeInfo = ctxt.mergeInfo; -this.flushInfo = ctxt.flushInfo; -this.readOnce = readOnce; -this.load = false; - } - - @Override - public int hashCode() { -return Objects.hash(context, flushInfo, mergeInfo, readOnce, load); + /** Creates an IOContext for flushing. */ + public IOContext(FlushInfo flushInfo) { +this(Context.FLUSH, null, flushInfo, false, false); } - @Override - public boolean equals(Object obj) { -if (this == obj) return true; -if (obj == null) return false; -if (getClass() != obj.getClass()) return false; -IOContext other = (IOContext) obj; -if (context != other.context) return false; -if (!Objects.equals(flushInfo, other.flushInfo)) return false; -if (!Objects.equals(mergeInfo, other.mergeInfo)) return false; -if (readOnce != other.readOnce) return false; -if (load != other.load) return false; -return true; + /** Creates an IOContext for merging. */ + public IOContext(MergeInfo mergeInfo) { +this(Context.MERGE, mergeInfo, null, false, false); } - @Override - public String toString() { -return "IOContext [context=" -+ context -+ ", mergeInfo=" -+ mergeInfo -+ ", flushInfo=" -+ flushInfo -+ ", readOnce=" -+ readOnce -+ "]"; + /** Return a copy of this IOContext with {@link #readOnce} set to {@code true}. */ + public IOContext toReadOnce() { +return new IOContext(context, mergeInfo, flushInfo, true, load); Review Comment: I opened https://github.com/apache/lucene/issues/13211. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add WrappedCandidateMatcher for composing matchers [lucene]
bjacobowitz commented on PR #13109: URL: https://github.com/apache/lucene/pull/13109#issuecomment-2017958689 > The `protected` visibility on `matchQuery()` should already be fine here - you can override or call protected methods from within subclasses. I think making `reportError()` and `finish()` `protected final` would be the neatest solution. @romseygeek On further reflection, I think protected visibility may not be sufficient for a practical solution here, even when using subclasses. I'll again use `ParallelMatcher` as an example, since I think it's a fairly representative use case. `ParallelMatcher` [takes in a `MatcherFactory`](https://github.com/apache/lucene/blob/75e1ebc4501733a6a9857858b3fd9c30304234d0/lucene/monitor/src/java/org/apache/lucene/monitor/ParallelMatcher.java#L66) to [create instances of `CandidateMatcher`](https://github.com/apache/lucene/blob/75e1ebc4501733a6a9857858b3fd9c30304234d0/lucene/monitor/src/java/org/apache/lucene/monitor/ParallelMatcher.java#L120), with the intent to [take advantage of the `matchQuery` implementation in the factory's generated instances of `CandidateMatcher`](https://github.com/apache/lucene/blob/75e1ebc4501733a6a9857858b3fd9c30304234d0/lucene/monitor/src/java/org/apache/lucene/monitor/ParallelMatcher.java#L129). If we try to reimplement `ParallelMatcher` outside the Lucene Monitor package, we can't call `matchQuery` on the factory-built `CandidateMatcher` from outside (i.e. on an instance other than `this`) because of `matchQuery`'s protected access -- the only reason the existing `ParallelMatcher` implementati on can do this is because it is in the Lucene Monitor and therefore has package-local access to the protected functions on another object in the same package. To access and invoke the implementation of matchQuery, we would need to create a subclass of the concrete `CandidateMatcher` implementation generated by the factory, which isn't directly possible, because the factory pattern creates a situation where the concrete class isn't known. We could approach a solution using an alternate factory implementation that would return a subclass of a known concrete type with hidden functions exposed. For example, we could create a subclass of `CollectingMatcher` (let's call it `ExposedCollectingMatcher`) with a public function `exposedMatchQuery` which calls `this.matchQuery()`, and then we could create a factory to return that subclass (it might be called `ExposedCollectingMatcherFactory`). The problem with that approach is that it breaks down in cases where the concrete implementation is not available to be subclassed, as is the case for several anonymous `CandidateMatcher` implementations (e.g. instances generated by [`MATCHER` factory in `ExplainingMatch`](https://github.com/apache/lucene/blob/75e1ebc4501733a6a9857858b3fd9c30304234d0/lucene/monitor/src/java/org/apache/lucene/monitor/ExplainingMatch.java#L32-L54)). Ideally we would like a factory that returns a more generic class that extends the functionality of a `CandidateMatcher`, which is what `WrappedCandidateMatcher` provides. Of course another solution would be to simply make these functions public, but I understand the desire to avoid that if possible. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Subtract deleted file size from the cache size of NRTCachingDirectory. [lucene]
jpountz commented on PR #13206: URL: https://github.com/apache/lucene/pull/13206#issuecomment-2017975876 > The bigger problem is when the file is still open in an NRTReader and gets deleted. Hmm does it actually happen? I thought index files were ref-counted so that files would only get deleted when the NRT reader gets closed? This is what `StandardDirectoryReader#doClose` suggests. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] upgrade snowball to 26db1ab9adbf437f37a6facd3ee2aad1da9eba03 [lucene]
rmuir commented on code in PR #13209: URL: https://github.com/apache/lucene/pull/13209#discussion_r1537576805 ## lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java: ## @@ -59,7 +59,13 @@ public class SnowballPorterFilterFactory extends TokenFilterFactory implements R /** Creates a new SnowballPorterFilterFactory */ public SnowballPorterFilterFactory(Map args) { super(args); -language = get(args, "language", "English"); +String lang = get(args, "language", "English"); +// it was called "German2" for eons, but snowball folded it into "German" and deleted "German2" Review Comment: The filter has two constructors: ``` /** takes explicit instance: we can't provide back compat/user help here unless we generate subclass or something messy */ SnowballFilter(TokenStream, SnowballStemmer) /** takes string: here we map German2 to German */ SnowBallFilter(TokenStream, String) ``` The factory does reflection itself and uses the first ctor. So it needs the mapping too. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Fix demo application, if no knn dict was provided and over 100 docs are indexed [lucene]
msokolov merged PR #13163: URL: https://github.com/apache/lucene/pull/13163 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [I] Demo application throws an exception when following documentation. [lucene]
msokolov closed issue #12330: Demo application throws an exception when following documentation. URL: https://github.com/apache/lucene/issues/12330 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Fix demo application, if no knn dict was provided and over 100 docs are indexed [lucene]
msokolov commented on PR #13163: URL: https://github.com/apache/lucene/pull/13163#issuecomment-2018132006 I also cherry-picked to branch_9x: 3d109bdc493b451f967a7ad3102267c9d177ade0 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Avoid modify merge state in per field mergers [lucene]
dnhatn commented on code in PR #13208: URL: https://github.com/apache/lucene/pull/13208#discussion_r1537724985 ## lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java: ## @@ -31,40 +31,18 @@ import org.apache.lucene.index.MergeState; import org.apache.lucene.index.Terms; -/** - * Utility class to update the {@link MergeState} instance to be restricted to a set of fields. - * - * Warning: the input {@linkplain MergeState} instance will be updated when calling {@link - * #apply(Collection)}. - * - * It should be called within a {@code try {...} finally {...}} block to make sure that the - * mergeState instance is restored to its original state: - * - * - * PerFieldMergeState pfMergeState = new PerFieldMergeState(mergeState); - * try { - * doSomething(pfMergeState.apply(fields)); - * ... - * } finally { - * pfMergeState.reset(); - * } - * - */ +/** Utility class to update the {@link MergeState} instance to be restricted to a set of fields. */ final class PerFieldMergeState { private final MergeState in; - private final FieldInfos orgMergeFieldInfos; - private final FieldInfos[] orgFieldInfos; - private final FieldsProducer[] orgFieldsProducers; + private final MergeState copy; Review Comment: ++ I pushed https://github.com/apache/lucene/pull/13208/commits/ee3b0357e07e07cca76662df658d101f927254e4 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Avoid modify merge state in per field mergers [lucene]
dnhatn commented on code in PR #13208: URL: https://github.com/apache/lucene/pull/13208#discussion_r1537724628 ## lucene/core/src/java/org/apache/lucene/index/MergeState.java: ## @@ -23,13 +23,7 @@ import java.util.Locale; import java.util.concurrent.Executor; import java.util.concurrent.TimeUnit; -import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.NormsProducer; -import org.apache.lucene.codecs.PointsReader; -import org.apache.lucene.codecs.StoredFieldsReader; -import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.codecs.*; Review Comment: @jpountz Sorry, I haven't changed the imports setting for my new Intellij setup. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Avoid modify merge state in per field mergers [lucene]
jpountz commented on code in PR #13208: URL: https://github.com/apache/lucene/pull/13208#discussion_r1537732974 ## lucene/core/src/java/org/apache/lucene/index/MergeState.java: ## @@ -266,4 +266,35 @@ static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) { } return docMapBuilder.build(); } + + /** + * Creates a new state from an existing state with new fieldInfos and FieldsProducer. + * + * @param state the state to be copied + * @param mergeFieldInfos the new field infos of the merged segment + * @param fieldInfos the new field infos + * @param fieldsProducers the new field producers + */ + public MergeState( Review Comment: I'm not too happy with this constructor being public since it's really an implementation detail of `PerFieldMergeState `. At the same time, I like having the per-field-specific logic in the `.perfield` package rather than here. I'm not sure how to fix this cleanly... my best idea at this point would be to store all parameters of the main `MergeState` constructor on the `MergeState` object, so that `PerFieldMergeState` can then call this constructor again, wrapping codec readers with a `FilterCodecReader` that only exposes a subset of fields? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Speed up writeGroupVInts [lucene]
easyice commented on code in PR #13203: URL: https://github.com/apache/lucene/pull/13203#discussion_r1537749847 ## lucene/core/src/java/org/apache/lucene/util/GroupVIntUtil.java: ## @@ -111,4 +112,40 @@ public static int readGroupVInt( pos += 1 + n4Minus1; return (int) (pos - posStart); } + + private static int numBytes(int v) { +// | 1 to return 1 when v = 0 +return Integer.BYTES - (Integer.numberOfLeadingZeros(v | 1) >> 3); + } + + public static void writeGroupVInts(DataOutput out, byte[] scratch, long[] values, int limit) Review Comment: done! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Speed up writeGroupVInts [lucene]
easyice commented on PR #13203: URL: https://github.com/apache/lucene/pull/13203#issuecomment-2018225641 Thanks for reviewing! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Avoid modify merge state in per field mergers [lucene]
dnhatn commented on code in PR #13208: URL: https://github.com/apache/lucene/pull/13208#discussion_r1537756776 ## lucene/core/src/java/org/apache/lucene/index/MergeState.java: ## @@ -266,4 +266,35 @@ static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) { } return docMapBuilder.build(); } + + /** + * Creates a new state from an existing state with new fieldInfos and FieldsProducer. + * + * @param state the state to be copied + * @param mergeFieldInfos the new field infos of the merged segment + * @param fieldInfos the new field infos + * @param fieldsProducers the new field producers + */ + public MergeState( Review Comment: That also works for me. I pushed https://github.com/apache/lucene/pull/13208/commits/0f6a566f18521b5810c91442b24a5be7874184bc. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Avoid modify merge state in per field mergers [lucene]
jpountz commented on code in PR #13208: URL: https://github.com/apache/lucene/pull/13208#discussion_r1537788640 ## lucene/core/src/java/org/apache/lucene/index/MergeState.java: ## @@ -266,4 +266,35 @@ static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) { } return docMapBuilder.build(); } + + /** + * Creates a new state from an existing state with new fieldInfos and FieldsProducer. + * + * @param state the state to be copied + * @param mergeFieldInfos the new field infos of the merged segment + * @param fieldInfos the new field infos + * @param fieldsProducers the new field producers + */ + public MergeState( Review Comment: I had something different in mind but I realize it wouldn't work without re-computing merge instances over and over again, which we probably don't want. Your approach looks better. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Reduce some unnecessary ArrayUtil#grow calls [lucene]
easyice commented on PR #13171: URL: https://github.com/apache/lucene/pull/13171#issuecomment-2018304676 @stefanvodita Thank you for looking into this! when an array does not need to grow, The `ArrayUtil.grow` will return the input `array`, then it will call an extra assignment like `ref.bytes = ref.bytes` in `BytesRefBuilder#grow`. I observed there was some performance impact, which was noticeable in the `append` method. Maybe i'm being overly picky, Overall, it doesn't have a significant impact on performance. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
uschindler commented on code in PR #13205: URL: https://github.com/apache/lucene/pull/13205#discussion_r1537821986 ## lucene/core/src/java/org/apache/lucene/store/IOContext.java: ## @@ -62,91 +55,36 @@ public enum Context { public static final IOContext LOAD = new IOContext(false, true); - public IOContext() { -this(false, false); - } - - public IOContext(FlushInfo flushInfo) { -assert flushInfo != null; -this.context = Context.FLUSH; -this.mergeInfo = null; -this.readOnce = false; -this.load = false; -this.flushInfo = flushInfo; - } - - public IOContext(Context context) { -this(context, null); + @SuppressWarnings("incomplete-switch") + public IOContext { +switch (context) { + case MERGE -> Objects.requireNonNull( + mergeInfo, "mergeInfo must not be null if context is MERGE"); + case FLUSH -> Objects.requireNonNull( + flushInfo, "flushInfo must not be null if context is FLUSH"); +} } private IOContext(boolean readOnce, boolean load) { -this.context = Context.READ; -this.mergeInfo = null; -this.readOnce = readOnce; -this.load = load; -this.flushInfo = null; - } - - public IOContext(MergeInfo mergeInfo) { -this(Context.MERGE, mergeInfo); +this(Context.READ, null, null, readOnce, load); } - private IOContext(Context context, MergeInfo mergeInfo) { -assert context != Context.MERGE || mergeInfo != null -: "MergeInfo must not be null if context is MERGE"; -assert context != Context.FLUSH : "Use IOContext(FlushInfo) to create a FLUSH IOContext"; -this.context = context; -this.readOnce = false; -this.load = false; -this.mergeInfo = mergeInfo; -this.flushInfo = null; + private IOContext(Context context) { +this(context, null, null, false, false); } - /** - * This constructor is used to initialize a {@link IOContext} instance with a new value for the - * readOnce variable. - * - * @param ctxt {@link IOContext} object whose information is used to create the new instance - * except the readOnce variable. - * @param readOnce The new {@link IOContext} object will use this value for readOnce. - */ - public IOContext(IOContext ctxt, boolean readOnce) { -this.context = ctxt.context; -this.mergeInfo = ctxt.mergeInfo; -this.flushInfo = ctxt.flushInfo; -this.readOnce = readOnce; -this.load = false; - } - - @Override - public int hashCode() { -return Objects.hash(context, flushInfo, mergeInfo, readOnce, load); + /** Creates an IOContext for flushing. */ + public IOContext(FlushInfo flushInfo) { +this(Context.FLUSH, null, flushInfo, false, false); } - @Override - public boolean equals(Object obj) { -if (this == obj) return true; -if (obj == null) return false; -if (getClass() != obj.getClass()) return false; -IOContext other = (IOContext) obj; -if (context != other.context) return false; -if (!Objects.equals(flushInfo, other.flushInfo)) return false; -if (!Objects.equals(mergeInfo, other.mergeInfo)) return false; -if (readOnce != other.readOnce) return false; -if (load != other.load) return false; -return true; + /** Creates an IOContext for merging. */ + public IOContext(MergeInfo mergeInfo) { +this(Context.MERGE, mergeInfo, null, false, false); } - @Override - public String toString() { -return "IOContext [context=" -+ context -+ ", mergeInfo=" -+ mergeInfo -+ ", flushInfo=" -+ flushInfo -+ ", readOnce=" -+ readOnce -+ "]"; + /** Return a copy of this IOContext with {@link #readOnce} set to {@code true}. */ + public IOContext toReadOnce() { +return new IOContext(context, mergeInfo, flushInfo, true, load); Review Comment: Thanks. I fixed the title of the PR, but looks good. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add support for posix_madvise to Java 21 MMapDirectory [lucene]
uschindler commented on PR #13196: URL: https://github.com/apache/lucene/pull/13196#issuecomment-2018328574 @jpountz Are you fine with merging? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Avoid modify merge state in per field mergers [lucene]
dnhatn merged PR #13208: URL: https://github.com/apache/lucene/pull/13208 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Avoid modify merge state in per field mergers (#13208) [lucene]
dnhatn merged PR #13212: URL: https://github.com/apache/lucene/pull/13212 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [I] Regarding the frequency used for scoring sloppy phrase queries. [lucene]
odelmarcelle commented on issue #13152: URL: https://github.com/apache/lucene/issues/13152#issuecomment-2018475392 That's my personal opinion, but given that it's up to the user to allow sloppy matches, I question why the score penalty even exists. If a user wants to rank higher exact matches, it's easy to do so by boosting a more restrictive query. Now, if the penalty is there to stay, I would argue that an exponential decay function would be more suitable for the penalty weight. The penalty would decrease with a fixed factor, unlike the current implementation which is penalize very harshly small slops but not so much bigger ones. This could be implemented, for example, having this kind of weight: `Math.pow(k, matchLength)` with `k` fixed to, say, `0.95` for a parameter-free implementation. See a comparison of penalty weights for the current implementation and the exponential decay: `1f / (1f + matchLength)` `1.0, 0.5, 0.333, 0.25, 0.2, 0.167, 0.143, 0.125, 0.111, 0.1` `Math.pow(0.95, matchLength)` `1.0, 0.95, 0.902, 0.857, 0.815, 0.774, 0.735, 0.698, 0.663, 0.63` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Avoid modify merge state in per field mergers [lucene]
benwtrent commented on PR #13208: URL: https://github.com/apache/lucene/pull/13208#issuecomment-2018483694 Huzzah! Thanks @dnhatn @jpountz !!! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add support for posix_madvise to Java 21 MMapDirectory [lucene]
uschindler merged PR #13196: URL: https://github.com/apache/lucene/pull/13196 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
[PR] Add support for posix_madvise to Java 21 MMapDirectory (backport) [lucene]
uschindler opened a new pull request, #13213: URL: https://github.com/apache/lucene/pull/13213 backport of #13196 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Avoid modify merge state in per field mergers [lucene]
mikemccand commented on PR #13208: URL: https://github.com/apache/lucene/pull/13208#issuecomment-2018667449 Thank you @dnhatn! I've kicked off a one-off nightly benchy run ... -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add support for posix_madvise to Java 21 MMapDirectory [lucene]
uschindler commented on PR #13196: URL: https://github.com/apache/lucene/pull/13196#issuecomment-2018749550 The test added by @ChrisHegarty sometimes fails on windows: It does not close the file it opened for random access testing, so the directory can't be deleted. Will fix this in a separate commit. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add support for posix_madvise to Java 21 MMapDirectory (backport) [lucene]
uschindler merged PR #13213: URL: https://github.com/apache/lucene/pull/13213 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Add support for posix_madvise to Java 21 MMapDirectory [lucene]
uschindler commented on PR #13196: URL: https://github.com/apache/lucene/pull/13196#issuecomment-2018772367 I fixed the test in https://github.com/apache/lucene/commit/ae5d3534e3ef44ff3336dea0308d8a82f0672ff2 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Made the UnifiedHighlighter's hasUnrecognizedQuery function processes FunctionQuery the same way as MatchAllDocsQuery and MatchNoDocsQuery queries for performance reasons. [lucene]
vletard commented on code in PR #13165: URL: https://github.com/apache/lucene/pull/13165#discussion_r1538165525 ## lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java: ## @@ -1130,7 +1134,16 @@ public boolean acceptField(String field) { @Override public void visitLeaf(Query query) { if (MultiTermHighlighting.canExtractAutomataFromLeafQuery(query) == false) { - if (!(query instanceof MatchAllDocsQuery || query instanceof MatchNoDocsQuery)) { + boolean no_effect_query = false; Review Comment: I just edited my branch with your suggestion. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Convert IOContext, MergeInfo, and FlushInfo to record classes [lucene]
uschindler commented on PR #13205: URL: https://github.com/apache/lucene/pull/13205#issuecomment-2018817418 Hi @ChrisHegarty, can you have another look on the crazy randomAccess flag afetr I merged main into this. Especially the checks in the record's constructor should be checked again. I have no idea what flags are mutually exclusive. Maybe there are more combinations missing. Lucily @jpountz opened #13211, can't wait to see this be resolved. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Reduce some unnecessary ArrayUtil#grow calls [lucene]
stefanvodita commented on PR #13171: URL: https://github.com/apache/lucene/pull/13171#issuecomment-2018978359 Thanks for the explanation! The way I think about it is whether it would be ok to make this check every time we call `grow`. Probably not, it's nice that `grow` does the check for you. Maybe simplicity is worth more than the performance gain here? If you think the performance win is compelling, I won't argue against it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [PR] Made DocIdsWriter use DISI when reading documents with an IntersectVisitor [lucene]
jpountz commented on code in PR #13149: URL: https://github.com/apache/lucene/pull/13149#discussion_r1538276505 ## lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java: ## @@ -222,6 +230,14 @@ public void visit(DocIdSetIterator iterator) throws IOException { cost[0] = Math.max(0, cost[0] - iterator.cost()); } + @Override + public void visit(IntsRef ref) throws IOException { +for (int i = ref.offset; i < ref.offset + ref.length; i++) { + result.clear(ref.ints[i]); + cost[0]--; Review Comment: you could move this outside of the loop and decrement by `ref.length` in one go? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org
Re: [I] Remove IndexSearcher#search(Query,Collector) in favor of IndexSearcher#search(Query,CollectorManager) [LUCENE-10002] [lucene]
wardle commented on issue #11041: URL: https://github.com/apache/lucene/issues/11041#issuecomment-2018982902 Hi all. There is a small overhead with using CollectorManager over Collector. In my own usage, I have a read-only index which uses only a single slice, and I've chosen not to provide an executor service because my usage uses lots of cores for concurrent requests. Consequently, it seems to be that this is more of an issue of documentation, and best practices. Additional usage information in IndexSearcher#search to point users to use a CollectorManager might well suffice, and prevent what is presumably a concern about incorrect usage, while permitting users who know the nature of their index, and their usage patterns, to leverage the APIs as they need. Clearly, it is trivial to use CollectorManager and wrap my existing (custom) Collector, but it is an unnecessary overhead, albeit a very small one, but at the cost of losing backwards compatibility and with little gain as fundamentally both could co-exist without complication, but documentation steering most users to prefer CollectorManager. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org