jpountz commented on code in PR #12436: URL: https://github.com/apache/lucene/pull/12436#discussion_r1275169909
########## lucene/core/src/java/org/apache/lucene/index/IndexingChain.java: ########## @@ -621,6 +621,12 @@ private void initializeFieldInfo(PerField pf) throws IOException { final Sort indexSort = indexWriterConfig.getIndexSort(); validateIndexSortDVType(indexSort, pf.fieldName, s.docValuesType); } + if (s.vectorDimension != 0) { + validateMaxVectorDimension( + pf.fieldName, + s.vectorDimension, + indexWriterConfig.getCodec().knnVectorsFormat().getMaxDimensions()); + } Review Comment: I worry that this adds a hashtable lookup on a hot code path. Maybe it's not that bad for vectors, which are slow to index anyway, but I'd rather avoid it. What about making the codec responsible for checking the limit? Something like below: ```patch diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsFormat.java index cb3e5ef8b10..6c365e53528 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsFormat.java @@ -108,6 +108,9 @@ public final class Lucene95HnswVectorsFormat extends KnnVectorsFormat { public static final int VERSION_START = 0; public static final int VERSION_CURRENT = VERSION_START; + /** A maximum number of vector dimensions supported by this codeс */ + public static final int MAX_DIMENSIONS = 1024; + /** * A maximum configurable maximum max conn. * @@ -177,7 +180,7 @@ public final class Lucene95HnswVectorsFormat extends KnnVectorsFormat { @Override public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { - return new Lucene95HnswVectorsWriter(state, maxConn, beamWidth); + return new Lucene95HnswVectorsWriter(state, maxConn, beamWidth, MAX_DIMENSIONS); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsWriter.java index 5358d66f16e..196f12a21ad 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsWriter.java @@ -60,13 +60,15 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter { private final IndexOutput meta, vectorData, vectorIndex; private final int M; private final int beamWidth; + private final int maxDimension; private final List<FieldWriter<?>> fields = new ArrayList<>(); private boolean finished; - Lucene95HnswVectorsWriter(SegmentWriteState state, int M, int beamWidth) throws IOException { + Lucene95HnswVectorsWriter(SegmentWriteState state, int M, int beamWidth, int maxDimension) throws IOException { this.M = M; this.beamWidth = beamWidth; + this.maxDimension = maxDimension; segmentWriteState = state; String metaFileName = IndexFileNames.segmentFileName( @@ -117,6 +119,9 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter { @Override public KnnFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws IOException { + if (fieldInfo.getVectorDimension() > maxDimension) { + throw new IllegalArgumentException("Number of dimensions " + fieldInfo.getVectorDimension() + " for field " + fieldInfo.name + " exceeds the limit of " + maxDimension); + } FieldWriter<?> newField = FieldWriter.create(fieldInfo, M, beamWidth, segmentWriteState.infoStream); fields.add(newField); ``` ########## lucene/core/src/java/org/apache/lucene/index/IndexingChain.java: ########## @@ -621,6 +621,12 @@ private void initializeFieldInfo(PerField pf) throws IOException { final Sort indexSort = indexWriterConfig.getIndexSort(); validateIndexSortDVType(indexSort, pf.fieldName, s.docValuesType); } + if (s.vectorDimension != 0) { + validateMaxVectorDimension( + pf.fieldName, + s.vectorDimension, + indexWriterConfig.getCodec().knnVectorsFormat().getMaxDimensions()); + } Review Comment: I worry that this adds a hashtable lookup on a hot code path. Maybe it's not that bad for vectors, which are slow to index anyway, but I'd rather avoid it. What about making the codec responsible for checking the limit? Something like below: ```patch diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsFormat.java index cb3e5ef8b10..6c365e53528 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsFormat.java @@ -108,6 +108,9 @@ public final class Lucene95HnswVectorsFormat extends KnnVectorsFormat { public static final int VERSION_START = 0; public static final int VERSION_CURRENT = VERSION_START; + /** A maximum number of vector dimensions supported by this codeс */ + public static final int MAX_DIMENSIONS = 1024; + /** * A maximum configurable maximum max conn. * @@ -177,7 +180,7 @@ public final class Lucene95HnswVectorsFormat extends KnnVectorsFormat { @Override public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { - return new Lucene95HnswVectorsWriter(state, maxConn, beamWidth); + return new Lucene95HnswVectorsWriter(state, maxConn, beamWidth, MAX_DIMENSIONS); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsWriter.java index 5358d66f16e..196f12a21ad 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsWriter.java @@ -60,13 +60,15 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter { private final IndexOutput meta, vectorData, vectorIndex; private final int M; private final int beamWidth; + private final int maxDimension; private final List<FieldWriter<?>> fields = new ArrayList<>(); private boolean finished; - Lucene95HnswVectorsWriter(SegmentWriteState state, int M, int beamWidth) throws IOException { + Lucene95HnswVectorsWriter(SegmentWriteState state, int M, int beamWidth, int maxDimension) throws IOException { this.M = M; this.beamWidth = beamWidth; + this.maxDimension = maxDimension; segmentWriteState = state; String metaFileName = IndexFileNames.segmentFileName( @@ -117,6 +119,9 @@ public final class Lucene95HnswVectorsWriter extends KnnVectorsWriter { @Override public KnnFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws IOException { + if (fieldInfo.getVectorDimension() > maxDimension) { + throw new IllegalArgumentException("Number of dimensions " + fieldInfo.getVectorDimension() + " for field " + fieldInfo.name + " exceeds the limit of " + maxDimension); + } FieldWriter<?> newField = FieldWriter.create(fieldInfo, M, beamWidth, segmentWriteState.infoStream); fields.add(newField); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org