s1monw commented on code in PR #12685: URL: https://github.com/apache/lucene/pull/12685#discussion_r1360627901
########## lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99SegmentInfoFormat.java: ########## @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene99; + +import java.io.IOException; +import java.util.Map; +import java.util.Set; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.index.*; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.store.*; +import org.apache.lucene.util.Version; + +/** + * Lucene 9.9 Segment info format. 
+ * + * <p>Files: + * + * <ul> + * <li><code>.si</code>: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, + * Attributes, IndexSort, Footer + * </ul> + * + * Data types: + * + * <ul> + * <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + * <li>SegSize --> {@link DataOutput#writeInt Int32} + * <li>SegVersion --> {@link DataOutput#writeString String} + * <li>SegMinVersion --> {@link DataOutput#writeString String} + * <li>Files --> {@link DataOutput#writeSetOfStrings Set<String>} + * <li>Diagnostics,Attributes --> {@link DataOutput#writeMapOfStrings Map<String,String>} + * <li>IsCompoundFile --> {@link DataOutput#writeByte Int8} + * <li>HasBlocks --> {@link DataOutput#writeByte Int8} + * <li>IndexSort --> {@link DataOutput#writeVInt Int32} count, followed by {@code count} + * SortField + * <li>SortField --> {@link DataOutput#writeString String} sort class, followed by a per-sort + * bytestream (see {@link SortFieldProvider#readSortField(DataInput)}) + * <li>Footer --> {@link CodecUtil#writeFooter CodecFooter} + * </ul> + * + * Field Descriptions: + * + * <ul> + * <li>SegVersion is the code version that created the segment. + * <li>SegMinVersion is the minimum code version that contributed documents to the segment. + * <li>SegSize is the number of documents contained in the segment index. + * <li>IsCompoundFile records whether the segment is written as a compound file or not. If this is + * -1, the segment is not a compound file. If it is 1, the segment is a compound file. + * <li>HasBlocks records whether the segment contains documents written as a block and guarantees + * consecutive document ids for all documents in the block + * <li>The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid, for + * each segment it creates. It includes metadata like the current Lucene version, OS, Java + * version, why the segment was created (merge, flush, addIndexes), etc. 
+ * <li>Files is a list of files referred to by this segment. + * </ul> + * + * @see SegmentInfos + * @lucene.experimental + */ +public class Lucene99SegmentInfoFormat extends SegmentInfoFormat { + + /** File extension used to store {@link SegmentInfo}. */ + public static final String SI_EXTENSION = "si"; + + static final String CODEC_NAME = "Lucene90SegmentInfo"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + /** Sole constructor. */ + public Lucene99SegmentInfoFormat() {} + + @Override + public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) + throws IOException { + final String fileName = IndexFileNames.segmentFileName(segment, "", SI_EXTENSION); + try (ChecksumIndexInput input = dir.openChecksumInput(fileName)) { + Throwable priorE = null; + SegmentInfo si = null; + try { + CodecUtil.checkIndexHeader( + input, CODEC_NAME, VERSION_START, VERSION_CURRENT, segmentID, ""); + si = parseSegmentInfo(dir, input, segment, segmentID); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(input, priorE); + } + return si; + } + } + + private SegmentInfo parseSegmentInfo( + Directory dir, DataInput input, String segment, byte[] segmentID) throws IOException { + final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + byte hasMinVersion = input.readByte(); + final Version minVersion; + switch (hasMinVersion) { + case 0: + minVersion = null; + break; + case 1: + minVersion = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + break; + default: + throw new CorruptIndexException("Illegal boolean value " + hasMinVersion, input); + } + + final int docCount = input.readInt(); + if (docCount < 0) { + throw new CorruptIndexException("invalid docCount: " + docCount, input); + } + final boolean isCompoundFile = input.readByte() == SegmentInfo.YES; Review Comment: Instead of writing a byte each time here and 
changing the index format, we could also write a bitset to mark features like this. It might be easier down the road. Not sure how often it happens. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org