mikemccand commented on code in PR #13054: URL: https://github.com/apache/lucene/pull/13054#discussion_r1946542491
########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java: ########## @@ -218,12 +231,26 @@ public void add(CharsRef input, CharsRef output, boolean includeOrig) { add(input, countWords(input), output, countWords(output), includeOrig); } - /** Builds an {@link SynonymMap} and returns it. */ + /** Buils a {@link SynonymMap} and returns it. */ Review Comment: Hmm, waaaaay up above, the javadoc for `Builder`, it mentions `FSTSynonymMap` twice -- can you fix those to `SynonymMap` instead? That must be holdover from ancient naming... Also, it's a bit annoying that GH does not allow me to put comments on parts of the code you did not change :) I guess this is GH's attempt to keep me in "eyes on the prize" mode ... so I only comment on stuff changed in the PR. ########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the + * FST and output words are kept off-heap. + * + * @lucene.experimental + */ +public class SynonymMapDirectory implements Closeable { + private final SynonymMapFormat synonymMapFormat = + new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less? + private final Directory directory; + private final List<Closeable> resources = new ArrayList<>(); + + public SynonymMapDirectory(Path path) throws IOException { + directory = FSDirectory.open(path); + } + + IndexOutput fstOutput() throws IOException { + return synonymMapFormat.getFSTOutput(directory); + } + + WordsOutput wordsOutput() throws IOException { + return synonymMapFormat.getWordsOutput(directory); + } + + void writeMetadata(int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) + throws IOException { + synonymMapFormat.writeMetadata( + directory, new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata)); + } + + SynonymMap readMap() throws IOException { + CloseableSynonymMap closeableSynonymMap = synonymMapFormat.readSynonymMap(directory); + resources.add(closeableSynonymMap); + return closeableSynonymMap.map; + } + + boolean hasSynonyms() throws IOException { + // TODO should take the path to the synonyms file to compare file hash against file used to + // 
build the directory + return directory.listAll().length > 0; + } + + @Override + public void close() throws IOException { + for (Closeable c : resources) { + c.close(); + } + directory.close(); + } + + /** + * Abstraction to support writing individual output words to the directory. Should be closed after + * the last word is written. + */ + abstract static class WordsOutput implements Closeable { + public abstract void addWord(BytesRef word) throws IOException; + } + + private record CloseableSynonymMap(SynonymMap map, IndexInput indexInput) implements Closeable { + @Override + public void close() throws IOException { + indexInput.close(); + } + } + + private record SynonymMetadata( + int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) {} + + private static class SynonymMapFormat { + private static final String FST_FILE = "synonyms.fst"; + private static final String WORDS_FILE = "synonyms.wrd"; + private static final String METADATA_FILE = "synonyms.mdt"; + + private IndexOutput getFSTOutput(Directory directory) throws IOException { + return directory.createOutput(FST_FILE, IOContext.DEFAULT); + } + + private WordsOutput getWordsOutput(Directory directory) throws IOException { + IndexOutput wordsOutput = directory.createOutput(WORDS_FILE, IOContext.DEFAULT); + return new WordsOutput() { + @Override + public void close() throws IOException { + wordsOutput.close(); + } + + @Override + public void addWord(BytesRef word) throws IOException { + wordsOutput.writeVInt(word.length); + wordsOutput.writeBytes(word.bytes, word.offset, word.length); + } + }; + } + ; + + private void writeMetadata(Directory directory, SynonymMetadata synonymMetadata) + throws IOException { + try (IndexOutput metadataOutput = directory.createOutput(METADATA_FILE, IOContext.DEFAULT)) { + metadataOutput.writeVInt(synonymMetadata.wordCount); + metadataOutput.writeVInt(synonymMetadata.maxHorizontalContext); + synonymMetadata.fstMetadata.save(metadataOutput); + } + 
directory.sync(List.of(FST_FILE, WORDS_FILE, METADATA_FILE)); + } + + private SynonymMetadata readMetadata(Directory directory) throws IOException { + try (IndexInput metadataInput = directory.openInput(METADATA_FILE, IOContext.READONCE)) { + int wordCount = metadataInput.readVInt(); + int maxHorizontalContext = metadataInput.readVInt(); + FST.FSTMetadata<BytesRef> fstMetadata = + FST.readMetadata(metadataInput, ByteSequenceOutputs.getSingleton()); + return new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata); + } + } + + private CloseableSynonymMap readSynonymMap(Directory directory) throws IOException { + SynonymMetadata synonymMetadata = readMetadata(directory); + IndexInput in = directory.openInput(FST_FILE, IOContext.DEFAULT); + FST<BytesRef> fst = + FST.fromFSTReader( + synonymMetadata.fstMetadata, new OffHeapFSTStore(in, 0, synonymMetadata.fstMetadata)); + OnHeapSynonymDictionary words; + try (IndexInput wordsInput = directory.openInput(WORDS_FILE, IOContext.DEFAULT)) { + words = new OnHeapSynonymDictionary(synonymMetadata.wordCount, wordsInput); + } + SynonymMap map = new SynonymMap(fst, words, synonymMetadata.maxHorizontalContext); + return new CloseableSynonymMap(map, in); + } + + private static class OnHeapSynonymDictionary extends SynonymMap.SynonymDictionary { + private final int[] bytesStartArray; + private final byte[] wordBytes; + + private OnHeapSynonymDictionary(int wordCount, IndexInput wordsFile) throws IOException { + bytesStartArray = new int[wordCount + 1]; + int pos = 0; + for (int i = 0; i < wordCount; i++) { + bytesStartArray[i] = pos; + int size = wordsFile.readVInt(); + pos += size; + wordsFile.seek(wordsFile.getFilePointer() + size); Review Comment: Can you use `.skipBytes` here instead? 
########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the + * FST and output words are kept off-heap. + * + * @lucene.experimental + */ +public class SynonymMapDirectory implements Closeable { + private final SynonymMapFormat synonymMapFormat = + new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less? 
+ private final Directory directory; + private final List<Closeable> resources = new ArrayList<>(); + + public SynonymMapDirectory(Path path) throws IOException { + directory = FSDirectory.open(path); + } + + IndexOutput fstOutput() throws IOException { + return synonymMapFormat.getFSTOutput(directory); + } + + WordsOutput wordsOutput() throws IOException { + return synonymMapFormat.getWordsOutput(directory); + } + + void writeMetadata(int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) + throws IOException { + synonymMapFormat.writeMetadata( + directory, new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata)); + } + + SynonymMap readMap() throws IOException { + CloseableSynonymMap closeableSynonymMap = synonymMapFormat.readSynonymMap(directory); + resources.add(closeableSynonymMap); + return closeableSynonymMap.map; + } + + boolean hasSynonyms() throws IOException { + // TODO should take the path to the synonyms file to compare file hash against file used to + // build the directory + return directory.listAll().length > 0; + } + + @Override + public void close() throws IOException { + for (Closeable c : resources) { + c.close(); + } + directory.close(); + } + + /** + * Abstraction to support writing individual output words to the directory. Should be closed after + * the last word is written. 
+ */ + abstract static class WordsOutput implements Closeable { + public abstract void addWord(BytesRef word) throws IOException; + } + + private record CloseableSynonymMap(SynonymMap map, IndexInput indexInput) implements Closeable { + @Override + public void close() throws IOException { + indexInput.close(); + } + } + + private record SynonymMetadata( + int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) {} + + private static class SynonymMapFormat { + private static final String FST_FILE = "synonyms.fst"; + private static final String WORDS_FILE = "synonyms.wrd"; + private static final String METADATA_FILE = "synonyms.mdt"; Review Comment: Ahh yes up to three files, so I think it makes sense for user to pass `Directory` (not `Path`). ########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java: ########## @@ -218,12 +231,26 @@ public void add(CharsRef input, CharsRef output, boolean includeOrig) { add(input, countWords(input), output, countWords(output), includeOrig); } - /** Builds an {@link SynonymMap} and returns it. */ + /** Buils a {@link SynonymMap} and returns it. */ public SynonymMap build() throws IOException { + return build(null); + } + + /** + * Builds a {@link SynonymMap} and returns it. If directory is non-null, it will write the + * compiled SynonymMap to disk and return an off-heap version. + */ + public SynonymMap build(SynonymMapDirectory directory) throws IOException { Review Comment: This ability to save a `SynonymMap` is new to your PR, right? We cannot save/load them today? So this is a nice new additional feature (in addition to off-heap option, and a nice side effect of it) in your PR? ########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the + * FST and output words are kept off-heap. Review Comment: > I wonder in practice what the "typical" size of FST vs words is? Like does the FST dominate the storage? Aha, you answered this in an earlier comment: > The on-heap FST seems to occupy about 36MB of heap. The off-heap FST with on-heap words occupies about 560kB. The off-heap FST with off-heap words occupies about 150kB. It's wild that the FST is so much larger than the words... I'm not yet understanding why. 
########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the + * FST and output words are kept off-heap. + */ +public class SynonymMapDirectory implements Closeable { + private final SynonymMapFormat synonymMapFormat = + new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less? 
+ private final Directory directory; + + public SynonymMapDirectory(Path path) throws IOException { Review Comment: Let's leave it as is for now (three separate files)? But let's mark things `@lucene.experimental` to reserve the right to change APIs. Hmm, also: will these synonym files be backwards compatible across releases? Across major releases? I would say we should not promise across major releases? Furthermore, we should enforce that not-promise, by writing the major release into the metadata somewhere and checking if that changed between writing and reading and throw a clear exception if so? Within minor releases maybe we allow backcompat? If so, we need to add some testing to confirm syns written in 10.x are still readable/usable in 10.y? ########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the + * FST and output words are kept off-heap. + * + * @lucene.experimental + */ +public class SynonymMapDirectory implements Closeable { + private final SynonymMapFormat synonymMapFormat = + new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less? + private final Directory directory; + private final List<Closeable> resources = new ArrayList<>(); + + public SynonymMapDirectory(Path path) throws IOException { + directory = FSDirectory.open(path); + } + + IndexOutput fstOutput() throws IOException { + return synonymMapFormat.getFSTOutput(directory); + } + + WordsOutput wordsOutput() throws IOException { + return synonymMapFormat.getWordsOutput(directory); + } + + void writeMetadata(int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) + throws IOException { + synonymMapFormat.writeMetadata( + directory, new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata)); + } + + SynonymMap readMap() throws IOException { + CloseableSynonymMap closeableSynonymMap = synonymMapFormat.readSynonymMap(directory); + resources.add(closeableSynonymMap); + return closeableSynonymMap.map; + } + + boolean hasSynonyms() throws IOException { + // TODO should take the path to the synonyms file to compare file hash against file used to + // 
build the directory + return directory.listAll().length > 0; + } + + @Override + public void close() throws IOException { + for (Closeable c : resources) { + c.close(); + } + directory.close(); + } + + /** + * Abstraction to support writing individual output words to the directory. Should be closed after + * the last word is written. + */ + abstract static class WordsOutput implements Closeable { + public abstract void addWord(BytesRef word) throws IOException; + } + + private record CloseableSynonymMap(SynonymMap map, IndexInput indexInput) implements Closeable { + @Override + public void close() throws IOException { + indexInput.close(); + } + } + + private record SynonymMetadata( + int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) {} + + private static class SynonymMapFormat { + private static final String FST_FILE = "synonyms.fst"; + private static final String WORDS_FILE = "synonyms.wrd"; + private static final String METADATA_FILE = "synonyms.mdt"; + + private IndexOutput getFSTOutput(Directory directory) throws IOException { + return directory.createOutput(FST_FILE, IOContext.DEFAULT); + } + + private WordsOutput getWordsOutput(Directory directory) throws IOException { Review Comment: `createWordsOutput`? ########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the + * FST and output words are kept off-heap. + * + * @lucene.experimental + */ +public class SynonymMapDirectory implements Closeable { + private final SynonymMapFormat synonymMapFormat = + new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less? 
+ private final Directory directory; + private final List<Closeable> resources = new ArrayList<>(); + + public SynonymMapDirectory(Path path) throws IOException { + directory = FSDirectory.open(path); + } + + IndexOutput fstOutput() throws IOException { + return synonymMapFormat.getFSTOutput(directory); + } + + WordsOutput wordsOutput() throws IOException { + return synonymMapFormat.getWordsOutput(directory); + } + + void writeMetadata(int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) + throws IOException { + synonymMapFormat.writeMetadata( + directory, new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata)); + } + + SynonymMap readMap() throws IOException { + CloseableSynonymMap closeableSynonymMap = synonymMapFormat.readSynonymMap(directory); + resources.add(closeableSynonymMap); + return closeableSynonymMap.map; + } + + boolean hasSynonyms() throws IOException { + // TODO should take the path to the synonyms file to compare file hash against file used to + // build the directory + return directory.listAll().length > 0; + } + + @Override + public void close() throws IOException { + for (Closeable c : resources) { + c.close(); + } + directory.close(); + } + + /** + * Abstraction to support writing individual output words to the directory. Should be closed after + * the last word is written. + */ + abstract static class WordsOutput implements Closeable { + public abstract void addWord(BytesRef word) throws IOException; + } + + private record CloseableSynonymMap(SynonymMap map, IndexInput indexInput) implements Closeable { + @Override + public void close() throws IOException { + indexInput.close(); + } + } + + private record SynonymMetadata( + int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) {} + + private static class SynonymMapFormat { Review Comment: +1 to keep this format private for now. 
########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the + * FST and output words are kept off-heap. Review Comment: Hmm can you fix the javadoc above to explain that words are on-heap and FST is off-heap? +1 to default to that sweet spot. I wonder in practice what the "typical" size of FST vs words is? Like does the FST dominate the storage? 
########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the + * FST and output words are kept off-heap. + * + * @lucene.experimental + */ +public class SynonymMapDirectory implements Closeable { + private final SynonymMapFormat synonymMapFormat = + new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less? 
+ private final Directory directory; + private final List<Closeable> resources = new ArrayList<>(); + + public SynonymMapDirectory(Path path) throws IOException { + directory = FSDirectory.open(path); + } + + IndexOutput fstOutput() throws IOException { + return synonymMapFormat.getFSTOutput(directory); + } + + WordsOutput wordsOutput() throws IOException { + return synonymMapFormat.getWordsOutput(directory); + } + + void writeMetadata(int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) + throws IOException { + synonymMapFormat.writeMetadata( + directory, new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata)); + } + + SynonymMap readMap() throws IOException { + CloseableSynonymMap closeableSynonymMap = synonymMapFormat.readSynonymMap(directory); + resources.add(closeableSynonymMap); + return closeableSynonymMap.map; + } + + boolean hasSynonyms() throws IOException { + // TODO should take the path to the synonyms file to compare file hash against file used to + // build the directory + return directory.listAll().length > 0; + } + + @Override + public void close() throws IOException { + for (Closeable c : resources) { + c.close(); + } + directory.close(); + } + + /** + * Abstraction to support writing individual output words to the directory. Should be closed after + * the last word is written. 
+ */ + abstract static class WordsOutput implements Closeable { + public abstract void addWord(BytesRef word) throws IOException; + } + + private record CloseableSynonymMap(SynonymMap map, IndexInput indexInput) implements Closeable { + @Override + public void close() throws IOException { + indexInput.close(); + } + } + + private record SynonymMetadata( + int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) {} + + private static class SynonymMapFormat { + private static final String FST_FILE = "synonyms.fst"; + private static final String WORDS_FILE = "synonyms.wrd"; + private static final String METADATA_FILE = "synonyms.mdt"; + + private IndexOutput getFSTOutput(Directory directory) throws IOException { Review Comment: Rename to `createFSTOutput`? Make it clear we are always creating a new one, not getting an existing one? ########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java: ########## @@ -290,11 +317,28 @@ public SynonymMap build() throws IOException { fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef()); } - FST<BytesRef> fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()); + FST.FSTMetadata<BytesRef> fstMetaData = fstCompiler.compile(); + if (directory != null) { + fstOutput.close(); // TODO -- Should fstCompiler.compile take care of this? Review Comment: I think the idea is a caller could in theory write multiple FSTs into a single `IndexOutput` (remove the `TODO`?)? 
########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java: ########## @@ -290,11 +317,28 @@ public SynonymMap build() throws IOException { fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef()); } - FST<BytesRef> fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()); + FST.FSTMetadata<BytesRef> fstMetaData = fstCompiler.compile(); + if (directory != null) { + fstOutput.close(); // TODO -- Should fstCompiler.compile take care of this? + try (SynonymMapDirectory.WordsOutput wordsOutput = directory.wordsOutput()) { Review Comment: A better on-disk layout might be to write a single big `byte[]` blob for all words, and then store the per-word offsets using something like the cool "linear fit" encoding that `MonotonicLongValues` uses on disk. This would be more compact, faster to load, and might give us more options for what is kept on/off heap, etc. But save all that for later! vInt prefix length encoding is fine for starters! Progress not perfection! ########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the + * FST and output words are kept off-heap. + * + * @lucene.experimental + */ +public class SynonymMapDirectory implements Closeable { + private final SynonymMapFormat synonymMapFormat = + new SynonymMapFormat(); // TODO -- Should this be more flexible/codec-like? Less? + private final Directory directory; + private final List<Closeable> resources = new ArrayList<>(); + + public SynonymMapDirectory(Path path) throws IOException { + directory = FSDirectory.open(path); + } + + IndexOutput fstOutput() throws IOException { + return synonymMapFormat.getFSTOutput(directory); + } + + WordsOutput wordsOutput() throws IOException { + return synonymMapFormat.getWordsOutput(directory); + } + + void writeMetadata(int wordCount, int maxHorizontalContext, FST.FSTMetadata<BytesRef> fstMetadata) + throws IOException { + synonymMapFormat.writeMetadata( + directory, new SynonymMetadata(wordCount, maxHorizontalContext, fstMetadata)); + } + + SynonymMap readMap() throws IOException { + CloseableSynonymMap closeableSynonymMap = synonymMapFormat.readSynonymMap(directory); + resources.add(closeableSynonymMap); + return closeableSynonymMap.map; + } + + boolean hasSynonyms() throws IOException { + // TODO should take the path to the synonyms file to compare file hash against file used to Review 
Comment: Whoa, what would this `TODO` achieve? Is it somehow trying to check if the compiled synonyms have become stale relative to the original source synonyms (a "make" like capability)? We don't know down here whether the original source synonyms are backed by a file... ########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the Review Comment: Hmm any reason why it must be an `FSDirectory`? 
Can it just be any `Directory`? Do we really rely on filesystem backing somehow? It looks like we are just using Lucene's standard `IndexInput/Output`... ########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMapDirectory.java: ########## @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.OffHeapFSTStore; + +/** + * Wraps an {@link FSDirectory} to read and write a compiled {@link SynonymMap}. When reading, the + * FST and output words are kept off-heap. 
+ */ +public class SynonymMapDirectory implements Closeable { Review Comment: If we ever want to bring back the off-heap words, it seems like `SynonymMapDirectory` is the way to go, because we need to store two "things" in this directory. Or were we stuffing both the FST and the words into a single file when you had words off-heap too? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org