eribeiro commented on a change in pull request #864: SOLR-13101 : Shared storage support in SolrCloud
URL: https://github.com/apache/lucene-solr/pull/864#discussion_r324482829
##########
File path: solr/core/src/java/org/apache/solr/store/blob/metadata/ServerSideMetadata.java
##########
@@ -0,0 +1,190 @@
+package org.apache.solr.store.blob.metadata;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
+import java.util.Objects;
+
+import org.apache.commons.codec.binary.Hex;
+import org.apache.lucene.index.IndexCommit;
+import org.apache.lucene.store.Directory;
+import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.DirectoryFactory;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.store.blob.client.BlobCoreMetadata;
+import org.apache.solr.store.blob.client.BlobException;
+
+import com.google.common.collect.ImmutableCollection;
+import com.google.common.collect.ImmutableSet;
+
+/**
+ * Object capturing the metadata of a shard index on a Solr node.
+ *
+ * This works in conjunction with {@link BlobCoreMetadata} to find the differences between
+ * local (Solr node) and remote (Blob store) commit point for a core.<p>
+ *
+ * This object is somewhere between {@link org.apache.lucene.index.IndexCommit} and {@link org.apache.lucene.index.SegmentInfos}
+ * and by implementing it separately we can add additional metadata to it as needed.
+ */
+public class ServerSideMetadata {
+
+  /**
+   * Files composing the core. They are referenced from the core's current commit point's segments_N file
+   * which is ALSO included in this collection.
+   */
+  private final ImmutableCollection<CoreFileData> files;
+
+  /**
+   * Hash of the directory content used to make sure the content doesn't change as we proceed to pull new files from Blob
+   * (if we need to pull new files from Blob)
+   */
+  private final String directoryHash;
+
+  private final SolrCore core;
+  private final String coreName;
+  private final CoreContainer container;
+
+  /**
+   * Given a core name, builds the local metadata
+
+
+   * @throws Exception if core corresponding to <code>coreName</code> can't be found.
+   */
+  public ServerSideMetadata(String coreName, CoreContainer container) throws Exception {
+    this.coreName = coreName;
+    this.container = container;
+    this.core = container.getCore(coreName);
+
+    if (core == null) {
+      throw new Exception("Can't find core " + coreName);
+    }
+
+    try {
+      IndexCommit commit = core.getDeletionPolicy().getLatestCommit();
+      if (commit == null) {
+        throw new BlobException("Core " + coreName + " has no available commit point");
+      }
+
+      // Work around possible bug returning same file multiple times by using a set here
+      // See org.apache.solr.handler.ReplicationHandler.getFileList()
+      ImmutableCollection.Builder<CoreFileData> builder = new ImmutableSet.Builder<>();
+
+      Directory coreDir = core.getDirectoryFactory().get(core.getIndexDir(), DirectoryFactory.DirContext.DEFAULT, core.getSolrConfig().indexConfig.lockType);
+      try {
+        // Capture now the hash and verify again if we need to pull content from the Blob store into this directory,
+        // to make sure there are no local changes at the same time that might lead to a corruption in case of interaction
+        // with the download.
+        directoryHash = getSolrDirectoryHash(coreDir);
+
+        for (String fileName : commit.getFileNames()) {
+          // Note we add here all segment related files as well as the commit point's segments_N file
+          // Note commit points do not contain lock (write.lock) files.
+          builder.add(new CoreFileData(fileName, coreDir.fileLength(fileName)));
+        }
+      } finally {
+        core.getDirectoryFactory().release(coreDir);
+      }
+      files = builder.build();
+    } finally {
+      core.close();
+    }
+  }
+
+  public String getCoreName() {
+    return this.coreName;
+  }
+
+  public CoreContainer getCoreContainer() {
+    return this.container;
+  }
+
+  public String getDirectoryHash() {
+    return this.directoryHash;
+  }
+
+  public ImmutableCollection<CoreFileData> getFiles(){
+    return this.files;
+  }
+
+  /**
+   * Returns <code>true</code> if the contents of the directory passed into this method is identical to the contents of
+   * the directory of the Solr core of this instance, taken at instance creation time.<p>
+   *
+   * Passing in the Directory (expected to be the directory of the same core used during construction) because it seems
+   * safer than trying to get it again here...
+   */
+  public boolean isSameDirectoryContent(Directory coreDir) throws NoSuchAlgorithmException, IOException {
+    return directoryHash.equals(getSolrDirectoryHash(coreDir));
+  }
+
+  /**
+   * Computes a hash of a Solr Directory in order to make sure the directory doesn't change as we pull content into it (if we need to
+   * pull content into it)
+   */
+  private String getSolrDirectoryHash(Directory coreDir) throws NoSuchAlgorithmException, IOException {

Review comment:
   I would suggest using a [Merkle Tree](https://en.wikipedia.org/wiki/Merkle_tree) here, because it gives you a flexible and, if necessary, fine-grained view of which files changed. It can also be serialized to disk and stored on the blob store if needed. See the quick-and-dirty implementation I did here: https://gist.github.com/eribeiro/39ff8b73c43d453edd041bf1305425e0 (it is _really_ quick and dirty and slow, even though much of the time is dominated by the stream().map() call). The snippet outputs the tree as JSON, which can be pasted into https://www.sitepoint.com/demos/online-json-tree-viewer/ or https://vanya.jp.net/vtree/ to visualize it.
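   For illustration, a minimal sketch of what a Merkle-style hash over a Lucene `Directory` could look like (this is an assumption-laden sketch, not the PR's `getSolrDirectoryHash` nor the linked gist; the class name `DirectoryMerkleHash`, the use of SHA-256, and hashing only file name + length as leaf input are all choices made for the example):

   ```java
   import java.io.IOException;
   import java.nio.charset.StandardCharsets;
   import java.security.MessageDigest;
   import java.security.NoSuchAlgorithmException;
   import java.util.ArrayList;
   import java.util.Arrays;
   import java.util.List;

   import org.apache.commons.codec.binary.Hex;
   import org.apache.lucene.store.Directory;

   /**
    * Hypothetical sketch of a Merkle-style hash over the files of a Lucene Directory.
    * Leaves hash (file name, file length); parents hash the concatenation of their
    * children. Only the root is returned here, but keeping the intermediate levels
    * would allow pinpointing which files differ between two snapshots.
    */
   public final class DirectoryMerkleHash {

     public static String merkleRoot(Directory dir) throws IOException, NoSuchAlgorithmException {
       MessageDigest digest = MessageDigest.getInstance("SHA-256");

       // Sort file names so the resulting hash is independent of listing order.
       String[] fileNames = dir.listAll();
       Arrays.sort(fileNames);

       // Leaf level: one hash per file, covering its name and length.
       List<byte[]> level = new ArrayList<>();
       for (String fileName : fileNames) {
         digest.reset();
         digest.update(fileName.getBytes(StandardCharsets.UTF_8));
         digest.update(Long.toString(dir.fileLength(fileName)).getBytes(StandardCharsets.UTF_8));
         level.add(digest.digest());
       }
       if (level.isEmpty()) {
         return "";
       }

       // Combine pairs of hashes level by level until a single root remains.
       while (level.size() > 1) {
         List<byte[]> next = new ArrayList<>();
         for (int i = 0; i < level.size(); i += 2) {
           digest.reset();
           digest.update(level.get(i));
           if (i + 1 < level.size()) {
             digest.update(level.get(i + 1));
           }
           next.add(digest.digest());
         }
         level = next;
       }
       return Hex.encodeHexString(level.get(0));
     }
   }
   ```

   Compared to a single flat hash of the directory, the tree costs a few extra digest invocations but lets later code compare subtrees (or individual leaves) to see exactly which files changed; whether that granularity is worth it for this use case, where a simple "did anything change" answer may suffice, is up for discussion.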