This is an automated email from the ASF dual-hosted git repository. kturner pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/main by this push: new dd6c02d5d5 fixes #2567 corrects race condition in tablet metadata verification (#2574) dd6c02d5d5 is described below commit dd6c02d5d5aae78c8fa0a5fed2023354a4be4598 Author: Keith Turner <ktur...@apache.org> AuthorDate: Mon Apr 4 11:56:40 2022 -0400 fixes #2567 corrects race condition in tablet metadata verification (#2574) --- .../org/apache/accumulo/tserver/TabletServer.java | 7 +- .../accumulo/tserver/tablet/DatafileManager.java | 178 ++++++++++++--------- .../tserver/tablet/MetadataUpdateCount.java | 82 ++++++++++ .../org/apache/accumulo/tserver/tablet/Tablet.java | 70 ++++++-- 4 files changed, 247 insertions(+), 90 deletions(-) diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java index 8c33967e94..578d689504 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java @@ -153,6 +153,7 @@ import org.apache.accumulo.tserver.session.Session; import org.apache.accumulo.tserver.session.SessionManager; import org.apache.accumulo.tserver.tablet.BulkImportCacheCleaner; import org.apache.accumulo.tserver.tablet.CommitSession; +import org.apache.accumulo.tserver.tablet.MetadataUpdateCount; import org.apache.accumulo.tserver.tablet.Tablet; import org.apache.accumulo.tserver.tablet.TabletData; import org.apache.commons.collections4.map.LRUMap; @@ -818,9 +819,9 @@ public class TabletServer extends AbstractServer { watchCriticalFixedDelay(aconf, tabletCheckFrequency, () -> { final SortedMap<KeyExtent,Tablet> onlineTabletsSnapshot = onlineTablets.snapshot(); - Map<KeyExtent,Long> updateCounts = new HashMap<>(); + Map<KeyExtent,MetadataUpdateCount> updateCounts = new HashMap<>(); - // gather updateCounts for each tablet + // gather updateCounts for each tablet before reading tablet metadata onlineTabletsSnapshot.forEach((ke, tablet) -> { updateCounts.put(ke, tablet.getUpdateCount()); }); @@ -842,7 +843,7 @@ public class TabletServer extends AbstractServer { for (var tabletMetadata : tabletsMetadata) { KeyExtent extent = tabletMetadata.getExtent(); Tablet tablet = onlineTabletsSnapshot.get(extent); - Long counter = updateCounts.get(extent); + MetadataUpdateCount counter = updateCounts.get(extent); tablet.compareTabletInfo(counter, tabletMetadata); } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/DatafileManager.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/DatafileManager.java index 143fdd1109..ea5583deca 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/DatafileManager.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/DatafileManager.java @@ -33,6 +33,7 @@ import java.util.SortedMap; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; import org.apache.accumulo.core.conf.Property; import org.apache.accumulo.core.dataImpl.KeyExtent; @@ -69,13 +70,20 @@ class DatafileManager { // ensure we only have one reader/writer of our bulk file notes at a time private final Object bulkFileImportLock = new Object(); - // This must be incremented whenever datafileSizes is mutated - private long updateCount; + // This must be incremented before and after datafileSizes and metadata table updates. These + // counts allow detection of overlapping operations w/o placing a lock around metadata table + // updates and datafileSizes updates. There is a periodic metadata consistency check that runs in + // the tablet server against all tablets. This check compares what a tablet object has in memory + // to what is in the metadata table to ensure they are in agreement. Inorder to avoid false + // positives, when this consistency check runs its needs to know if it overlaps in time with any + // metadata updates made by the tablet. The consistency check uses these counts to know that. + private final AtomicReference<MetadataUpdateCount> metadataUpdateCount; DatafileManager(Tablet tablet, SortedMap<StoredTabletFile,DataFileValue> datafileSizes) { this.datafileSizes.putAll(datafileSizes); - this.updateCount = 0L; this.tablet = tablet; + this.metadataUpdateCount = + new AtomicReference<>(new MetadataUpdateCount(tablet.getExtent(), 0L, 0L)); } private final Set<TabletFile> filesToDeleteAfterScan = new HashSet<>(); @@ -246,22 +254,29 @@ class DatafileManager { } } - synchronized (tablet) { - for (Entry<StoredTabletFile,DataFileValue> tpath : newFiles.entrySet()) { - if (datafileSizes.containsKey(tpath.getKey())) { - log.error("Adding file that is already in set {}", tpath.getKey()); + // increment start count before metadata update AND updating in memory map of files + metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementStart); + // do not place any code here between above stmt and try{}finally + try { + synchronized (tablet) { + for (Entry<StoredTabletFile,DataFileValue> tpath : newFiles.entrySet()) { + if (datafileSizes.containsKey(tpath.getKey())) { + log.error("Adding file that is already in set {}", tpath.getKey()); + } + datafileSizes.put(tpath.getKey(), tpath.getValue()); } - datafileSizes.put(tpath.getKey(), tpath.getValue()); - } - updateCount++; - tablet.getTabletResources().importedMapFiles(); + tablet.getTabletResources().importedMapFiles(); - tablet.computeNumEntries(); - } + tablet.computeNumEntries(); + } - for (Entry<StoredTabletFile,DataFileValue> entry : newFiles.entrySet()) { - TabletLogger.bulkImported(tablet.getExtent(), entry.getKey()); + for (Entry<StoredTabletFile,DataFileValue> entry : newFiles.entrySet()) { + TabletLogger.bulkImported(tablet.getExtent(), entry.getKey()); + } + } finally { + // increment finish count after metadata update AND updating in memory map of files + metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementFinish); } return newFiles.keySet(); @@ -357,35 +372,43 @@ class DatafileManager { tablet.finishClearingUnusedLogs(); } - do { - try { - // the purpose of making this update use the new commit session, instead of the old one - // passed in, is because the new one will reference the logs used by current memory... + // increment start count before metadata update AND updating in memory map of files + metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementStart); + // do not place any code here between above stmt and try{}finally + try { - tablet.getTabletServer().minorCompactionFinished( - tablet.getTabletMemory().getCommitSession(), commitSession.getWALogSeq() + 2); - break; - } catch (IOException e) { - log.error("Failed to write to write-ahead log " + e.getMessage() + " will retry", e); - sleepUninterruptibly(1, TimeUnit.SECONDS); - } - } while (true); + do { + try { + // the purpose of making this update use the new commit session, instead of the old one + // passed in, is because the new one will reference the logs used by current memory... - synchronized (tablet) { - t1 = System.currentTimeMillis(); + tablet.getTabletServer().minorCompactionFinished( + tablet.getTabletMemory().getCommitSession(), commitSession.getWALogSeq() + 2); + break; + } catch (IOException e) { + log.error("Failed to write to write-ahead log " + e.getMessage() + " will retry", e); + sleepUninterruptibly(1, TimeUnit.SECONDS); + } + } while (true); + + synchronized (tablet) { + t1 = System.currentTimeMillis(); - if (dfv.getNumEntries() > 0 && newFile.isPresent()) { - StoredTabletFile newFileStored = newFile.get(); - if (datafileSizes.containsKey(newFileStored)) { - log.error("Adding file that is already in set {}", newFileStored); + if (dfv.getNumEntries() > 0 && newFile.isPresent()) { + StoredTabletFile newFileStored = newFile.get(); + if (datafileSizes.containsKey(newFileStored)) { + log.error("Adding file that is already in set {}", newFileStored); + } + datafileSizes.put(newFileStored, dfv); } - datafileSizes.put(newFileStored, dfv); - updateCount++; - } - tablet.flushComplete(flushId); + tablet.flushComplete(flushId); - t2 = System.currentTimeMillis(); + t2 = System.currentTimeMillis(); + } + } finally { + // increment finish count after metadata update AND updating in memory map of files + metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementFinish); } TabletLogger.flushed(tablet.getExtent(), newFile); @@ -431,49 +454,58 @@ class DatafileManager { Long compactionIdToWrite = null; - synchronized (tablet) { - t1 = System.currentTimeMillis(); - - Preconditions.checkState(datafileSizes.keySet().containsAll(oldDatafiles), - "Compacted files %s are not a subset of tablet files %s", oldDatafiles, - datafileSizes.keySet()); - if (dfv.getNumEntries() > 0) { - Preconditions.checkState(!datafileSizes.containsKey(newFile), - "New compaction file %s already exist in tablet files %s", newFile, + // increment start count before metadata update AND updating in memory map of files + metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementStart); + // do not place any code here between above stmt and try{}finally + try { + + synchronized (tablet) { + t1 = System.currentTimeMillis(); + + Preconditions.checkState(datafileSizes.keySet().containsAll(oldDatafiles), + "Compacted files %s are not a subset of tablet files %s", oldDatafiles, datafileSizes.keySet()); - } + if (dfv.getNumEntries() > 0) { + Preconditions.checkState(!datafileSizes.containsKey(newFile), + "New compaction file %s already exist in tablet files %s", newFile, + datafileSizes.keySet()); + } - tablet.incrementDataSourceDeletions(); + tablet.incrementDataSourceDeletions(); - datafileSizes.keySet().removeAll(oldDatafiles); + datafileSizes.keySet().removeAll(oldDatafiles); - if (dfv.getNumEntries() > 0) { - datafileSizes.put(newFile, dfv); - // could be used by a follow on compaction in a multipass compaction - } - updateCount++; + if (dfv.getNumEntries() > 0) { + datafileSizes.put(newFile, dfv); + // could be used by a follow on compaction in a multipass compaction + } - tablet.computeNumEntries(); + tablet.computeNumEntries(); - lastLocation = tablet.resetLastLocation(); + lastLocation = tablet.resetLastLocation(); - if (compactionId != null && Collections.disjoint(selectedFiles, datafileSizes.keySet())) { - compactionIdToWrite = compactionId; + if (compactionId != null && Collections.disjoint(selectedFiles, datafileSizes.keySet())) { + compactionIdToWrite = compactionId; + } + + t2 = System.currentTimeMillis(); } - t2 = System.currentTimeMillis(); - } + // known consistency issue between minor and major compactions - see ACCUMULO-18 + Set<StoredTabletFile> filesInUseByScans = waitForScansToFinish(oldDatafiles); + if (!filesInUseByScans.isEmpty()) + log.debug("Adding scan refs to metadata {} {}", extent, filesInUseByScans); + ManagerMetadataUtil.replaceDatafiles(tablet.getContext(), extent, oldDatafiles, + filesInUseByScans, newFile, compactionIdToWrite, dfv, + tablet.getTabletServer().getClientAddressString(), lastLocation, + tablet.getTabletServer().getLock(), ecid); + tablet.setLastCompactionID(compactionIdToWrite); + removeFilesAfterScan(filesInUseByScans); - // known consistency issue between minor and major compactions - see ACCUMULO-18 - Set<StoredTabletFile> filesInUseByScans = waitForScansToFinish(oldDatafiles); - if (!filesInUseByScans.isEmpty()) - log.debug("Adding scan refs to metadata {} {}", extent, filesInUseByScans); - ManagerMetadataUtil.replaceDatafiles(tablet.getContext(), extent, oldDatafiles, - filesInUseByScans, newFile, compactionIdToWrite, dfv, - tablet.getTabletServer().getClientAddressString(), lastLocation, - tablet.getTabletServer().getLock(), ecid); - tablet.setLastCompactionID(compactionIdToWrite); - removeFilesAfterScan(filesInUseByScans); + } finally { + // increment finish count after metadata update AND updating in memory map of files + metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementFinish); + } if (log.isTraceEnabled()) { log.trace(String.format("MajC finish lock %.2f secs", (t2 - t1) / 1000.0)); @@ -500,8 +532,8 @@ class DatafileManager { return datafileSizes.size(); } - public long getUpdateCount() { - return updateCount; + public MetadataUpdateCount getUpdateCount() { + return metadataUpdateCount.get(); } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MetadataUpdateCount.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MetadataUpdateCount.java new file mode 100644 index 0000000000..2ccd919519 --- /dev/null +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MetadataUpdateCount.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.tserver.tablet; + +import java.util.Objects; + +import org.apache.accumulo.core.dataImpl.KeyExtent; + +/** + * The tablet server does periodic consistency checks to see if what is in the metadata table agrees + * with what each tablet has in memory. When doing these checks its very important to know if the + * tablet severs read from the metadata table overlaps in time with any tablet metadata table + * updates. These counts allow that to be known. For example if these counts are acquired twice for + * a tablet and are the same both times it means that no metadata table updates occurred between the + * two acquisition times. + */ +public class MetadataUpdateCount { + private final KeyExtent extent; + private final long startedCount; + private final long finishedCount; + + MetadataUpdateCount(KeyExtent extent, long startedCount, long finishedCount) { + this.extent = Objects.requireNonNull(extent); + this.startedCount = startedCount; + this.finishedCount = finishedCount; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + MetadataUpdateCount that = (MetadataUpdateCount) o; + return startedCount == that.startedCount && finishedCount == that.finishedCount; + } + + @Override + public int hashCode() { + return Objects.hash(startedCount, finishedCount); + } + + public KeyExtent getExtent() { + return extent; + } + + /** + * @return true if the counters were acquired while a metadata table update was being made + */ + public boolean overlapsUpdate() { + return startedCount != finishedCount; + } + + public MetadataUpdateCount incrementStart() { + return new MetadataUpdateCount(extent, startedCount + 1, finishedCount); + } + + public MetadataUpdateCount incrementFinish() { + return new MetadataUpdateCount(extent, startedCount, finishedCount + 1); + } + + @Override + public String toString() { + return "[startedCount:" + startedCount + ",finishedCount:" + finishedCount + "]"; + } +} diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java index 07e21ed627..eefc71cfa5 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java @@ -1419,7 +1419,11 @@ public class Tablet { } }); - compareToDataInMemory(tabletMeta); + if (!tabletMeta.getFilesMap().equals(getDatafileManager().getDatafileSizes())) { + String msg = "Data files in " + extent + " differ from in-memory data " + + tabletMeta.getFilesMap() + " " + getDatafileManager().getDatafileSizes(); + log.error(msg); + } } catch (Exception e) { String msg = "Failed to do close consistency check for tablet " + extent; log.error(msg, e); @@ -1435,23 +1439,61 @@ public class Tablet { } } - private void compareToDataInMemory(TabletMetadata tabletMetadata) { - if (!tabletMetadata.getFilesMap().equals(getDatafileManager().getDatafileSizes())) { - String msg = "Data files in " + extent + " differ from in-memory data " - + tabletMetadata.getFilesMap() + " " + getDatafileManager().getDatafileSizes(); - log.error(msg); - } - } + /** + * Checks that tablet metadata from the metadata table matches what this tablet has in memory. The + * caller of this method must acquire the updateCounter parameter before acquiring the + * tabletMetadata. + * + * @param updateCounter + * used to check for conucurrent updates in which case this check is a no-op. See + * {@link #getUpdateCount()} + * @param tabletMetadata + * the metadata for this tablet that was acquired from the metadata table. + */ + public synchronized void compareTabletInfo(MetadataUpdateCount updateCounter, + TabletMetadata tabletMetadata) { + + // verify the given counter is for this tablet, if this check fail it indicates a bug in the + // calling code + Preconditions.checkArgument(updateCounter.getExtent().equals(getExtent()), + "Counter had unexpected extent %s != %s", updateCounter.getExtent(), getExtent()); + + // verify the given tablet metadata is for this tablet, if this check fail it indicates a bug in + // the calling code + Preconditions.checkArgument(tabletMetadata.getExtent().equals(getExtent()), + "Tablet metadata had unexpected extent %s != %s", tabletMetadata.getExtent(), getExtent()); + + // All of the log messages in this method have the AMCC acronym which means Accumulo Metadata + // Consistency Check. AMCC was added to the log messages to make grep/search for all log + // message from this method easy to find. - public synchronized void compareTabletInfo(Long updateCounter, TabletMetadata tabletMetadata) { if (isClosed() || isClosing()) { + log.trace("AMCC Tablet {} was closed, so skipping check", tabletMetadata.getExtent()); return; } - // if the counter didn't change, compare metadata to what is in memory - if (updateCounter == this.getUpdateCount()) { - this.compareToDataInMemory(tabletMetadata); + + var dataFileSizes = getDatafileManager().getDatafileSizes(); + + if (!tabletMetadata.getFilesMap().equals(dataFileSizes)) { + // The counters are modified outside of locks before and after tablet metadata operations and + // data file updates so, it's very important to acquire the 2nd counts after doing the + // equality check above. If the counts are the same (as the ones acquired before reading + // metadata table) after the equality check above then we know the tablet did not do any + // metadata updates while we were reading metadata and then comparing. + var latestCount = this.getUpdateCount(); + if (updateCounter.overlapsUpdate() || !updateCounter.equals(latestCount)) { + log.trace( + "AMCC Tablet {} may have been updating its metadata while it was being read for " + + "check, so skipping check {} {}", + tabletMetadata.getExtent(), updateCounter, latestCount); + } else { + log.error("Data files in {} differ from in-memory data {} {} {} {}", extent, + tabletMetadata.getFilesMap(), dataFileSizes, updateCounter, latestCount); + } + } else { + log.trace("AMCC Tablet {} files in memory are same as in metadata table {}", + tabletMetadata.getExtent(), updateCounter); } - // if counter did change, don't compare metadata and try again later } /** @@ -2264,7 +2306,7 @@ public class Tablet { return datafileManager; } - public synchronized long getUpdateCount() { + public MetadataUpdateCount getUpdateCount() { return getDatafileManager().getUpdateCount(); }