This is an automated email from the ASF dual-hosted git repository.

kturner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/main by this push:
     new dd6c02d5d5 fixes #2567 corrects race condition in tablet metadata 
verification (#2574)
dd6c02d5d5 is described below

commit dd6c02d5d5aae78c8fa0a5fed2023354a4be4598
Author: Keith Turner <ktur...@apache.org>
AuthorDate: Mon Apr 4 11:56:40 2022 -0400

    fixes #2567 corrects race condition in tablet metadata verification (#2574)
---
 .../org/apache/accumulo/tserver/TabletServer.java  |   7 +-
 .../accumulo/tserver/tablet/DatafileManager.java   | 178 ++++++++++++---------
 .../tserver/tablet/MetadataUpdateCount.java        |  82 ++++++++++
 .../org/apache/accumulo/tserver/tablet/Tablet.java |  70 ++++++--
 4 files changed, 247 insertions(+), 90 deletions(-)

diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
index 8c33967e94..578d689504 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
@@ -153,6 +153,7 @@ import org.apache.accumulo.tserver.session.Session;
 import org.apache.accumulo.tserver.session.SessionManager;
 import org.apache.accumulo.tserver.tablet.BulkImportCacheCleaner;
 import org.apache.accumulo.tserver.tablet.CommitSession;
+import org.apache.accumulo.tserver.tablet.MetadataUpdateCount;
 import org.apache.accumulo.tserver.tablet.Tablet;
 import org.apache.accumulo.tserver.tablet.TabletData;
 import org.apache.commons.collections4.map.LRUMap;
@@ -818,9 +819,9 @@ public class TabletServer extends AbstractServer {
     watchCriticalFixedDelay(aconf, tabletCheckFrequency, () -> {
       final SortedMap<KeyExtent,Tablet> onlineTabletsSnapshot = 
onlineTablets.snapshot();
 
-      Map<KeyExtent,Long> updateCounts = new HashMap<>();
+      Map<KeyExtent,MetadataUpdateCount> updateCounts = new HashMap<>();
 
-      // gather updateCounts for each tablet
+      // gather updateCounts for each tablet before reading tablet metadata
       onlineTabletsSnapshot.forEach((ke, tablet) -> {
         updateCounts.put(ke, tablet.getUpdateCount());
       });
@@ -842,7 +843,7 @@ public class TabletServer extends AbstractServer {
           for (var tabletMetadata : tabletsMetadata) {
             KeyExtent extent = tabletMetadata.getExtent();
             Tablet tablet = onlineTabletsSnapshot.get(extent);
-            Long counter = updateCounts.get(extent);
+            MetadataUpdateCount counter = updateCounts.get(extent);
             tablet.compareTabletInfo(counter, tabletMetadata);
           }
         }
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/DatafileManager.java
 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/DatafileManager.java
index 143fdd1109..ea5583deca 100644
--- 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/DatafileManager.java
+++ 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/DatafileManager.java
@@ -33,6 +33,7 @@ import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
 
 import org.apache.accumulo.core.conf.Property;
 import org.apache.accumulo.core.dataImpl.KeyExtent;
@@ -69,13 +70,20 @@ class DatafileManager {
   // ensure we only have one reader/writer of our bulk file notes at a time
   private final Object bulkFileImportLock = new Object();
 
-  // This must be incremented whenever datafileSizes is mutated
-  private long updateCount;
+  // This must be incremented before and after datafileSizes and metadata 
table updates. These
+  // counts allow detection of overlapping operations w/o placing a lock 
around metadata table
+  // updates and datafileSizes updates. There is a periodic metadata 
consistency check that runs in
+  // the tablet server against all tablets. This check compares what a tablet 
object has in memory
+  // to what is in the metadata table to ensure they are in agreement. Inorder 
to avoid false
+  // positives, when this consistency check runs its needs to know if it 
overlaps in time with any
+  // metadata updates made by the tablet. The consistency check uses these 
counts to know that.
+  private final AtomicReference<MetadataUpdateCount> metadataUpdateCount;
 
   DatafileManager(Tablet tablet, SortedMap<StoredTabletFile,DataFileValue> 
datafileSizes) {
     this.datafileSizes.putAll(datafileSizes);
-    this.updateCount = 0L;
     this.tablet = tablet;
+    this.metadataUpdateCount =
+        new AtomicReference<>(new MetadataUpdateCount(tablet.getExtent(), 0L, 
0L));
   }
 
   private final Set<TabletFile> filesToDeleteAfterScan = new HashSet<>();
@@ -246,22 +254,29 @@ class DatafileManager {
       }
     }
 
-    synchronized (tablet) {
-      for (Entry<StoredTabletFile,DataFileValue> tpath : newFiles.entrySet()) {
-        if (datafileSizes.containsKey(tpath.getKey())) {
-          log.error("Adding file that is already in set {}", tpath.getKey());
+    // increment start count before metadata update AND updating in memory map 
of files
+    metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementStart);
+    // do not place any code here between above stmt and try{}finally
+    try {
+      synchronized (tablet) {
+        for (Entry<StoredTabletFile,DataFileValue> tpath : 
newFiles.entrySet()) {
+          if (datafileSizes.containsKey(tpath.getKey())) {
+            log.error("Adding file that is already in set {}", tpath.getKey());
+          }
+          datafileSizes.put(tpath.getKey(), tpath.getValue());
         }
-        datafileSizes.put(tpath.getKey(), tpath.getValue());
-      }
-      updateCount++;
 
-      tablet.getTabletResources().importedMapFiles();
+        tablet.getTabletResources().importedMapFiles();
 
-      tablet.computeNumEntries();
-    }
+        tablet.computeNumEntries();
+      }
 
-    for (Entry<StoredTabletFile,DataFileValue> entry : newFiles.entrySet()) {
-      TabletLogger.bulkImported(tablet.getExtent(), entry.getKey());
+      for (Entry<StoredTabletFile,DataFileValue> entry : newFiles.entrySet()) {
+        TabletLogger.bulkImported(tablet.getExtent(), entry.getKey());
+      }
+    } finally {
+      // increment finish count after metadata update AND updating in memory 
map of files
+      metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementFinish);
     }
 
     return newFiles.keySet();
@@ -357,35 +372,43 @@ class DatafileManager {
       tablet.finishClearingUnusedLogs();
     }
 
-    do {
-      try {
-        // the purpose of making this update use the new commit session, 
instead of the old one
-        // passed in, is because the new one will reference the logs used by 
current memory...
+    // increment start count before metadata update AND updating in memory map 
of files
+    metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementStart);
+    // do not place any code here between above stmt and try{}finally
+    try {
 
-        tablet.getTabletServer().minorCompactionFinished(
-            tablet.getTabletMemory().getCommitSession(), 
commitSession.getWALogSeq() + 2);
-        break;
-      } catch (IOException e) {
-        log.error("Failed to write to write-ahead log " + e.getMessage() + " 
will retry", e);
-        sleepUninterruptibly(1, TimeUnit.SECONDS);
-      }
-    } while (true);
+      do {
+        try {
+          // the purpose of making this update use the new commit session, 
instead of the old one
+          // passed in, is because the new one will reference the logs used by 
current memory...
 
-    synchronized (tablet) {
-      t1 = System.currentTimeMillis();
+          tablet.getTabletServer().minorCompactionFinished(
+              tablet.getTabletMemory().getCommitSession(), 
commitSession.getWALogSeq() + 2);
+          break;
+        } catch (IOException e) {
+          log.error("Failed to write to write-ahead log " + e.getMessage() + " 
will retry", e);
+          sleepUninterruptibly(1, TimeUnit.SECONDS);
+        }
+      } while (true);
+
+      synchronized (tablet) {
+        t1 = System.currentTimeMillis();
 
-      if (dfv.getNumEntries() > 0 && newFile.isPresent()) {
-        StoredTabletFile newFileStored = newFile.get();
-        if (datafileSizes.containsKey(newFileStored)) {
-          log.error("Adding file that is already in set {}", newFileStored);
+        if (dfv.getNumEntries() > 0 && newFile.isPresent()) {
+          StoredTabletFile newFileStored = newFile.get();
+          if (datafileSizes.containsKey(newFileStored)) {
+            log.error("Adding file that is already in set {}", newFileStored);
+          }
+          datafileSizes.put(newFileStored, dfv);
         }
-        datafileSizes.put(newFileStored, dfv);
-        updateCount++;
-      }
 
-      tablet.flushComplete(flushId);
+        tablet.flushComplete(flushId);
 
-      t2 = System.currentTimeMillis();
+        t2 = System.currentTimeMillis();
+      }
+    } finally {
+      // increment finish count after metadata update AND updating in memory 
map of files
+      metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementFinish);
     }
 
     TabletLogger.flushed(tablet.getExtent(), newFile);
@@ -431,49 +454,58 @@ class DatafileManager {
 
     Long compactionIdToWrite = null;
 
-    synchronized (tablet) {
-      t1 = System.currentTimeMillis();
-
-      
Preconditions.checkState(datafileSizes.keySet().containsAll(oldDatafiles),
-          "Compacted files %s are not a subset of tablet files %s", 
oldDatafiles,
-          datafileSizes.keySet());
-      if (dfv.getNumEntries() > 0) {
-        Preconditions.checkState(!datafileSizes.containsKey(newFile),
-            "New compaction file %s already exist in tablet files %s", newFile,
+    // increment start count before metadata update AND updating in memory map 
of files
+    metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementStart);
+    // do not place any code here between above stmt and try{}finally
+    try {
+
+      synchronized (tablet) {
+        t1 = System.currentTimeMillis();
+
+        
Preconditions.checkState(datafileSizes.keySet().containsAll(oldDatafiles),
+            "Compacted files %s are not a subset of tablet files %s", 
oldDatafiles,
             datafileSizes.keySet());
-      }
+        if (dfv.getNumEntries() > 0) {
+          Preconditions.checkState(!datafileSizes.containsKey(newFile),
+              "New compaction file %s already exist in tablet files %s", 
newFile,
+              datafileSizes.keySet());
+        }
 
-      tablet.incrementDataSourceDeletions();
+        tablet.incrementDataSourceDeletions();
 
-      datafileSizes.keySet().removeAll(oldDatafiles);
+        datafileSizes.keySet().removeAll(oldDatafiles);
 
-      if (dfv.getNumEntries() > 0) {
-        datafileSizes.put(newFile, dfv);
-        // could be used by a follow on compaction in a multipass compaction
-      }
-      updateCount++;
+        if (dfv.getNumEntries() > 0) {
+          datafileSizes.put(newFile, dfv);
+          // could be used by a follow on compaction in a multipass compaction
+        }
 
-      tablet.computeNumEntries();
+        tablet.computeNumEntries();
 
-      lastLocation = tablet.resetLastLocation();
+        lastLocation = tablet.resetLastLocation();
 
-      if (compactionId != null && Collections.disjoint(selectedFiles, 
datafileSizes.keySet())) {
-        compactionIdToWrite = compactionId;
+        if (compactionId != null && Collections.disjoint(selectedFiles, 
datafileSizes.keySet())) {
+          compactionIdToWrite = compactionId;
+        }
+
+        t2 = System.currentTimeMillis();
       }
 
-      t2 = System.currentTimeMillis();
-    }
+      // known consistency issue between minor and major compactions - see 
ACCUMULO-18
+      Set<StoredTabletFile> filesInUseByScans = 
waitForScansToFinish(oldDatafiles);
+      if (!filesInUseByScans.isEmpty())
+        log.debug("Adding scan refs to metadata {} {}", extent, 
filesInUseByScans);
+      ManagerMetadataUtil.replaceDatafiles(tablet.getContext(), extent, 
oldDatafiles,
+          filesInUseByScans, newFile, compactionIdToWrite, dfv,
+          tablet.getTabletServer().getClientAddressString(), lastLocation,
+          tablet.getTabletServer().getLock(), ecid);
+      tablet.setLastCompactionID(compactionIdToWrite);
+      removeFilesAfterScan(filesInUseByScans);
 
-    // known consistency issue between minor and major compactions - see 
ACCUMULO-18
-    Set<StoredTabletFile> filesInUseByScans = 
waitForScansToFinish(oldDatafiles);
-    if (!filesInUseByScans.isEmpty())
-      log.debug("Adding scan refs to metadata {} {}", extent, 
filesInUseByScans);
-    ManagerMetadataUtil.replaceDatafiles(tablet.getContext(), extent, 
oldDatafiles,
-        filesInUseByScans, newFile, compactionIdToWrite, dfv,
-        tablet.getTabletServer().getClientAddressString(), lastLocation,
-        tablet.getTabletServer().getLock(), ecid);
-    tablet.setLastCompactionID(compactionIdToWrite);
-    removeFilesAfterScan(filesInUseByScans);
+    } finally {
+      // increment finish count after metadata update AND updating in memory 
map of files
+      metadataUpdateCount.updateAndGet(MetadataUpdateCount::incrementFinish);
+    }
 
     if (log.isTraceEnabled()) {
       log.trace(String.format("MajC finish lock %.2f secs", (t2 - t1) / 
1000.0));
@@ -500,8 +532,8 @@ class DatafileManager {
     return datafileSizes.size();
   }
 
-  public long getUpdateCount() {
-    return updateCount;
+  public MetadataUpdateCount getUpdateCount() {
+    return metadataUpdateCount.get();
   }
 
 }
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MetadataUpdateCount.java
 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MetadataUpdateCount.java
new file mode 100644
index 0000000000..2ccd919519
--- /dev/null
+++ 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MetadataUpdateCount.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.tserver.tablet;
+
+import java.util.Objects;
+
+import org.apache.accumulo.core.dataImpl.KeyExtent;
+
+/**
+ * The tablet server does periodic consistency checks to see if what is in the 
metadata table agrees
+ * with what each tablet has in memory. When doing these checks its very 
important to know if the
+ * tablet severs read from the metadata table overlaps in time with any tablet 
metadata table
+ * updates. These counts allow that to be known. For example if these counts 
are acquired twice for
+ * a tablet and are the same both times it means that no metadata table 
updates occurred between the
+ * two acquisition times.
+ */
+public class MetadataUpdateCount {
+  private final KeyExtent extent;
+  private final long startedCount;
+  private final long finishedCount;
+
+  MetadataUpdateCount(KeyExtent extent, long startedCount, long finishedCount) 
{
+    this.extent = Objects.requireNonNull(extent);
+    this.startedCount = startedCount;
+    this.finishedCount = finishedCount;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    MetadataUpdateCount that = (MetadataUpdateCount) o;
+    return startedCount == that.startedCount && finishedCount == 
that.finishedCount;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(startedCount, finishedCount);
+  }
+
+  public KeyExtent getExtent() {
+    return extent;
+  }
+
+  /**
+   * @return true if the counters were acquired while a metadata table update 
was being made
+   */
+  public boolean overlapsUpdate() {
+    return startedCount != finishedCount;
+  }
+
+  public MetadataUpdateCount incrementStart() {
+    return new MetadataUpdateCount(extent, startedCount + 1, finishedCount);
+  }
+
+  public MetadataUpdateCount incrementFinish() {
+    return new MetadataUpdateCount(extent, startedCount, finishedCount + 1);
+  }
+
+  @Override
+  public String toString() {
+    return "[startedCount:" + startedCount + ",finishedCount:" + finishedCount 
+ "]";
+  }
+}
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
index 07e21ed627..eefc71cfa5 100644
--- 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
+++ 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
@@ -1419,7 +1419,11 @@ public class Tablet {
         }
       });
 
-      compareToDataInMemory(tabletMeta);
+      if 
(!tabletMeta.getFilesMap().equals(getDatafileManager().getDatafileSizes())) {
+        String msg = "Data files in " + extent + " differ from in-memory data "
+            + tabletMeta.getFilesMap() + " " + 
getDatafileManager().getDatafileSizes();
+        log.error(msg);
+      }
     } catch (Exception e) {
       String msg = "Failed to do close consistency check for tablet " + extent;
       log.error(msg, e);
@@ -1435,23 +1439,61 @@ public class Tablet {
     }
   }
 
-  private void compareToDataInMemory(TabletMetadata tabletMetadata) {
-    if 
(!tabletMetadata.getFilesMap().equals(getDatafileManager().getDatafileSizes())) 
{
-      String msg = "Data files in " + extent + " differ from in-memory data "
-          + tabletMetadata.getFilesMap() + " " + 
getDatafileManager().getDatafileSizes();
-      log.error(msg);
-    }
-  }
+  /**
+   * Checks that tablet metadata from the metadata table matches what this 
tablet has in memory. The
+   * caller of this method must acquire the updateCounter parameter before 
acquiring the
+   * tabletMetadata.
+   *
+   * @param updateCounter
+   *          used to check for conucurrent updates in which case this check 
is a no-op. See
+   *          {@link #getUpdateCount()}
+   * @param tabletMetadata
+   *          the metadata for this tablet that was acquired from the metadata 
table.
+   */
+  public synchronized void compareTabletInfo(MetadataUpdateCount updateCounter,
+      TabletMetadata tabletMetadata) {
+
+    // verify the given counter is for this tablet, if this check fail it 
indicates a bug in the
+    // calling code
+    Preconditions.checkArgument(updateCounter.getExtent().equals(getExtent()),
+        "Counter had unexpected extent %s != %s", updateCounter.getExtent(), 
getExtent());
+
+    // verify the given tablet metadata is for this tablet, if this check fail 
it indicates a bug in
+    // the calling code
+    Preconditions.checkArgument(tabletMetadata.getExtent().equals(getExtent()),
+        "Tablet metadata had unexpected extent %s != %s", 
tabletMetadata.getExtent(), getExtent());
+
+    // All of the log messages in this method have the AMCC acronym which 
means Accumulo Metadata
+    // Consistency Check. AMCC was added to the log messages to make 
grep/search for all log
+    // message from this method easy to find.
 
-  public synchronized void compareTabletInfo(Long updateCounter, 
TabletMetadata tabletMetadata) {
     if (isClosed() || isClosing()) {
+      log.trace("AMCC Tablet {} was closed, so skipping check", 
tabletMetadata.getExtent());
       return;
     }
-    // if the counter didn't change, compare metadata to what is in memory
-    if (updateCounter == this.getUpdateCount()) {
-      this.compareToDataInMemory(tabletMetadata);
+
+    var dataFileSizes = getDatafileManager().getDatafileSizes();
+
+    if (!tabletMetadata.getFilesMap().equals(dataFileSizes)) {
+      // The counters are modified outside of locks before and after tablet 
metadata operations and
+      // data file updates so, it's very important to acquire the 2nd counts 
after doing the
+      // equality check above. If the counts are the same (as the ones 
acquired before reading
+      // metadata table) after the equality check above then we know the 
tablet did not do any
+      // metadata updates while we were reading metadata and then comparing.
+      var latestCount = this.getUpdateCount();
+      if (updateCounter.overlapsUpdate() || 
!updateCounter.equals(latestCount)) {
+        log.trace(
+            "AMCC Tablet {} may have been updating its metadata while it was 
being read for "
+                + "check, so skipping check {} {}",
+            tabletMetadata.getExtent(), updateCounter, latestCount);
+      } else {
+        log.error("Data files in {} differ from in-memory data {} {} {} {}", 
extent,
+            tabletMetadata.getFilesMap(), dataFileSizes, updateCounter, 
latestCount);
+      }
+    } else {
+      log.trace("AMCC Tablet {} files in memory are same as in metadata table 
{}",
+          tabletMetadata.getExtent(), updateCounter);
     }
-    // if counter did change, don't compare metadata and try again later
   }
 
   /**
@@ -2264,7 +2306,7 @@ public class Tablet {
     return datafileManager;
   }
 
-  public synchronized long getUpdateCount() {
+  public MetadataUpdateCount getUpdateCount() {
     return getDatafileManager().getUpdateCount();
   }
 

Reply via email to