This is an automated email from the ASF dual-hosted git repository. kturner pushed a commit to branch elasticity in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/elasticity by this push: new f586e7f9c9 avoid unnecessary metadata reads in client tablet cache (#4432) f586e7f9c9 is described below commit f586e7f9c9a82c0f01cba1ac279a7d583c7be3c4 Author: Keith Turner <ktur...@apache.org> AuthorDate: Fri Apr 5 12:05:21 2024 -0400 avoid unnecessary metadata reads in client tablet cache (#4432) For ondemand tablets the client tablet cache caches tablets w/o a location. There was a bug fixed in #4280 where the cache would do a metadata table lookup for each mutation when tablets had no location. The fix in #4280 only partially fixed the problem; after that change more metadata lookups than needed were still being done. Also there was a bug with the batchscanner that #4280 did not address. Before this change when tablets had no location, the batch scanner would do a metadata lookup for each range passed to the batch scanner (well, the client tablet cache would do these metadata lookups on behalf of the batch scanner). For example before this change if the batch scanner was given 10K ranges that all fell in a single tablet w/o a location, it would do 10K metadata lookups. After this change for that situation it will do a single metadata lookup. This change minimizes the metadata lookups done by the batch writer and batch scanner. The fix is to make sure that cached entries populated by looking up one range or mutation are used by subsequent range or mutation lookups, even if there is no location present. This is done by always reusing cache entries that were created after work started on a batch of mutations or ranges. Cache entries w/o a location that existed before work started on a batch are ignored. By reusing cache entries created after starting work on a batch we minimize metadata lookups. A test was also added to ensure the client tablet cache does not do excessive metadata table lookups. If this test had existed, it would have caught the problem. 
--- .../core/clientImpl/ClientTabletCache.java | 8 +- .../core/clientImpl/ClientTabletCacheImpl.java | 66 ++++++---- .../core/clientImpl/ClientTabletCacheImplTest.java | 137 +++++++++++++++++++++ 3 files changed, 181 insertions(+), 30 deletions(-) diff --git a/core/src/main/java/org/apache/accumulo/core/clientImpl/ClientTabletCache.java b/core/src/main/java/org/apache/accumulo/core/clientImpl/ClientTabletCache.java index 2564fb0063..a31ca2418b 100644 --- a/core/src/main/java/org/apache/accumulo/core/clientImpl/ClientTabletCache.java +++ b/core/src/main/java/org/apache/accumulo/core/clientImpl/ClientTabletCache.java @@ -20,7 +20,6 @@ package org.apache.accumulo.core.clientImpl; import static com.google.common.base.Preconditions.checkArgument; -import java.time.Duration; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -46,6 +45,7 @@ import org.apache.accumulo.core.singletons.SingletonManager; import org.apache.accumulo.core.singletons.SingletonService; import org.apache.accumulo.core.util.Interner; import org.apache.accumulo.core.util.UtilWaitThread; +import org.apache.accumulo.core.util.time.NanoTime; import org.apache.hadoop.io.Text; import com.google.common.base.Preconditions; @@ -311,7 +311,7 @@ public abstract class ClientTabletCache { private final TabletAvailability availability; private final boolean hostingRequested; - private final Long creationTime = System.nanoTime(); + private final NanoTime creationTime = NanoTime.now(); public CachedTablet(KeyExtent tablet_extent, String tablet_location, String session, TabletAvailability availability, boolean hostingRequested) { @@ -392,8 +392,8 @@ public abstract class ClientTabletCache { return this.availability; } - public Duration getAge() { - return Duration.ofNanos(System.nanoTime() - creationTime); + public NanoTime getCreationTime() { + return creationTime; } public boolean wasHostingRequested() { diff --git 
a/core/src/main/java/org/apache/accumulo/core/clientImpl/ClientTabletCacheImpl.java b/core/src/main/java/org/apache/accumulo/core/clientImpl/ClientTabletCacheImpl.java index 10fb3aa21e..eb66665225 100644 --- a/core/src/main/java/org/apache/accumulo/core/clientImpl/ClientTabletCacheImpl.java +++ b/core/src/main/java/org/apache/accumulo/core/clientImpl/ClientTabletCacheImpl.java @@ -60,6 +60,7 @@ import org.apache.accumulo.core.trace.TraceUtil; import org.apache.accumulo.core.util.OpTimer; import org.apache.accumulo.core.util.Pair; import org.apache.accumulo.core.util.TextUtil; +import org.apache.accumulo.core.util.time.NanoTime; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparator; import org.slf4j.Logger; @@ -234,23 +235,17 @@ public class ClientTabletCacheImpl extends ClientTabletCache { wLock.lock(); try { - CachedTablet lastTablet = null; + // Want to ignore any entries in the cache w/o a location that were created before the + // following time. Entries created after the following time may have been populated by the + // following loop, and we want to use those. + var cacheCutoff = NanoTime.now(); + for (T mutation : notInCache) { row.set(mutation.getRow()); - // ELASTICITY_TODO using lastTablet avoids doing a metadata table lookup per mutation. - // However this still does at least one metadata lookup per tablet. This is not as good as - // the pre-elasticity code that would lookup N tablets at once and use them to bin - // mutations. So there is further room for improvement in the way this code interacts with - // cache and metadata table. 
- CachedTablet tl; - if (lastTablet != null && lastTablet.getExtent().contains(row)) { - tl = lastTablet; - } else { - tl = _findTablet(context, row, false, false, false, lcSession, LocationNeed.REQUIRED); - lastTablet = tl; - } + CachedTablet tl = _findTablet(context, row, false, false, false, lcSession, + LocationNeed.REQUIRED, cacheCutoff); if (!addMutation(binnedMutations, mutation, tl, lcSession)) { failures.add(mutation); @@ -331,6 +326,11 @@ public class ClientTabletCacheImpl extends ClientTabletCache { List<Range> failures = new ArrayList<>(); List<CachedTablet> cachedTablets = new ArrayList<>(); + // Use anything in the cache w/o a location populated after this point in time. Cache entries + // w/o a location created before the following time should be ignored and the metadata table + // consulted. + var cacheCutoff = NanoTime.now(); + l1: for (Range range : ranges) { cachedTablets.clear(); @@ -348,7 +348,8 @@ public class ClientTabletCacheImpl extends ClientTabletCache { if (useCache) { tl = lcSession.checkLock(findTabletInCache(startRow)); } else { - tl = _findTablet(context, startRow, false, false, false, lcSession, locationNeed); + tl = _findTablet(context, startRow, false, false, false, lcSession, locationNeed, + cacheCutoff); } if (tl == null) { @@ -358,6 +359,7 @@ public class ClientTabletCacheImpl extends ClientTabletCache { cachedTablets.add(tl); + // a range may extend over multiple tablets, look for additional tablet that overlap the range while (tl.getExtent().endRow() != null && !range.afterEndKey(new Key(tl.getExtent().endRow()).followingKey(PartialKey.ROW))) { if (useCache) { @@ -366,7 +368,7 @@ public class ClientTabletCacheImpl extends ClientTabletCache { tl = lcSession.checkLock(findTabletInCache(row)); } else { tl = _findTablet(context, tl.getExtent().endRow(), true, false, false, lcSession, - locationNeed); + locationNeed, cacheCutoff); } if (tl == null) { @@ -560,7 +562,8 @@ public class ClientTabletCacheImpl extends ClientTabletCache 
{ } LockCheckerSession lcSession = new LockCheckerSession(); - CachedTablet tl = _findTablet(context, row, skipRow, false, true, lcSession, locationNeed); + CachedTablet tl = + _findTablet(context, row, skipRow, false, true, lcSession, locationNeed, NanoTime.now()); if (timer != null) { timer.stop(); @@ -610,14 +613,19 @@ public class ClientTabletCacheImpl extends ClientTabletCache { var currTablet = extent; + // Use anything in the cache w/o a location populated after this point in time. Cache entries + // w/o a location created before the following time should be ignored and the metadata table + // consulted. + var cacheCutoff = NanoTime.now(); + for (int i = 0; i < hostAheadCount; i++) { if (currTablet.endRow() == null || hostAheadRange .afterEndKey(new Key(currTablet.endRow()).followingKey(PartialKey.ROW))) { break; } - CachedTablet followingTablet = - _findTablet(context, currTablet.endRow(), true, false, true, lcSession, locationNeed); + CachedTablet followingTablet = _findTablet(context, currTablet.endRow(), true, false, true, + lcSession, locationNeed, cacheCutoff); if (followingTablet == null) { break; @@ -679,14 +687,14 @@ public class ClientTabletCacheImpl extends ClientTabletCache { List<TKeyExtent> extentsToBringOnline = new ArrayList<>(); for (var cachedTablet : tabletsWithNoLocation) { - if (cachedTablet.getAge().compareTo(STALE_DURATION) < 0) { + if (cachedTablet.getCreationTime().elapsed().compareTo(STALE_DURATION) < 0) { if (cachedTablet.getAvailability() == TabletAvailability.ONDEMAND) { if (!cachedTablet.wasHostingRequested()) { extentsToBringOnline.add(cachedTablet.getExtent().toThrift()); log.trace("requesting ondemand tablet to be hosted {}", cachedTablet.getExtent()); } else { log.trace("ignoring ondemand tablet that already has a hosting request in place {} {}", - cachedTablet.getExtent(), cachedTablet.getAge()); + cachedTablet.getExtent(), cachedTablet.getCreationTime().elapsed()); } } else if (cachedTablet.getAvailability() == 
TabletAvailability.UNHOSTED) { throw new InvalidTabletHostingRequestException("Extent " + cachedTablet.getExtent() @@ -855,10 +863,15 @@ public class ClientTabletCacheImpl extends ClientTabletCache { return null; } + /** + * @param cacheCutoff Tablets w/o locations are cached. When LocationNeed is REQUIRED, this cut + * off is used to determine if cached entries w/o a location should be used or of we should + * instead ignore them and reread the tablet information from the metadata table. + */ protected CachedTablet _findTablet(ClientContext context, Text row, boolean skipRow, - boolean retry, boolean lock, LockCheckerSession lcSession, LocationNeed locationNeed) - throws AccumuloException, AccumuloSecurityException, TableNotFoundException, - InvalidTabletHostingRequestException { + boolean retry, boolean lock, LockCheckerSession lcSession, LocationNeed locationNeed, + NanoTime cacheCutoff) throws AccumuloException, AccumuloSecurityException, + TableNotFoundException, InvalidTabletHostingRequestException { if (skipRow) { row = new Text(row); @@ -878,9 +891,10 @@ public class ClientTabletCacheImpl extends ClientTabletCache { tl = processInvalidatedAndCheckLock(context, lcSession, row); } - if (tl == null - || (locationNeed == LocationNeed.REQUIRED && tl.getTserverLocation().isEmpty())) { - // not in cache, so obtain info + if (tl == null || (locationNeed == LocationNeed.REQUIRED && tl.getTserverLocation().isEmpty() + && tl.getCreationTime().compareTo(cacheCutoff) < 0)) { + // not in cache OR the cached entry was created before the cut off time, so obtain info from + // metadata table if (lock) { wLock.lock(); try { diff --git a/core/src/test/java/org/apache/accumulo/core/clientImpl/ClientTabletCacheImplTest.java b/core/src/test/java/org/apache/accumulo/core/clientImpl/ClientTabletCacheImplTest.java index 4b13ac638a..57350292ea 100644 --- a/core/src/test/java/org/apache/accumulo/core/clientImpl/ClientTabletCacheImplTest.java +++ 
b/core/src/test/java/org/apache/accumulo/core/clientImpl/ClientTabletCacheImplTest.java @@ -25,6 +25,7 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; import java.util.ArrayList; import java.util.Collections; @@ -36,6 +37,7 @@ import java.util.Map.Entry; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; +import java.util.function.BiConsumer; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.TableNotFoundException; @@ -493,20 +495,25 @@ public class ClientTabletCacheImplTest { static class TServers { private final Map<String,Map<KeyExtent,SortedMap<Key,Value>>> tservers = new HashMap<>(); + private BiConsumer<CachedTablet,Text> lookupConsumer = (cachedTablet, row) -> {}; } static class TestCachedTabletObtainer implements CachedTabletObtainer { private final Map<String,Map<KeyExtent,SortedMap<Key,Value>>> tservers; + private final BiConsumer<CachedTablet,Text> lookupConsumer; TestCachedTabletObtainer(TServers tservers) { this.tservers = tservers.tservers; + this.lookupConsumer = tservers.lookupConsumer; } @Override public CachedTablets lookupTablet(ClientContext context, CachedTablet src, Text row, Text stopRow, ClientTabletCache parent) { + lookupConsumer.accept(src, row); + Map<KeyExtent,SortedMap<Key,Value>> tablets = tservers.get(src.getTserverLocation().orElseThrow()); @@ -1835,4 +1842,134 @@ public class ClientTabletCacheImplTest { } + /** + * This test ensures the cache does not query the metadata table more than is needed. This + * behavior is very important for performance. 
+ */ + @Test + public void testLookupCounts() throws Exception { + + List<KeyExtent> lookups = new ArrayList<>(); + TServers tservers = new TServers(); + tservers.lookupConsumer = (src, row) -> lookups.add(src.getExtent()); + + ClientTabletCacheImpl metaCache = createLocators(tservers, "tserver1", "tserver2", "foo"); + + var ke1 = createNewKeyExtent("foo", "g", null); + var ke2 = createNewKeyExtent("foo", "m", "g"); + var ke3 = createNewKeyExtent("foo", "r", "m"); + var ke4 = createNewKeyExtent("foo", null, "r"); + + setLocation(tservers, "tserver2", METADATA_TABLE_EXTENT, ke1, null, null); + setLocation(tservers, "tserver2", METADATA_TABLE_EXTENT, ke2, null, null); + setLocation(tservers, "tserver2", METADATA_TABLE_EXTENT, ke3, null, null); + setLocation(tservers, "tserver2", METADATA_TABLE_EXTENT, ke4, null, null); + + List<Mutation> ml = new ArrayList<>(); + for (char c = 'a'; c <= 'z'; c++) { + ml.add(createNewMutation("" + c, "cf1:cq1=v3", "cf1:cq2=v4")); + } + Map<String,TabletServerMutations<Mutation>> binnedMutations = new HashMap<>(); + List<Mutation> afailures = new ArrayList<>(); + assertEquals(List.of(), lookups); + metaCache.binMutations(context, ml, binnedMutations, afailures); + assertTrue(binnedMutations.isEmpty()); + assertEquals(ml.size(), afailures.size()); + // since all tablets are in the same metadata table, should only see one lookup to the metadata + // table + assertEquals(List.of(ROOT_TABLE_EXTENT, METADATA_TABLE_EXTENT), lookups); + + // since tablets had no locations, attempting to binMutations again should go back to the + // metadata table once + lookups.clear(); + binnedMutations.clear(); + afailures.clear(); + metaCache.binMutations(context, ml, binnedMutations, afailures); + assertTrue(binnedMutations.isEmpty()); + assertEquals(ml.size(), afailures.size()); + assertEquals(List.of(METADATA_TABLE_EXTENT), lookups); + + // test binning ranges + var range1 = createNewRange("a", "c"); + var range2 = createNewRange("h", "o"); + var range3 
= createNewRange("x", "z"); + var ranges = List.of(range1, range2, range3); + + // binning ranges should do a single metadata lookup since location is required and what is + // currently cached has no location + BiConsumer<CachedTablet,Range> rangeConsumer = + (t, r) -> fail("Tablet have no locations, so should not bin"); + lookups.clear(); + var failures = metaCache.findTablets(context, ranges, rangeConsumer, LocationNeed.REQUIRED); + assertEquals(3, failures.size()); + assertEquals(List.of(METADATA_TABLE_EXTENT), lookups); + + // should do a single metadata lookup since the cached entries have no location + lookups.clear(); + failures = metaCache.findTablets(context, ranges, rangeConsumer, LocationNeed.REQUIRED); + assertEquals(3, failures.size()); + assertEquals(List.of(METADATA_TABLE_EXTENT), lookups); + + // since location is not required and the currently cached entries have no locations the + // following should not do metadata lookups + Map<KeyExtent,List<Range>> seen = new HashMap<>(); + rangeConsumer = (t, r) -> { + seen.computeIfAbsent(t.getExtent(), k -> new ArrayList<>()).add(r); + assertTrue(t.getTserverLocation().isEmpty()); + }; + lookups.clear(); + failures = metaCache.findTablets(context, ranges, rangeConsumer, LocationNeed.NOT_REQUIRED); + assertEquals(List.of(), lookups); + assertEquals(Map.of(ke1, List.of(range1), ke2, List.of(range2), ke3, List.of(range2), ke4, + List.of(range3)), seen); + assertEquals(0, failures.size()); + + // set locations + setLocation(tservers, "tserver2", METADATA_TABLE_EXTENT, ke1, "T1", "I1"); + setLocation(tservers, "tserver2", METADATA_TABLE_EXTENT, ke2, "T2", "I2"); + setLocation(tservers, "tserver2", METADATA_TABLE_EXTENT, ke3, "T3", "I3"); + setLocation(tservers, "tserver2", METADATA_TABLE_EXTENT, ke4, "T3", "I3"); + + // now that locations are set, all mutations should bin.. 
should only do one metadata lookup to + // get the locations + lookups.clear(); + binnedMutations.clear(); + afailures.clear(); + assertEquals(List.of(), lookups); + metaCache.binMutations(context, ml, binnedMutations, afailures); + assertEquals(Set.of("T1", "T2", "T3"), binnedMutations.keySet()); + assertEquals(7, binnedMutations.get("T1").getMutations().get(ke1).size()); + assertEquals(6, binnedMutations.get("T2").getMutations().get(ke2).size()); + assertEquals(5, binnedMutations.get("T3").getMutations().get(ke3).size()); + assertEquals(8, binnedMutations.get("T3").getMutations().get(ke4).size()); + assertEquals(0, afailures.size()); + assertEquals(List.of(METADATA_TABLE_EXTENT), lookups); + + // binning mutations again should do zero metadata lookups + lookups.clear(); + binnedMutations.clear(); + afailures.clear(); + metaCache.binMutations(context, ml, binnedMutations, afailures); + assertEquals(Set.of("T1", "T2", "T3"), binnedMutations.keySet()); + assertEquals(7, binnedMutations.get("T1").getMutations().get(ke1).size()); + assertEquals(6, binnedMutations.get("T2").getMutations().get(ke2).size()); + assertEquals(5, binnedMutations.get("T3").getMutations().get(ke3).size()); + assertEquals(8, binnedMutations.get("T3").getMutations().get(ke4).size()); + assertEquals(0, afailures.size()); + assertEquals(List.of(), lookups); + + // since the cached entries have locations there should be no metadata lookups when binning + // ranges + seen.clear(); + rangeConsumer = (t, r) -> { + seen.computeIfAbsent(t.getExtent(), k -> new ArrayList<>()).add(r); + assertTrue(t.getTserverLocation().isPresent()); + }; + lookups.clear(); + failures = metaCache.findTablets(context, ranges, rangeConsumer, LocationNeed.REQUIRED); + assertEquals(List.of(), lookups); + assertEquals(Map.of(ke1, List.of(range1), ke2, List.of(range2), ke3, List.of(range2), ke4, + List.of(range3)), seen); + assertEquals(0, failures.size()); + } }