This is an automated email from the ASF dual-hosted git repository.

dlmarion pushed a commit to branch elasticity
in repository https://gitbox.apache.org/repos/asf/accumulo.git
commit 7facf2f35556247dbf847d854fe995d5c20ad105 Merge: d7264bc1d0 aada55ef50 Author: Dave Marion <dlmar...@apache.org> AuthorDate: Fri May 24 17:43:12 2024 +0000 Merge branch 'main' into elasticity core/pom.xml | 16 ++ .../accumulo/core/logging/ConditionalLogger.java | 194 +++++++++++++++++++++ .../core/logging/DeduplicatingLoggerTest.java | 69 ++++++++ .../core/logging/EscalatingLoggerTest.java | 77 ++++++++ .../accumulo/manager/TabletGroupWatcher.java | 11 +- .../accumulo/tserver/UnloadTabletHandler.java | 1 - .../org/apache/accumulo/tserver/tablet/Tablet.java | 22 ++- 7 files changed, 386 insertions(+), 4 deletions(-) diff --cc server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java index 1b41145fa8,443df6c8f3..9299aab1be --- a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java @@@ -20,13 -20,10 +20,14 @@@ package org.apache.accumulo.manager import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly; import static java.lang.Math.min; +import static java.util.Objects.requireNonNull; +import static org.apache.accumulo.core.metadata.schema.TabletMetadata.ColumnType.FILES; +import static org.apache.accumulo.core.metadata.schema.TabletMetadata.ColumnType.LOGS; import java.io.IOException; + import java.time.Duration; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@@ -56,11 -56,10 +57,12 @@@ import org.apache.accumulo.core.data.Ra import org.apache.accumulo.core.data.TableId; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.dataImpl.KeyExtent; -import org.apache.accumulo.core.gc.ReferenceFile; + import org.apache.accumulo.core.logging.ConditionalLogger.EscalatingLogger; import org.apache.accumulo.core.logging.TabletLogger; +import org.apache.accumulo.core.manager.state.TabletManagement; +import org.apache.accumulo.core.manager.state.TabletManagement.ManagementAction; import org.apache.accumulo.core.manager.state.tables.TableState; +import org.apache.accumulo.core.manager.thrift.ManagerGoalState; import org.apache.accumulo.core.manager.thrift.ManagerState; import org.apache.accumulo.core.manager.thrift.TabletServerStatus; import org.apache.accumulo.core.metadata.AccumuloTable; @@@ -100,31 -110,18 +102,36 @@@ import org.apache.hadoop.fs.Path import org.apache.hadoop.io.Text; import org.apache.thrift.TException; import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.slf4j.event.Level; -import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableSortedSet; import com.google.common.collect.Iterators; abstract class TabletGroupWatcher extends AccumuloDaemonThread { + public static class BadLocationStateException extends Exception { + private static final long serialVersionUID = 2L; + + // store as byte array because Text isn't Serializable + private final byte[] metadataTableEntry; + + public BadLocationStateException(String msg, Text row) { + super(msg); + this.metadataTableEntry = TextUtil.getBytes(requireNonNull(row)); + } + + public Text getEncodedEndRow() { + return new Text(metadataTableEntry); + } + } + + private static final Logger LOG = LoggerFactory.getLogger(TabletGroupWatcher.class); ++ + private static final Logger TABLET_UNLOAD_LOGGER = + new EscalatingLogger(Manager.log, Duration.ofMinutes(5), 1000, 
Level.INFO); ++ private final Manager manager; private final TabletStateStore store; private final TabletGroupWatcher dependentWatcher; @@@ -222,536 -182,203 +229,536 @@@ } } - @Override - public void run() { - int[] oldCounts = new int[TabletState.values().length]; - EventCoordinator.Listener eventListener = this.manager.nextEvent.getListener(); + class EventHandler implements EventCoordinator.Listener { - WalStateManager wals = new WalStateManager(manager.getContext()); + // Setting this to true to start with because its not know what happended before this object was + // created, so just start off with full scan. + private boolean needsFullScan = true; - while (manager.stillManager()) { - // slow things down a little, otherwise we spam the logs when there are many wake-up events - sleepUninterruptibly(100, TimeUnit.MILLISECONDS); + private final BlockingQueue<Range> rangesToProcess; - final long waitTimeBetweenScans = manager.getConfiguration() - .getTimeInMillis(Property.MANAGER_TABLET_GROUP_WATCHER_INTERVAL); + class RangeProccessor implements Runnable { + @Override + public void run() { + try { + while (manager.stillManager()) { + var range = rangesToProcess.poll(100, TimeUnit.MILLISECONDS); + if (range == null) { + // check to see if still the manager + continue; + } - int totalUnloaded = 0; - int unloaded = 0; - ClosableIterator<TabletLocationState> iter = null; - try { - Map<TableId,MergeStats> mergeStatsCache = new HashMap<>(); - Map<TableId,MergeStats> currentMerges = new HashMap<>(); - for (MergeInfo merge : manager.merges()) { - if (merge.getExtent() != null) { - currentMerges.put(merge.getExtent().tableId(), new MergeStats(merge)); + ArrayList<Range> ranges = new ArrayList<>(); + ranges.add(range); + + rangesToProcess.drainTo(ranges); + + if (!processRanges(ranges)) { + setNeedsFullScan(); + } } + } catch (InterruptedException e) { + throw new RuntimeException(e); } + } + } - // Get the current status for the current list of tservers - SortedMap<TServerInstance,TabletServerStatus> currentTServers = new TreeMap<>(); - for (TServerInstance entry : manager.tserverSet.getCurrentServers()) { - currentTServers.put(entry, manager.tserverStatus.get(entry)); - } + EventHandler() { + rangesToProcess = new ArrayBlockingQueue<>(3000); - if (currentTServers.isEmpty()) { - eventListener.waitForEvents(waitTimeBetweenScans); - synchronized (this) { - lastScanServers = Collections.emptySortedSet(); + Threads + .createThread("TGW [" + store.name() + "] event range processor", new RangeProccessor()) + .start(); + } + + private synchronized void setNeedsFullScan() { + needsFullScan = true; + notifyAll(); + } + + public synchronized void clearNeedsFullScan() { + needsFullScan = false; + } + + public synchronized boolean isNeedsFullScan() { + return needsFullScan; + } + + @Override + public void process(EventCoordinator.Event event) { + + switch (event.getScope()) { + case ALL: + case DATA_LEVEL: + setNeedsFullScan(); + break; + case TABLE: + case TABLE_RANGE: + if (!rangesToProcess.offer(event.getExtent().toMetaRange())) { + Manager.log.debug("[{}] unable to process event range {} because queue is full", + store.name(), event.getExtent()); + setNeedsFullScan(); } - continue; + break; + default: + throw new IllegalArgumentException("Unhandled scope " + event.getScope()); + } + } + + synchronized void waitForFullScan(long millis) { + if (!needsFullScan) { + try { + wait(millis); + } catch (InterruptedException e) { + throw new RuntimeException(e); } + } + } + } - TabletLists tLists = new 
TabletLists(manager, currentTServers); + private boolean processRanges(List<Range> ranges) { + if (manager.getManagerGoalState() == ManagerGoalState.CLEAN_STOP) { + return false; + } - ManagerState managerState = manager.getManagerState(); - int[] counts = new int[TabletState.values().length]; - stats.begin(); - // Walk through the tablets in our store, and work tablets - // towards their goal - iter = store.iterator(); - while (iter.hasNext()) { - TabletLocationState tls = iter.next(); - if (tls == null) { - continue; - } + TabletManagementParameters tabletMgmtParams = createTabletManagementParameters(false); - // ignore entries for tables that do not exist in zookeeper - if (manager.getTableManager().getTableState(tls.extent.tableId()) == null) { - continue; - } + var currentTservers = getCurrentTservers(tabletMgmtParams.getOnlineTsevers()); + if (currentTservers.isEmpty()) { + return false; + } - // Don't overwhelm the tablet servers with work - if (tLists.unassigned.size() + unloaded - > Manager.MAX_TSERVER_WORK_CHUNK * currentTServers.size()) { - flushChanges(tLists, wals); - tLists.reset(); - unloaded = 0; - eventListener.waitForEvents(waitTimeBetweenScans); - } - TableId tableId = tls.extent.tableId(); - TableConfiguration tableConf = manager.getContext().getTableConfiguration(tableId); - - MergeStats mergeStats = mergeStatsCache.computeIfAbsent(tableId, k -> { - var mStats = currentMerges.get(k); - return mStats != null ? mStats : new MergeStats(new MergeInfo()); - }); - TabletGoalState goal = manager.getGoalState(tls, mergeStats.getMergeInfo()); - Location location = tls.getLocation(); - TabletState state = tls.getState(currentTServers.keySet()); - - TabletLogger.missassigned(tls.extent, goal.toString(), state.toString(), - tls.getFutureServer(), tls.getCurrentServer(), tls.walogs.size()); - - stats.update(tableId, state); - mergeStats.update(tls.extent, state); - - // Always follow through with assignments - if (state == TabletState.ASSIGNED) { - goal = TabletGoalState.HOSTED; + try (var iter = store.iterator(ranges, tabletMgmtParams)) { + long t1 = System.currentTimeMillis(); + manageTablets(iter, tabletMgmtParams, currentTservers, false); + long t2 = System.currentTimeMillis(); + Manager.log.debug(String.format("[%s]: partial scan time %.2f seconds for %,d ranges", + store.name(), (t2 - t1) / 1000., ranges.size())); + } catch (Exception e) { + Manager.log.error("Error processing {} ranges for store {} ", ranges.size(), store.name(), e); + } + + return true; + } + + private final Set<KeyExtent> hostingRequestInProgress = new ConcurrentSkipListSet<>(); + + public void hostOndemand(Collection<KeyExtent> extents) { + // This is only expected to be called for the user level + Preconditions.checkState(getLevel() == Ample.DataLevel.USER); + + final List<KeyExtent> inProgress = new ArrayList<>(); + extents.forEach(ke -> { + if (hostingRequestInProgress.add(ke)) { + LOG.info("Tablet hosting requested for: {} ", ke); + inProgress.add(ke); + } else { + LOG.trace("Ignoring hosting request because another thread is currently processing it {}", + ke); + } + }); + // Do not add any code here, it may interfere with the finally block removing extents from + // hostingRequestInProgress + try (var mutator = manager.getContext().getAmple().conditionallyMutateTablets()) { + inProgress.forEach(ke -> { + mutator.mutateTablet(ke).requireAbsentOperation() + .requireTabletAvailability(TabletAvailability.ONDEMAND).requireAbsentLocation() + 
.setHostingRequested().submit(TabletMetadata::getHostingRequested); + + }); + + List<Range> ranges = new ArrayList<>(); + + mutator.process().forEach((extent, result) -> { + if (result.getStatus() == Ample.ConditionalResult.Status.ACCEPTED) { + // cache this success for a bit + ranges.add(extent.toMetaRange()); + } else { + if (LOG.isTraceEnabled()) { + // only read the metadata if the logging is enabled + LOG.trace("Failed to set hosting request {}", result.readMetadata()); } - if (Manager.log.isTraceEnabled()) { - Manager.log.trace( - "[{}] Shutting down all Tservers: {}, dependentCount: {} Extent: {}, state: {}, goal: {}", - store.name(), manager.serversToShutdown.equals(currentTServers.keySet()), - dependentWatcher == null ? "null" : dependentWatcher.assignedOrHosted(), tls.extent, - state, goal); + } + }); + + processRanges(ranges); + } finally { + inProgress.forEach(hostingRequestInProgress::remove); + } + } + + private TabletManagementParameters + createTabletManagementParameters(boolean lookForTabletsNeedingVolReplacement) { + + HashMap<Ample.DataLevel,Boolean> parentLevelUpgrade = new HashMap<>(); + UpgradeCoordinator.UpgradeStatus upgradeStatus = manager.getUpgradeStatus(); + for (var level : Ample.DataLevel.values()) { + parentLevelUpgrade.put(level, upgradeStatus.isParentLevelUpgraded(level)); + } + + Set<TServerInstance> shutdownServers; + if (store.getLevel() == Ample.DataLevel.USER) { + shutdownServers = manager.shutdownServers(); + } else { + // Use the servers to shutdown filtered by the dependent watcher. These are servers to + // shutdown that the dependent watcher has determined it has no tablets hosted on or assigned + // to. + shutdownServers = dependentWatcher.getFilteredServersToShutdown(); + } + + var tServersSnapshot = manager.tserversSnapshot(); + + return new TabletManagementParameters(manager.getManagerState(), parentLevelUpgrade, + manager.onlineTables(), tServersSnapshot, shutdownServers, manager.migrationsSnapshot(), + store.getLevel(), manager.getCompactionHints(store.getLevel()), canSuspendTablets(), + lookForTabletsNeedingVolReplacement ? manager.getContext().getVolumeReplacements() + : Map.of(), + manager.getSteadyTime()); + } + + private Set<TServerInstance> getFilteredServersToShutdown() { + return filteredServersToShutdown; + } + + private static class TableMgmtStats { + int[] counts = new int[TabletState.values().length]; + private int totalUnloaded; + private long totalVolumeReplacements; + private int tabletsWithErrors; + } + + private TableMgmtStats manageTablets(Iterator<TabletManagement> iter, + TabletManagementParameters tableMgmtParams, + SortedMap<TServerInstance,TabletServerStatus> currentTServers, boolean isFullScan) + throws BadLocationStateException, TException, DistributedStoreException, WalMarkerException, + IOException { + + final TableMgmtStats tableMgmtStats = new TableMgmtStats(); + final boolean shuttingDownAllTabletServers = + tableMgmtParams.getServersToShutdown().equals(currentTServers.keySet()); + if (shuttingDownAllTabletServers && !isFullScan) { + // If we are shutting down all of the TabletServers, then don't process any events + // from the EventCoordinator. 
+ LOG.debug("Partial scan requested, but aborted due to shutdown of all TabletServers"); + return tableMgmtStats; + } + + int unloaded = 0; + + TabletLists tLists = new TabletLists(currentTServers, tableMgmtParams.getGroupedTServers(), + tableMgmtParams.getServersToShutdown()); + + CompactionJobGenerator compactionGenerator = + new CompactionJobGenerator(new ServiceEnvironmentImpl(manager.getContext()), + tableMgmtParams.getCompactionHints(), tableMgmtParams.getSteadyTime()); + + Set<TServerInstance> filteredServersToShutdown = + new HashSet<>(tableMgmtParams.getServersToShutdown()); + + while (iter.hasNext()) { + final TabletManagement mti = iter.next(); + if (mti == null) { + throw new IllegalStateException("State store returned a null ManagerTabletInfo object"); + } + + final TabletMetadata tm = mti.getTabletMetadata(); + + final String mtiError = mti.getErrorMessage(); + if (mtiError != null) { + // An error happened on the TabletServer in the TabletManagementIterator + // when trying to process this extent. + LOG.warn( + "Error on TabletServer trying to get Tablet management information for extent: {}. Error message: {}", + tm.getExtent(), mtiError); + this.metrics.incrementTabletGroupWatcherError(this.store.getLevel()); + tableMgmtStats.tabletsWithErrors++; + continue; + } + + final TableId tableId = tm.getTableId(); + // ignore entries for tables that do not exist in zookeeper + if (manager.getTableManager().getTableState(tableId) == null) { + continue; + } + + // Don't overwhelm the tablet servers with work + if (tLists.unassigned.size() + unloaded + > Manager.MAX_TSERVER_WORK_CHUNK * currentTServers.size() + || tLists.volumeReplacements.size() > 1000) { + flushChanges(tLists); + tLists.reset(); + unloaded = 0; + } + + final TableConfiguration tableConf = manager.getContext().getTableConfiguration(tableId); + + TabletState state = TabletState.compute(tm, currentTServers.keySet()); + if (state == TabletState.ASSIGNED_TO_DEAD_SERVER) { + /* + * This code exists to deal with a race condition caused by two threads running in this + * class that compute tablets actions. One thread does full scans and the other reacts to + * events and does partial scans. Below is an example of the race condition this is + * handling. + * + * - TGW Thread 1 : reads the set of tablets servers and its empty + * + * - TGW Thread 2 : reads the set of tablet servers and its [TS1] + * + * - TGW Thread 2 : Sees tabletX without a location and assigns it to TS1 + * + * - TGW Thread 1 : Sees tabletX assigned to TS1 and assumes it's assigned to a dead tablet + * server because its set of live servers is the empty set. + * + * To deal with this race condition, this code recomputes the tablet state using the latest + * tservers when a tablet is seen assigned to a dead tserver. + */ + + TabletState newState = TabletState.compute(tm, manager.tserversSnapshot().getTservers()); + if (newState != state) { + LOG.debug("Tablet state changed when using latest set of tservers {} {} {}", + tm.getExtent(), state, newState); + state = newState; + } + } + tableMgmtStats.counts[state.ordinal()]++; + + // This is final because nothing in this method should change the goal. All computation of the + // goal should be done in TabletGoalState.compute() so that all parts of the Accumulo code + // will compute a consistent goal. 
+ final TabletGoalState goal = + TabletGoalState.compute(tm, state, manager.tabletBalancer, tableMgmtParams); + + final Set<ManagementAction> actions = mti.getActions(); + + if (actions.contains(ManagementAction.NEEDS_RECOVERY) && goal != TabletGoalState.HOSTED) { + LOG.warn("Tablet has wals, but goal is not hosted. Tablet: {}, goal:{}", tm.getExtent(), + goal); + } + + if (actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)) { + tableMgmtStats.totalVolumeReplacements++; + if (state == TabletState.UNASSIGNED || state == TabletState.SUSPENDED) { + var volRep = + VolumeUtil.computeVolumeReplacements(tableMgmtParams.getVolumeReplacements(), tm); + if (volRep.logsToRemove.size() + volRep.filesToRemove.size() > 0) { + if (tm.getLocation() != null) { + // since the totalVolumeReplacements counter was incremented, should try this again + // later after its unassigned + LOG.debug("Volume replacement needed for {} but it has a location {}.", + tm.getExtent(), tm.getLocation()); + } else if (tm.getOperationId() != null) { + LOG.debug("Volume replacement needed for {} but it has an active operation {}.", + tm.getExtent(), tm.getOperationId()); + } else { + LOG.debug("Volume replacement needed for {}.", tm.getExtent()); + // buffer replacements so that multiple mutations can be done at once + tLists.volumeReplacements.add(volRep); + } + } else { + LOG.debug("Volume replacement evaluation for {} returned no changes.", tm.getExtent()); } + } else { + LOG.debug("Volume replacement needed for {} but its tablet state is {}.", tm.getExtent(), + state); + } + } + + if (actions.contains(ManagementAction.BAD_STATE) && tm.isFutureAndCurrentLocationSet()) { + throw new BadLocationStateException( + tm.getExtent() + " is both assigned and hosted, which should never happen: " + this, + tm.getExtent().toMetaRow()); + } + + final Location location = tm.getLocation(); + Location current = null; + Location future = null; + if (tm.hasCurrent()) { + current = tm.getLocation(); + } else { + future = tm.getLocation(); + } + TabletLogger.missassigned(tm.getExtent(), goal.toString(), state.toString(), + future != null ? future.getServerInstance() : null, + current != null ? current.getServerInstance() : null, tm.getLogs().size()); + + if (isFullScan) { + stats.update(tableId, state); + } + + if (Manager.log.isTraceEnabled()) { + Manager.log.trace( + "[{}] Shutting down all Tservers: {}, dependentCount: {} Extent: {}, state: {}, goal: {} actions:{} #wals:{}", + store.name(), tableMgmtParams.getServersToShutdown().equals(currentTServers.keySet()), + dependentWatcher == null ? 
"null" : dependentWatcher.assignedOrHosted(), tm.getExtent(), + state, goal, actions, tm.getLogs().size()); + } + + if (actions.contains(ManagementAction.NEEDS_SPLITTING)) { + LOG.debug("{} may need splitting.", tm.getExtent()); + manager.getSplitter().initiateSplit(new SeedSplitTask(manager, tm.getExtent())); + } + + if (actions.contains(ManagementAction.NEEDS_COMPACTING)) { + var jobs = compactionGenerator.generateJobs(tm, + TabletManagementIterator.determineCompactionKinds(actions)); + LOG.debug("{} may need compacting adding {} jobs", tm.getExtent(), jobs.size()); + manager.getCompactionCoordinator().addJobs(tm, jobs); + } - // if we are shutting down all the tabletservers, we have to do it in order - if ((goal == TabletGoalState.SUSPENDED && state == TabletState.HOSTED) - && manager.serversToShutdown.equals(currentTServers.keySet())) { - if (dependentWatcher != null) { - // If the dependentWatcher is for the user tables, check to see - // that user tables exist. - DataLevel dependentLevel = dependentWatcher.store.getLevel(); - boolean userTablesExist = true; - switch (dependentLevel) { - case USER: - Set<TableId> onlineTables = manager.onlineTables(); - onlineTables.remove(AccumuloTable.ROOT.tableId()); - onlineTables.remove(AccumuloTable.METADATA.tableId()); - userTablesExist = !onlineTables.isEmpty(); - break; - case METADATA: - case ROOT: - default: - break; + // ELASITICITY_TODO the case where a planner generates compactions at time T1 for tablet + // and later at time T2 generates nothing for the same tablet is not being handled. At + // time T1 something could have been queued. However at time T2 we will not clear those + // entries from the queue because we see nothing here for that case. After a full + // metadata scan could remove any tablets that were not updated during the scan. + + if (actions.contains(ManagementAction.NEEDS_LOCATION_UPDATE) + || actions.contains(ManagementAction.NEEDS_RECOVERY)) { + + if (tm.getLocation() != null) { + filteredServersToShutdown.remove(tm.getLocation().getServerInstance()); + } + + if (goal == TabletGoalState.HOSTED) { + + // RecoveryManager.recoverLogs will return false when all of the logs + // have been sorted so that recovery can occur. Delay the hosting of + // the Tablet until the sorting is finished. + if ((state != TabletState.HOSTED && actions.contains(ManagementAction.NEEDS_RECOVERY)) + && manager.recoveryManager.recoverLogs(tm.getExtent(), tm.getLogs())) { + LOG.debug("Not hosting {} as it needs recovery, logs: {}", tm.getExtent(), + tm.getLogs().size()); + continue; + } + switch (state) { + case HOSTED: + if (location.getServerInstance().equals(manager.migrations.get(tm.getExtent()))) { + manager.migrations.remove(tm.getExtent()); } - // If the stats object in the dependentWatcher is empty, then it - // currently does not have data about what is hosted or not. In - // that case host these tablets until the dependent watcher can - // gather some data. 
- final Map<TableId,TableCounts> stats = dependentWatcher.getStats(); - if (dependentLevel == DataLevel.USER) { - if (userTablesExist - && (stats == null || stats.isEmpty() || assignedOrHosted(stats) > 0)) { - goal = TabletGoalState.HOSTED; - } - } else if (stats == null || stats.isEmpty() || assignedOrHosted(stats) > 0) { - goal = TabletGoalState.HOSTED; + break; + case ASSIGNED_TO_DEAD_SERVER: + hostDeadTablet(tLists, tm, location); + break; + case SUSPENDED: + hostSuspendedTablet(tLists, tm, location, tableConf); + break; + case UNASSIGNED: + hostUnassignedTablet(tLists, tm.getExtent(), + new UnassignedTablet(location, tm.getLast())); + break; + case ASSIGNED: + // Send another reminder + tLists.assigned.add(new Assignment(tm.getExtent(), + future != null ? future.getServerInstance() : null, tm.getLast())); + break; + default: + break; + } + } else { + switch (state) { + case SUSPENDED: + // Request a move to UNASSIGNED, so as to allow balancing to continue. + tLists.suspendedToGoneServers.add(tm); + cancelOfflineTableMigrations(tm.getExtent()); + break; + case UNASSIGNED: + cancelOfflineTableMigrations(tm.getExtent()); + break; + case ASSIGNED_TO_DEAD_SERVER: + unassignDeadTablet(tLists, tm); + break; + case HOSTED: + TServerConnection client = + manager.tserverSet.getConnection(location.getServerInstance()); + if (client != null) { - LOG.debug("Requesting tserver {} unload tablet {}", location.getServerInstance(), - tm.getExtent()); ++ TABLET_UNLOAD_LOGGER.trace("[{}] Requesting TabletServer {} unload {} {}", ++ store.name(), location.getServerInstance(), tm.getExtent(), goal.howUnload()); + client.unloadTablet(manager.managerLock, tm.getExtent(), goal.howUnload(), + manager.getSteadyTime().getMillis()); + tableMgmtStats.totalUnloaded++; + unloaded++; + } else { + Manager.log.warn("Could not connect to server {}", location); } - } + break; + case ASSIGNED: + break; } + } + } + } - if (goal == TabletGoalState.HOSTED) { - if ((state != TabletState.HOSTED && !tls.walogs.isEmpty()) - && manager.recoveryManager.recoverLogs(tls.extent, tls.walogs)) { - continue; - } - switch (state) { - case HOSTED: - if (location.getServerInstance().equals(manager.migrations.get(tls.extent))) { - manager.migrations.remove(tls.extent); - } - break; - case ASSIGNED_TO_DEAD_SERVER: - hostDeadTablet(tLists, tls, location, wals); - break; - case SUSPENDED: - hostSuspendedTablet(tLists, tls, location, tableConf); - break; - case UNASSIGNED: - hostUnassignedTablet(tLists, tls.extent, new UnassignedTablet(location, tls.last)); - break; - case ASSIGNED: - // Send another reminder - tLists.assigned.add(new Assignment(tls.extent, tls.getFutureServer(), tls.last)); - break; - } - } else { - switch (state) { - case SUSPENDED: - // Request a move to UNASSIGNED, so as to allow balancing to continue. 
- tLists.suspendedToGoneServers.add(tls); - cancelOfflineTableMigrations(tls.extent); - break; - case UNASSIGNED: - cancelOfflineTableMigrations(tls.extent); - break; - case ASSIGNED_TO_DEAD_SERVER: - unassignDeadTablet(tLists, tls, wals); - break; - case HOSTED: - TServerConnection client = - manager.tserverSet.getConnection(location.getServerInstance()); - if (client != null) { - try { - TABLET_UNLOAD_LOGGER.trace("[{}] Requesting TabletServer {} unload {} {}", - store.name(), location.getServerInstance(), tls.extent, goal.howUnload()); - client.unloadTablet(manager.managerLock, tls.extent, goal.howUnload(), - manager.getSteadyTime().getMillis()); - unloaded++; - totalUnloaded++; - } catch (TException tException) { - Manager.log.warn("[{}] Failed to request tablet unload {} {} {}", store.name(), - location.getServerInstance(), tls.extent, goal.howUnload(), tException); - } - } else { - Manager.log.warn("Could not connect to server {}", location); - } - break; - case ASSIGNED: - break; - } + flushChanges(tLists); + + if (isFullScan) { + this.filteredServersToShutdown = Set.copyOf(filteredServersToShutdown); + } + + return tableMgmtStats; + } + + private SortedMap<TServerInstance,TabletServerStatus> + getCurrentTservers(Set<TServerInstance> onlineTservers) { + // Get the current status for the current list of tservers + final SortedMap<TServerInstance,TabletServerStatus> currentTServers = new TreeMap<>(); + for (TServerInstance entry : onlineTservers) { + currentTServers.put(entry, manager.tserverStatus.get(entry)); + } + return currentTServers; + } + + @Override + public void run() { + int[] oldCounts = new int[TabletState.values().length]; + boolean lookForTabletsNeedingVolReplacement = true; + + while (manager.stillManager()) { + if (!eventHandler.isNeedsFullScan()) { + // If an event handled by the EventHandler.RangeProcessor indicated + // that we need to do a full scan, then do it. Otherwise wait a bit + // before re-checking the tablets. + sleepUninterruptibly(100, TimeUnit.MILLISECONDS); + } + + final long waitTimeBetweenScans = manager.getConfiguration() + .getTimeInMillis(Property.MANAGER_TABLET_GROUP_WATCHER_INTERVAL); + + TabletManagementParameters tableMgmtParams = + createTabletManagementParameters(lookForTabletsNeedingVolReplacement); + var currentTServers = getCurrentTservers(tableMgmtParams.getOnlineTsevers()); + + ClosableIterator<TabletManagement> iter = null; + try { + if (currentTServers.isEmpty()) { + eventHandler.waitForFullScan(waitTimeBetweenScans); + synchronized (this) { + lastScanServers = Collections.emptySortedSet(); } - counts[state.ordinal()]++; + continue; } - flushChanges(tLists, wals); + stats.begin(); + + ManagerState managerState = tableMgmtParams.getManagerState(); + + // Clear the need for a full scan before starting a full scan inorder to detect events that + // happen during the full scan. + eventHandler.clearNeedsFullScan(); + + iter = store.iterator(tableMgmtParams); + manager.getCompactionCoordinator().getJobQueues().beginFullScan(store.getLevel()); + var tabletMgmtStats = manageTablets(iter, tableMgmtParams, currentTServers, true); + manager.getCompactionCoordinator().getJobQueues().endFullScan(store.getLevel()); + + // If currently looking for volume replacements, determine if the next round needs to look. + if (lookForTabletsNeedingVolReplacement) { + // Continue to look for tablets needing volume replacement if there was an error + // processing tablets in the call to manageTablets() or if we are still performing volume + // replacement. 
We only want to stop looking for tablets that need volume replacement when + // we have successfully processed all tablet metadata and no more volume replacements are + // being performed. + lookForTabletsNeedingVolReplacement = tabletMgmtStats.totalVolumeReplacements != 0 + || tabletMgmtStats.tabletsWithErrors != 0; + } // provide stats after flushing changes to avoid race conditions w/ delete table stats.end(managerState); diff --cc server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java index 573e3c49f0,4ea148046b..0747a10867 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java @@@ -21,11 -21,13 +21,12 @@@ package org.apache.accumulo.tserver.tab import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly; import static java.nio.charset.StandardCharsets.UTF_8; import static java.util.stream.Collectors.toList; +import static org.apache.accumulo.core.util.LazySingletons.RANDOM; -import java.io.ByteArrayInputStream; -import java.io.DataInputStream; import java.io.FileNotFoundException; import java.io.IOException; -import java.lang.ref.SoftReference; +import java.io.UncheckedIOException; + import java.time.Duration; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@@ -60,8 -67,11 +61,9 @@@ import org.apache.accumulo.core.dataImp import org.apache.accumulo.core.file.FilePrefix; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; import org.apache.accumulo.core.iteratorsImpl.system.SourceSwitchingIterator; + import org.apache.accumulo.core.logging.ConditionalLogger.DeduplicatingLogger; import org.apache.accumulo.core.logging.TabletLogger; import org.apache.accumulo.core.manager.state.tables.TableState; -import org.apache.accumulo.core.manager.thrift.BulkImportState; import org.apache.accumulo.core.metadata.AccumuloTable; import org.apache.accumulo.core.metadata.ReferencedTabletFile; import org.apache.accumulo.core.metadata.StoredTabletFile;
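
The new logging utilities referenced in the hunks above (EscalatingLogger in TabletGroupWatcher, DeduplicatingLogger in Tablet) come from the ConditionalLogger class added by this merge; only its diffstat entry appears in this message, not its body. As a rough, hypothetical sketch of the general pattern only -- a wrapper that normally logs at TRACE but escalates a message to a configured level at most once per interval -- the following assumes the SLF4J 2.x fluent API. The class name, constructor shape, and method names below are invented for illustration and are not the committed implementation.

    // Illustrative sketch only -- NOT the ConditionalLogger added by this commit.
    // Assumes the SLF4J 2.x fluent API (Logger.atLevel / LoggingEventBuilder).
    import java.time.Duration;
    import java.util.concurrent.atomic.AtomicLong;

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.slf4j.event.Level;

    public class EscalatingLoggerSketch {

      private final Logger delegate;
      private final long intervalMillis;
      private final Level escalatedLevel;
      private final AtomicLong lastEscalation = new AtomicLong(0);

      public EscalatingLoggerSketch(Logger delegate, Duration interval, Level escalatedLevel) {
        this.delegate = delegate;
        this.intervalMillis = interval.toMillis();
        this.escalatedLevel = escalatedLevel;
      }

      // Log at TRACE normally, but escalate to the configured level at most once per
      // interval, so a high-volume message stays visible without flooding the INFO log.
      public void trace(String format, Object... args) {
        long now = System.currentTimeMillis();
        long last = lastEscalation.get();
        if (now - last >= intervalMillis && lastEscalation.compareAndSet(last, now)) {
          delegate.atLevel(escalatedLevel).log(format, args);
        } else {
          delegate.atTrace().log(format, args);
        }
      }

      public static void main(String[] args) {
        Logger log = LoggerFactory.getLogger("example");
        EscalatingLoggerSketch unloadLog =
            new EscalatingLoggerSketch(log, Duration.ofMinutes(5), Level.INFO);
        unloadLog.trace("Requesting TabletServer {} unload {}", "ts1:9997", "2;a;b");
      }
    }

In the diff above, the real EscalatingLogger is constructed as new EscalatingLogger(Manager.log, Duration.ofMinutes(5), 1000, Level.INFO) and used via TABLET_UNLOAD_LOGGER.trace(...), which appears to serve the same intent: keep repeated tablet-unload requests at TRACE while still surfacing them to the manager log periodically.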