This is an automated email from the ASF dual-hosted git repository. ddanielr pushed a commit to branch elasticity in repository https://gitbox.apache.org/repos/asf/accumulo.git
commit d4249cfb18fe03a93667603b4d5eafd3d44c6164 Merge: 24ead4b4bd 2cab146529 Author: Daniel Roberts <ddani...@gmail.com> AuthorDate: Sun Jul 28 19:48:58 2024 +0000 Merge branch 'main' into elasticity .../accumulo/core/metrics/MetricsProducer.java | 10 +++++ .../java/org/apache/accumulo/manager/Manager.java | 9 ++++ .../accumulo/manager/metrics/BalancerMetrics.java | 51 ++++++++++++++++++++++ .../java/org/apache/accumulo/test/BalanceIT.java | 14 ++++++ .../BalanceInPresenceOfOfflineTableIT.java | 2 + .../apache/accumulo/test/metrics/MetricsIT.java | 1 + 6 files changed, 87 insertions(+) diff --cc core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java index 08d8bc6514,f742ac460b..7383ee9e3e --- a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java +++ b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java @@@ -617,27 -581,14 +617,35 @@@ import io.micrometer.core.instrument.Me * <td>Distribution Summary</td> * <td></td> * </tr> + * <tr> + * <td>N/A</td> + * <td>N/A</td> + * <td>{@link #METRICS_MANAGER_ROOT_TGW_ERRORS}</td> + * <td>Gauge</td> + * <td></td> + * </tr> + * <tr> + * <td>N/A</td> + * <td>N/A</td> + * <td>{@link #METRICS_MANAGER_META_TGW_ERRORS}</td> + * <td>Gauge</td> + * <td></td> + * </tr> + * <tr> + * <td>N/A</td> + * <td>N/A</td> + * <td>{@link #METRICS_MANAGER_USER_TGW_ERRORS}</td> + * <td>Gauge</td> + * <td></td> + * </tr> + * <!-- Balancing --> + * <tr> + * <td>N/A</td> + * <td>N/A</td> + * <td>{@value METRICS_MANAGER_BALANCER_MIGRATIONS_NEEDED}</td> + * <td>Gauge</td> + * <td>The number of migrations that need to complete before the system is balanced</td> + * </tr> * </table> * * @since 2.1.0 diff --cc server/manager/src/main/java/org/apache/accumulo/manager/Manager.java index 21e4d271d4,a7fb4a3a9d..260f1823f5 --- a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java @@@ -120,10 -113,9 +120,11 @@@ import org.apache.accumulo.core.util.th import org.apache.accumulo.core.util.threads.Threads; import org.apache.accumulo.core.util.time.NanoTime; import org.apache.accumulo.core.util.time.SteadyTime; +import org.apache.accumulo.manager.compaction.coordinator.CompactionCoordinator; + import org.apache.accumulo.manager.metrics.BalancerMetrics; import org.apache.accumulo.manager.metrics.ManagerMetrics; import org.apache.accumulo.manager.recovery.RecoveryManager; +import org.apache.accumulo.manager.split.Splitter; import org.apache.accumulo.manager.state.TableCounts; import org.apache.accumulo.manager.tableOps.TraceRepo; import org.apache.accumulo.manager.upgrade.PreUpgradeValidation; @@@ -215,8 -208,9 +216,9 @@@ public class Manager extends AbstractSe ServiceLock managerLock = null; private TServer clientService = null; - private volatile TabletBalancer tabletBalancer; + protected volatile TabletBalancer tabletBalancer; private final BalancerEnvironment balancerEnvironment; + private final BalancerMetrics balancerMetrics = new BalancerMetrics(); private ManagerState state = ManagerState.INITIAL; @@@ -536,28 -570,122 +538,32 @@@ } } - public MetricsProducer getBalancerMetrics() { - return balancerMetrics; - } - - enum TabletGoalState { - HOSTED(TUnloadTabletGoal.UNKNOWN), - UNASSIGNED(TUnloadTabletGoal.UNASSIGNED), - DELETED(TUnloadTabletGoal.DELETED), - SUSPENDED(TUnloadTabletGoal.SUSPENDED); - - private final TUnloadTabletGoal unloadGoal; + private Splitter splitter; - TabletGoalState(TUnloadTabletGoal unloadGoal) { - this.unloadGoal = unloadGoal; - } - - /** The purpose of unloading this tablet. */ - public TUnloadTabletGoal howUnload() { - return unloadGoal; - } + public Splitter getSplitter() { + return splitter; } - TabletGoalState getSystemGoalState(TabletLocationState tls) { - switch (getManagerState()) { - case NORMAL: - return TabletGoalState.HOSTED; - case HAVE_LOCK: // fall-through intended - case INITIAL: // fall-through intended - case SAFE_MODE: - if (tls.extent.isMeta()) { - return TabletGoalState.HOSTED; - } - return TabletGoalState.UNASSIGNED; - case UNLOAD_METADATA_TABLETS: - if (tls.extent.isRootTablet()) { - return TabletGoalState.HOSTED; - } - return TabletGoalState.UNASSIGNED; - case UNLOAD_ROOT_TABLET: - case STOP: - return TabletGoalState.UNASSIGNED; - default: - throw new IllegalStateException("Unknown Manager State"); - } ++ public MetricsProducer getBalancerMetrics() { ++ return balancerMetrics; + } + - TabletGoalState getTableGoalState(KeyExtent extent) { - TableState tableState = getContext().getTableManager().getTableState(extent.tableId()); - if (tableState == null) { - return TabletGoalState.DELETED; - } - switch (tableState) { - case DELETING: - return TabletGoalState.DELETED; - case OFFLINE: - case NEW: - return TabletGoalState.UNASSIGNED; - default: - return TabletGoalState.HOSTED; - } + public UpgradeCoordinator.UpgradeStatus getUpgradeStatus() { + return upgradeCoordinator.getStatus(); } - TabletGoalState getGoalState(TabletLocationState tls, MergeInfo mergeInfo) { - KeyExtent extent = tls.extent; - // Shutting down? - TabletGoalState state = getSystemGoalState(tls); - if (state == TabletGoalState.HOSTED) { - if (!upgradeCoordinator.getStatus().isParentLevelUpgraded(extent)) { - // The place where this tablet stores its metadata was not upgraded, so do not assign this - // tablet yet. - return TabletGoalState.UNASSIGNED; - } - - if (tls.current != null && serversToShutdown.contains(tls.current.getServerInstance())) { - return TabletGoalState.SUSPENDED; - } - // Handle merge transitions - if (mergeInfo.getExtent() != null) { + public CompactionCoordinator getCompactionCoordinator() { + return compactionCoordinator; + } - final boolean overlaps = mergeInfo.overlaps(extent); + public void hostOndemand(List<KeyExtent> extents) { + extents.forEach(e -> Preconditions.checkArgument(DataLevel.of(e.tableId()) == DataLevel.USER)); - if (overlaps) { - log.debug("mergeInfo overlaps: {} true", extent); - switch (mergeInfo.getState()) { - case NONE: - case COMPLETE: - break; - case STARTED: - return TabletGoalState.HOSTED; - case WAITING_FOR_OFFLINE: - // If we have walogs we need to be HOSTED to recover - if (!tls.walogs.isEmpty()) { - return TabletGoalState.HOSTED; - } else { - return TabletGoalState.UNASSIGNED; - } - case MERGING: - case MERGED: - return TabletGoalState.UNASSIGNED; - } - } else { - log.trace("mergeInfo overlaps: {} false", extent); - } - } - - // taking table offline? - state = getTableGoalState(extent); - if (state == TabletGoalState.HOSTED) { - // Maybe this tablet needs to be migrated - TServerInstance dest = migrations.get(extent); - if (dest != null && tls.current != null && !dest.equals(tls.current.getServerInstance())) { - return TabletGoalState.UNASSIGNED; - } + for (var watcher : watchers) { + if (watcher.getLevel() == DataLevel.USER) { + watcher.hostOndemand(extents); } } - return state; } private class MigrationCleanupThread implements Runnable { @@@ -1140,9 -1257,10 +1147,11 @@@ } MetricsInfo metricsInfo = getContext().getMetricsInfo(); - metricsInfo.addServiceTags(getApplicationName(), sa.getAddress()); - - var producers = ManagerMetrics.getProducers(getConfiguration(), this); + metricsInfo.addServiceTags(getApplicationName(), sa.getAddress(), getResourceGroup()); + ManagerMetrics managerMetrics = new ManagerMetrics(getConfiguration(), this); + var producers = managerMetrics.getProducers(getConfiguration(), this); + producers.add(balancerMetrics); ++ metricsInfo.addMetricsProducers(producers.toArray(new MetricsProducer[0])); metricsInfo.init(); diff --cc test/src/main/java/org/apache/accumulo/test/BalanceIT.java index 27ae709699,0164463903..231d50fb4a --- a/test/src/main/java/org/apache/accumulo/test/BalanceIT.java +++ b/test/src/main/java/org/apache/accumulo/test/BalanceIT.java @@@ -33,6 -37,20 +37,16 @@@ import org.slf4j.LoggerFactory public class BalanceIT extends AccumuloClusterHarness { private static final Logger log = LoggerFactory.getLogger(BalanceIT.class); + @Override + public void configureMiniCluster(MiniAccumuloConfigImpl cfg, Configuration hadoopCoreSite) { + Map<String,String> siteConfig = cfg.getSiteConfig(); + siteConfig.put(Property.TSERV_MAXMEM.getKey(), "10K"); - siteConfig.put(Property.TSERV_MAJC_DELAY.getKey(), "50ms"); + siteConfig.put(Property.GENERAL_MICROMETER_ENABLED.getKey(), "true"); + siteConfig.put("general.custom.metrics.opts.logging.step", "0.5s"); + cfg.setSiteConfig(siteConfig); - // ensure we have two tservers - if (cfg.getNumTservers() < 2) { - cfg.setNumTservers(2); - } ++ cfg.getClusterServerConfiguration().setNumDefaultTabletServers(2); + } + @Override protected Duration defaultTimeout() { return Duration.ofMinutes(1); diff --cc test/src/main/java/org/apache/accumulo/test/functional/BalanceInPresenceOfOfflineTableIT.java index 46cb89ba7b,c03ad5c03d..f38a6d8bff --- a/test/src/main/java/org/apache/accumulo/test/functional/BalanceInPresenceOfOfflineTableIT.java +++ b/test/src/main/java/org/apache/accumulo/test/functional/BalanceInPresenceOfOfflineTableIT.java @@@ -71,11 -67,13 +71,13 @@@ public class BalanceInPresenceOfOffline public void configureMiniCluster(MiniAccumuloConfigImpl cfg, Configuration hadoopCoreSite) { Map<String,String> siteConfig = cfg.getSiteConfig(); siteConfig.put(Property.TSERV_MAXMEM.getKey(), "10K"); - siteConfig.put(Property.TSERV_MAJC_DELAY.getKey(), "50ms"); + siteConfig.put(Property.GENERAL_MICROMETER_ENABLED.getKey(), "true"); + siteConfig.put("general.custom.metrics.opts.logging.step", "0.5s"); cfg.setSiteConfig(siteConfig); // ensure we have two tservers - if (cfg.getNumTservers() < 2) { - cfg.setNumTservers(2); + if (cfg.getClusterServerConfiguration().getTabletServerConfiguration() + .get(Constants.DEFAULT_RESOURCE_GROUP_NAME) < 2) { + cfg.getClusterServerConfiguration().setNumDefaultTabletServers(2); } }