This is an automated email from the ASF dual-hosted git repository. dlmarion pushed a commit to branch elasticity in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/elasticity by this push: new 07313e3b41 Enable reporting of TabletManagementIterator errors (#3872) 07313e3b41 is described below commit 07313e3b41539aab327db79cd3348dc3a00684c4 Author: Dave Marion <dlmar...@apache.org> AuthorDate: Tue Oct 24 16:00:51 2023 -0400 Enable reporting of TabletManagementIterator errors (#3872) Enable the TabletManagementIterator to report error's back to the TabletGroupWatcher when trying to determine actions that should be taken for an extent. Increment metric gauges when an error occurs. Fixes #3469 --- .../core/manager/state/TabletManagement.java | 39 +++++++++++++++++++--- .../accumulo/core/metrics/MetricsProducer.java | 26 +++++++++++++++ .../manager/state/TabletManagementIterator.java | 36 +++++++++++++------- .../manager/state/TabletManagementScanner.java | 4 +-- .../server/manager/state/ZooTabletStateStore.java | 25 ++++++++------ .../server/manager/state/TabletManagementTest.java | 20 +++++++++++ .../gc/GarbageCollectWriteAheadLogsTest.java | 6 ++-- .../java/org/apache/accumulo/manager/Manager.java | 8 ++--- .../accumulo/manager/TabletGroupWatcher.java | 19 +++++++++-- .../accumulo/manager/metrics/ManagerMetrics.java | 30 +++++++++++++++++ .../apache/accumulo/test/metrics/MetricsIT.java | 4 ++- 11 files changed, 179 insertions(+), 38 deletions(-) diff --git a/core/src/main/java/org/apache/accumulo/core/manager/state/TabletManagement.java b/core/src/main/java/org/apache/accumulo/core/manager/state/TabletManagement.java index a687de873f..647f6f9412 100644 --- a/core/src/main/java/org/apache/accumulo/core/manager/state/TabletManagement.java +++ b/core/src/main/java/org/apache/accumulo/core/manager/state/TabletManagement.java @@ -47,6 +47,7 @@ public class TabletManagement { ColumnType.HOSTING_GOAL, ColumnType.HOSTING_REQUESTED, ColumnType.FILES, ColumnType.LAST, ColumnType.OPID, ColumnType.ECOMP, ColumnType.DIR, ColumnType.SELECTED); + private static final Text ERROR_COLUMN_NAME = new Text("ERROR"); private static final Text REASONS_COLUMN_NAME = new Text("REASONS"); private static final Text EMPTY = new Text(""); @@ -62,12 +63,20 @@ public class TabletManagement { decodedRow.put(reasonsKey, reasonsValue); } - public final Set<ManagementAction> actions; - public final TabletMetadata tabletMetadata; + public static void addError(final SortedMap<Key,Value> decodedRow, final Exception error) { + final Key errorKey = new Key(decodedRow.firstKey().getRow(), ERROR_COLUMN_NAME, EMPTY); + final Value errorValue = new Value(error.getMessage()); + decodedRow.put(errorKey, errorValue); + } + + private final Set<ManagementAction> actions; + private final TabletMetadata tabletMetadata; + private final String errorMessage; - public TabletManagement(Set<ManagementAction> actions, TabletMetadata tm) { + public TabletManagement(Set<ManagementAction> actions, TabletMetadata tm, String errorMessage) { this.actions = actions; this.tabletMetadata = tm; + this.errorMessage = errorMessage; } public TabletManagement(Key wholeRowKey, Value wholeRowValue) throws IOException { @@ -77,9 +86,21 @@ public class TabletManagement { public TabletManagement(Key wholeRowKey, Value wholeRowValue, boolean saveKV) throws IOException { final SortedMap<Key,Value> decodedRow = WholeRowIterator.decodeRow(wholeRowKey, wholeRowValue); Text row = decodedRow.firstKey().getRow(); - Value val = decodedRow.remove(new Key(row, REASONS_COLUMN_NAME, EMPTY)); + // Decode any errors that happened on the TabletServer + Value errorValue = decodedRow.remove(new Key(row, ERROR_COLUMN_NAME, EMPTY)); + if (errorValue != null) { + this.errorMessage = errorValue.toString(); + } else { + this.errorMessage = null; + } + // Decode the ManagementActions if it exists + Value actionValue = decodedRow.remove(new Key(row, REASONS_COLUMN_NAME, EMPTY)); Set<ManagementAction> actions = new HashSet<>(); - Splitter.on(',').split(val.toString()).forEach(a -> actions.add(ManagementAction.valueOf(a))); + if (actionValue != null) { + Splitter.on(',').split(actionValue.toString()) + .forEach(a -> actions.add(ManagementAction.valueOf(a))); + } + TabletMetadata tm = TabletMetadata.convertRow(decodedRow.entrySet().iterator(), CONFIGURED_COLUMNS, saveKV, true); this.actions = actions; @@ -94,6 +115,14 @@ public class TabletManagement { return tabletMetadata; } + /** + * @return exception message if an exception was thrown while computing this tablets management + * actions OR null if no exception was seen + */ + public String getErrorMessage() { + return errorMessage; + } + @Override public String toString() { return actions.toString() + "," + tabletMetadata.toString(); diff --git a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java index 0c66ce5411..021c045cc0 100644 --- a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java +++ b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java @@ -628,6 +628,27 @@ import io.micrometer.core.instrument.MeterRegistry; * <td>Counter</td> * <td></td> * </tr> + * <tr> + * <td>N/A</td> + * <td>N/A</td> + * <td>{@link #METRICS_MANAGER_ROOT_TGW_ERRORS}</td> + * <td>Gauge</td> + * <td></td> + * </tr> + * <tr> + * <td>N/A</td> + * <td>N/A</td> + * <td>{@link #METRICS_MANAGER_META_TGW_ERRORS}</td> + * <td>Gauge</td> + * <td></td> + * </tr> + * <tr> + * <td>N/A</td> + * <td>N/A</td> + * <td>{@link #METRICS_MANAGER_USER_TGW_ERRORS}</td> + * <td>Gauge</td> + * <td></td> + * </tr> * </table> * * @since 2.1.0 @@ -674,6 +695,11 @@ public interface MetricsProducer { String METRICS_GC_POST_OP_DURATION = METRICS_GC_PREFIX + "post.op.duration"; String METRICS_GC_RUN_CYCLE = METRICS_GC_PREFIX + "run.cycle"; + String METRICS_MANAGER_PREFIX = "accumulo.manager."; + String METRICS_MANAGER_ROOT_TGW_ERRORS = METRICS_MANAGER_PREFIX + "tabletmgmt.root.errors"; + String METRICS_MANAGER_META_TGW_ERRORS = METRICS_MANAGER_PREFIX + "tabletmgmt.meta.errors"; + String METRICS_MANAGER_USER_TGW_ERRORS = METRICS_MANAGER_PREFIX + "tabletmgmt.user.errors"; + String METRICS_MAJC_PREFIX = "accumulo.tserver.compactions.majc."; String METRICS_MAJC_QUEUED = METRICS_MAJC_PREFIX + "queued"; String METRICS_MAJC_RUNNING = METRICS_MAJC_PREFIX + "running"; diff --git a/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementIterator.java b/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementIterator.java index fcc94f2169..e0787dbee1 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementIterator.java +++ b/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementIterator.java @@ -429,20 +429,32 @@ public class TabletManagementIterator extends SkippingIterator { TabletManagement.CONFIGURED_COLUMNS, false, true); actions.clear(); - if (managerState != ManagerState.NORMAL || current.isEmpty() || onlineTables.isEmpty()) { - // when manager is in the process of starting up or shutting down return everything. - actions.add(ManagementAction.NEEDS_LOCATION_UPDATE); - } else { - LOG.trace("Evaluating extent: {}", tm); - computeTabletManagementActions(tm, actions); + Exception error = null; + try { + if (managerState != ManagerState.NORMAL || current.isEmpty() || onlineTables.isEmpty()) { + // when manager is in the process of starting up or shutting down return everything. + actions.add(ManagementAction.NEEDS_LOCATION_UPDATE); + } else { + LOG.trace("Evaluating extent: {}", tm); + computeTabletManagementActions(tm, actions); + } + } catch (Exception e) { + LOG.error("Error computing tablet management actions for extent: {}", tm.getExtent(), e); + error = e; } - if (!actions.isEmpty()) { - // If we simply returned here, then the client would get the encoded K,V - // from the WholeRowIterator. However, it would not know the reason(s) why - // it was returned. Insert a K,V pair to represent the reasons. The client - // can pull this K,V pair from the results by looking at the colf. - TabletManagement.addActions(decodedRow, actions); + if (!actions.isEmpty() || error != null) { + if (error != null) { + // Insert the error into K,V pair representing + // the tablet metadata. + TabletManagement.addError(decodedRow, error); + } else if (!actions.isEmpty()) { + // If we simply returned here, then the client would get the encoded K,V + // from the WholeRowIterator. However, it would not know the reason(s) why + // it was returned. Insert a K,V pair to represent the reasons. The client + // can pull this K,V pair from the results by looking at the colf. + TabletManagement.addActions(decodedRow, actions); + } topKey = decodedRow.firstKey(); topValue = WholeRowIterator.encodeRow(new ArrayList<>(decodedRow.keySet()), new ArrayList<>(decodedRow.values())); diff --git a/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementScanner.java b/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementScanner.java index 253d6e4f45..e34d16304c 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementScanner.java +++ b/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementScanner.java @@ -132,9 +132,9 @@ public class TabletManagementScanner implements ClosableIterator<TabletManagemen Entry<Key,Value> e = iter.next(); try { TabletManagement tm = TabletManagementIterator.decode(e); - log.trace("Returning metadata tablet, extent: {}, hostingGoal: {}, actions: {}", + log.trace("Returning metadata tablet, extent: {}, hostingGoal: {}, actions: {}, error: {}", tm.getTabletMetadata().getExtent(), tm.getTabletMetadata().getHostingGoal(), - tm.getActions()); + tm.getActions(), tm.getErrorMessage()); return tm; } catch (IOException e1) { throw new RuntimeException("Error creating TabletMetadata object", e1); diff --git a/server/base/src/main/java/org/apache/accumulo/server/manager/state/ZooTabletStateStore.java b/server/base/src/main/java/org/apache/accumulo/server/manager/state/ZooTabletStateStore.java index 7fa5158da5..c88847c536 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/manager/state/ZooTabletStateStore.java +++ b/server/base/src/main/java/org/apache/accumulo/server/manager/state/ZooTabletStateStore.java @@ -73,19 +73,24 @@ class ZooTabletStateStore extends AbstractTabletStateStore implements TabletStat @Override public TabletManagement next() { finished = true; - TabletMetadata tm = ample.readTablet(RootTable.EXTENT, ReadConsistency.EVENTUAL); - var actions = EnumSet.of(ManagementAction.NEEDS_LOCATION_UPDATE); - - CompactionJobGenerator cjg = - new CompactionJobGenerator(new ServiceEnvironmentImpl(ctx), Map.of()); - var jobs = cjg.generateJobs(tm, - EnumSet.of(CompactionKind.SYSTEM, CompactionKind.USER, CompactionKind.SELECTOR)); - if (!jobs.isEmpty()) { - actions.add(ManagementAction.NEEDS_COMPACTING); + final var actions = EnumSet.of(ManagementAction.NEEDS_LOCATION_UPDATE); + final TabletMetadata tm = ample.readTablet(RootTable.EXTENT, ReadConsistency.EVENTUAL); + String error = null; + try { + CompactionJobGenerator cjg = + new CompactionJobGenerator(new ServiceEnvironmentImpl(ctx), Map.of()); + var jobs = cjg.generateJobs(tm, + EnumSet.of(CompactionKind.SYSTEM, CompactionKind.USER, CompactionKind.SELECTOR)); + if (!jobs.isEmpty()) { + actions.add(ManagementAction.NEEDS_COMPACTING); + } + } catch (Exception e) { + log.error("Error computing tablet management actions for Root extent", e); + error = e.getMessage(); } + return new TabletManagement(actions, tm, error); - return new TabletManagement(actions, tm); } @Override diff --git a/server/base/src/test/java/org/apache/accumulo/server/manager/state/TabletManagementTest.java b/server/base/src/test/java/org/apache/accumulo/server/manager/state/TabletManagementTest.java index b9fd8837d7..fe867e2e64 100644 --- a/server/base/src/test/java/org/apache/accumulo/server/manager/state/TabletManagementTest.java +++ b/server/base/src/test/java/org/apache/accumulo/server/manager/state/TabletManagementTest.java @@ -141,6 +141,26 @@ public class TabletManagementTest { assertEquals(actions, tmi.getActions()); } + @Test + public void testEncodeDecodeWithErrors() throws Exception { + KeyExtent extent = new KeyExtent(TableId.of("5"), new Text("df"), new Text("da")); + + final SortedMap<Key,Value> entries = createMetadataEntryKV(extent); + + TabletManagement.addError(entries, new UnsupportedOperationException("Not supported.")); + Key key = entries.firstKey(); + Value val = WholeRowIterator.encodeRow(new ArrayList<>(entries.keySet()), + new ArrayList<>(entries.values())); + + // Remove the ERROR column from the entries map for the comparison check + // below + entries.remove(new Key(key.getRow().toString(), "ERROR", "")); + + TabletManagement tmi = new TabletManagement(key, val, true); + assertEquals(entries, tmi.getTabletMetadata().getKeyValues()); + assertEquals("Not supported.", tmi.getErrorMessage()); + } + @Test public void testBinary() throws Exception { // test end row with non ascii data diff --git a/server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectWriteAheadLogsTest.java b/server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectWriteAheadLogsTest.java index 407ddc7589..f8e9c8b151 100644 --- a/server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectWriteAheadLogsTest.java +++ b/server/gc/src/test/java/org/apache/accumulo/gc/GarbageCollectWriteAheadLogsTest.java @@ -67,10 +67,12 @@ public class GarbageCollectWriteAheadLogsTest { try { tabletAssignedToServer1 = new TabletManagement(Set.of(), TabletMetadata.builder(extent).putLocation(Location.current(server1)) - .putHostingGoal(TabletHostingGoal.ALWAYS).build(LAST, SUSPEND, LOGS)); + .putHostingGoal(TabletHostingGoal.ALWAYS).build(LAST, SUSPEND, LOGS), + ""); tabletAssignedToServer2 = new TabletManagement(Set.of(), TabletMetadata.builder(extent).putLocation(Location.current(server2)) - .putHostingGoal(TabletHostingGoal.NEVER).build(LAST, SUSPEND, LOGS)); + .putHostingGoal(TabletHostingGoal.NEVER).build(LAST, SUSPEND, LOGS), + ""); } catch (Exception ex) { throw new RuntimeException(ex); } diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java index f10dbd25c1..60b2115d6e 100644 --- a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java @@ -1163,10 +1163,10 @@ public class Manager extends AbstractServer managerUpgrading.set(true); } + ManagerMetrics mm = new ManagerMetrics(getConfiguration(), this); try { MetricsUtil.initializeMetrics(getContext().getConfiguration(), this.applicationName, sa.getAddress()); - ManagerMetrics mm = new ManagerMetrics(getConfiguration(), this); MetricsUtil.initializeProducers(this, mm); } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | IllegalArgumentException | InvocationTargetException | NoSuchMethodException @@ -1221,7 +1221,7 @@ public class Manager extends AbstractServer this.splitter = new Splitter(context); this.splitter.start(); - watchers.add(new TabletGroupWatcher(this, this.userTabletStore, null) { + watchers.add(new TabletGroupWatcher(this, this.userTabletStore, null, mm) { @Override boolean canSuspendTablets() { // Always allow user data tablets to enter suspended state. @@ -1229,7 +1229,7 @@ public class Manager extends AbstractServer } }); - watchers.add(new TabletGroupWatcher(this, this.metadataTabletStore, watchers.get(0)) { + watchers.add(new TabletGroupWatcher(this, this.metadataTabletStore, watchers.get(0), mm) { @Override boolean canSuspendTablets() { // Allow metadata tablets to enter suspended state only if so configured. Generally @@ -1240,7 +1240,7 @@ public class Manager extends AbstractServer } }); - watchers.add(new TabletGroupWatcher(this, this.rootTabletStore, watchers.get(1)) { + watchers.add(new TabletGroupWatcher(this, this.rootTabletStore, watchers.get(1), mm) { @Override boolean canSuspendTablets() { // Never allow root tablet to enter suspended state. diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java index da6d4a4f9e..dece4f4497 100644 --- a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java @@ -72,6 +72,7 @@ import org.apache.accumulo.core.util.TextUtil; import org.apache.accumulo.core.util.threads.Threads; import org.apache.accumulo.core.util.threads.Threads.AccumuloDaemonThread; import org.apache.accumulo.manager.Manager.TabletGoalState; +import org.apache.accumulo.manager.metrics.ManagerMetrics; import org.apache.accumulo.manager.split.SplitTask; import org.apache.accumulo.manager.state.TableCounts; import org.apache.accumulo.manager.state.TableStats; @@ -122,14 +123,17 @@ abstract class TabletGroupWatcher extends AccumuloDaemonThread { final TableStats stats = new TableStats(); private SortedSet<TServerInstance> lastScanServers = Collections.emptySortedSet(); private final EventHandler eventHandler; + private final ManagerMetrics metrics; private WalStateManager walStateManager; - TabletGroupWatcher(Manager manager, TabletStateStore store, TabletGroupWatcher dependentWatcher) { + TabletGroupWatcher(Manager manager, TabletStateStore store, TabletGroupWatcher dependentWatcher, + ManagerMetrics metrics) { super("Watching " + store.name()); this.manager = manager; this.store = store; this.dependentWatcher = dependentWatcher; + this.metrics = metrics; this.walStateManager = new WalStateManager(manager.getContext()); this.eventHandler = new EventHandler(); manager.getEventCoordinator().addListener(store.getLevel(), eventHandler); @@ -336,9 +340,20 @@ abstract class TabletGroupWatcher extends AccumuloDaemonThread { throw new IllegalStateException("State store returned a null ManagerTabletInfo object"); } - final Set<ManagementAction> actions = mti.getActions(); final TabletMetadata tm = mti.getTabletMetadata(); + final String mtiError = mti.getErrorMessage(); + if (mtiError != null) { + // An error happened on the TabletServer in the TabletManagementIterator + // when trying to process this extent. + LOG.warn( + "Error on TabletServer trying to get Tablet management information for extent: {}. Error message: {}", + tm.getExtent(), mtiError); + this.metrics.incrementTabletGroupWatcherError(this.store.getLevel()); + continue; + } + + final Set<ManagementAction> actions = mti.getActions(); if (tm.isFutureAndCurrentLocationSet()) { throw new BadLocationStateException( tm.getExtent() + " is both assigned and hosted, which should never happen: " + this, diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/metrics/ManagerMetrics.java b/server/manager/src/main/java/org/apache/accumulo/manager/metrics/ManagerMetrics.java index cd3a52955d..0163e7bf62 100644 --- a/server/manager/src/main/java/org/apache/accumulo/manager/metrics/ManagerMetrics.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/metrics/ManagerMetrics.java @@ -20,9 +20,13 @@ package org.apache.accumulo.manager.metrics; import static java.util.Objects.requireNonNull; +import java.util.concurrent.atomic.AtomicLong; + import org.apache.accumulo.core.conf.AccumuloConfiguration; import org.apache.accumulo.core.conf.Property; +import org.apache.accumulo.core.metadata.schema.Ample.DataLevel; import org.apache.accumulo.core.metrics.MetricsProducer; +import org.apache.accumulo.core.metrics.MetricsUtil; import org.apache.accumulo.manager.Manager; import org.apache.accumulo.manager.metrics.fate.FateMetrics; @@ -33,6 +37,10 @@ public class ManagerMetrics implements MetricsProducer { private final FateMetrics fateMetrics; private final QueueMetrics queueMetrics; + private AtomicLong rootTGWErrorsGauge; + private AtomicLong metadataTGWErrorsGauge; + private AtomicLong userTGWErrorsGauge; + public ManagerMetrics(final AccumuloConfiguration conf, final Manager manager) { requireNonNull(conf, "AccumuloConfiguration must not be null"); requireNonNull(conf, "Manager must not be null"); @@ -41,9 +49,31 @@ public class ManagerMetrics implements MetricsProducer { queueMetrics = new QueueMetrics(manager.getCompactionQueues()); } + public void incrementTabletGroupWatcherError(DataLevel level) { + switch (level) { + case METADATA: + metadataTGWErrorsGauge.incrementAndGet(); + break; + case ROOT: + rootTGWErrorsGauge.incrementAndGet(); + break; + case USER: + userTGWErrorsGauge.incrementAndGet(); + break; + default: + throw new IllegalStateException("Unhandled DataLevel: " + level); + } + } + @Override public void registerMetrics(MeterRegistry registry) { fateMetrics.registerMetrics(registry); queueMetrics.registerMetrics(registry); + rootTGWErrorsGauge = registry.gauge(METRICS_MANAGER_ROOT_TGW_ERRORS, + MetricsUtil.getCommonTags(), new AtomicLong(0)); + metadataTGWErrorsGauge = registry.gauge(METRICS_MANAGER_META_TGW_ERRORS, + MetricsUtil.getCommonTags(), new AtomicLong(0)); + userTGWErrorsGauge = registry.gauge(METRICS_MANAGER_USER_TGW_ERRORS, + MetricsUtil.getCommonTags(), new AtomicLong(0)); } } diff --git a/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java b/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java index 9a59ecb00a..6aea6afa01 100644 --- a/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java +++ b/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java @@ -96,7 +96,9 @@ public class MetricsIT extends ConfigurableMacBase implements MetricsProducer { Set<String> unexpectedMetrics = Set.of(METRICS_SCAN_YIELDS, METRICS_UPDATE_ERRORS, METRICS_SCAN_BUSY_TIMEOUT, METRICS_SCAN_PAUSED_FOR_MEM, METRICS_SCAN_RETURN_FOR_MEM, - METRICS_MINC_PAUSED, METRICS_MAJC_PAUSED, METRICS_MAJC_QUEUED, METRICS_MAJC_RUNNING); + METRICS_MINC_PAUSED, METRICS_MAJC_PAUSED, METRICS_MAJC_QUEUED, METRICS_MAJC_RUNNING, + METRICS_MANAGER_ROOT_TGW_ERRORS, METRICS_MANAGER_META_TGW_ERRORS, + METRICS_MANAGER_USER_TGW_ERRORS); Set<String> flakyMetrics = Set.of(METRICS_GC_WAL_ERRORS, METRICS_FATE_TYPE_IN_PROGRESS, METRICS_PROPSTORE_EVICTION_COUNT, METRICS_PROPSTORE_REFRESH_COUNT, METRICS_PROPSTORE_REFRESH_LOAD_COUNT, METRICS_PROPSTORE_ZK_ERROR_COUNT,