This is an automated email from the ASF dual-hosted git repository.

edcoleman pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/accumulo.git
commit 46ed92329cf83911c5b43f348658941e0979f3f9 Merge: 8da73ce467 96b86a5f62 Author: Ed Coleman <edcole...@apache.org> AuthorDate: Fri May 3 16:57:08 2024 +0000 Merge remote-tracking branch 'upstream/2.1' includes: - 96b86a5f62 - update for additional scan server metrics - c488f788ad - adds scan server metrics .../accumulo/core/metrics/MetricsProducer.java | 108 +++++++++++---------- .../org/apache/accumulo/tserver/ScanServer.java | 55 +++++++++-- .../apache/accumulo/tserver/ScanServerMetrics.java | 59 +++++++++++ .../accumulo/tserver/ThriftScanClientHandler.java | 4 +- .../tserver/metrics/TabletServerScanMetrics.java | 24 +++-- .../apache/accumulo/tserver/ScanServerTest.java | 9 ++ .../apache/accumulo/test/metrics/MetricsIT.java | 11 ++- 7 files changed, 192 insertions(+), 78 deletions(-) diff --cc core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java index 5e43b2a938,1bb2a1c10e..2fdf9172ab --- a/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java +++ b/core/src/main/java/org/apache/accumulo/core/metrics/MetricsProducer.java @@@ -599,9 -613,14 +600,9 @@@ public interface MetricsProducer String METRICS_MINC_PREFIX = "accumulo.tserver.compactions.minc."; String METRICS_MINC_QUEUED = METRICS_MINC_PREFIX + "queued"; String METRICS_MINC_RUNNING = METRICS_MINC_PREFIX + "running"; - - String METRICS_REPLICATION_PREFIX = "accumulo.replication."; - String METRICS_REPLICATION_QUEUE = METRICS_REPLICATION_PREFIX + "queue"; - String METRICS_REPLICATION_PENDING_FILES = METRICS_REPLICATION_PREFIX + "files.pending"; - String METRICS_REPLICATION_PEERS = METRICS_REPLICATION_PREFIX + "peers"; - String METRICS_REPLICATION_THREADS = METRICS_REPLICATION_PREFIX + "threads"; + String METRICS_MINC_PAUSED = METRICS_MINC_PREFIX + "paused"; - String METRICS_SCAN_PREFIX = "accumulo.tserver.scans."; + String METRICS_SCAN_PREFIX = "accumulo.scan."; String METRICS_SCAN_TIMES = METRICS_SCAN_PREFIX + "times"; String METRICS_SCAN_OPEN_FILES = 
METRICS_SCAN_PREFIX + "files.open"; String METRICS_SCAN_RESULTS = METRICS_SCAN_PREFIX + "result"; @@@ -609,10 -628,15 +610,17 @@@ String METRICS_SCAN_START = METRICS_SCAN_PREFIX + "start"; String METRICS_SCAN_CONTINUE = METRICS_SCAN_PREFIX + "continue"; String METRICS_SCAN_CLOSE = METRICS_SCAN_PREFIX + "close"; - String METRICS_SCAN_BUSY_TIMEOUT = METRICS_SCAN_PREFIX + "busy.timeout"; + String METRICS_SCAN_BUSY_TIMEOUT_COUNTER = METRICS_SCAN_PREFIX + "busy.timeout.count"; + String METRICS_SCAN_RESERVATION_TIMER = METRICS_SCAN_PREFIX + "reservation.timer"; + String METRICS_SCAN_QUERIES = METRICS_SCAN_PREFIX + "queries"; + String METRICS_SCAN_QUERY_SCAN_RESULTS = METRICS_SCAN_PREFIX + "query.results"; + String METRICS_SCAN_QUERY_SCAN_RESULTS_BYTES = METRICS_SCAN_PREFIX + "query.results.bytes"; + String METRICS_SCAN_SCANNED_ENTRIES = METRICS_SCAN_PREFIX + "query.scanned.entries"; + String METRICS_SCAN_PAUSED_FOR_MEM = METRICS_SCAN_PREFIX + "paused.for.memory"; + String METRICS_SCAN_RETURN_FOR_MEM = METRICS_SCAN_PREFIX + "return.early.for.memory"; + String METRICS_SCAN_TABLET_METADATA_CACHE = METRICS_SCAN_PREFIX + "tablet.metadata.cache"; + String METRICS_TSERVER_PREFIX = "accumulo.tserver."; String METRICS_TSERVER_ENTRIES = METRICS_TSERVER_PREFIX + "entries"; String METRICS_TSERVER_MEM_ENTRIES = METRICS_TSERVER_PREFIX + "entries.mem"; diff --cc server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java index ca4b28d06e,1e237e80e8..44ab680a98 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java @@@ -76,17 -70,20 +76,18 @@@ import org.apache.accumulo.core.metadat import org.apache.accumulo.core.metadata.StoredTabletFile; import org.apache.accumulo.core.metadata.schema.Ample; import org.apache.accumulo.core.metadata.schema.TabletMetadata; +import org.apache.accumulo.core.metadata.schema.TabletsMetadata; import 
org.apache.accumulo.core.metrics.MetricsInfo; import org.apache.accumulo.core.securityImpl.thrift.TCredentials; -import org.apache.accumulo.core.spi.scan.ScanServerSelector; -import org.apache.accumulo.core.tabletserver.thrift.ActiveScan; +import org.apache.accumulo.core.tabletscan.thrift.ActiveScan; ++import org.apache.accumulo.core.tabletscan.thrift.ScanServerBusyException; +import org.apache.accumulo.core.tabletscan.thrift.TSampleNotPresentException; +import org.apache.accumulo.core.tabletscan.thrift.TSamplerConfiguration; +import org.apache.accumulo.core.tabletscan.thrift.TabletScanClientService; +import org.apache.accumulo.core.tabletscan.thrift.TooManyFilesException; import org.apache.accumulo.core.tabletserver.thrift.NoSuchScanIDException; import org.apache.accumulo.core.tabletserver.thrift.NotServingTabletException; -import org.apache.accumulo.core.tabletserver.thrift.ScanServerBusyException; -import org.apache.accumulo.core.tabletserver.thrift.TSampleNotPresentException; -import org.apache.accumulo.core.tabletserver.thrift.TSamplerConfiguration; -import org.apache.accumulo.core.tabletserver.thrift.TabletScanClientService; -import org.apache.accumulo.core.tabletserver.thrift.TooManyFilesException; -import org.apache.accumulo.core.trace.thrift.TInfo; import org.apache.accumulo.core.util.Halt; -import org.apache.accumulo.core.util.HostAndPort; import org.apache.accumulo.core.util.UtilWaitThread; import org.apache.accumulo.core.util.threads.ThreadPools; import org.apache.accumulo.server.AbstractServer; @@@ -125,8 -122,9 +126,10 @@@ import com.github.benmanes.caffeine.cac import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.Sets; +import com.google.common.net.HostAndPort; + import io.micrometer.core.instrument.Tag; + public class ScanServer extends AbstractServer implements TabletScanClientService.Iface, TabletHostingServer { @@@ -375,10 -376,12 +379,12 @@@ MetricsInfo metricsInfo 
= getContext().getMetricsInfo(); metricsInfo.addServiceTags(getApplicationName(), clientAddress); + metricsInfo.addCommonTags(List.of(Tag.of("resource.group", groupName))); scanMetrics = new TabletServerScanMetrics(); + scanServerMetrics = new ScanServerMetrics(tabletMetadataCache); - metricsInfo.addMetricsProducers(this, scanMetrics); - metricsInfo.addMetricsProducers(scanMetrics, scanServerMetrics); ++ metricsInfo.addMetricsProducers(this, scanMetrics, scanServerMetrics); metricsInfo.init(); // We need to set the compaction manager so that we don't get an NPE in CompactableImpl.close diff --cc server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java index 9a1faa6261,8e066dd7f7..09f8431bb4 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java @@@ -39,9 -39,7 +39,9 @@@ public class TabletServerScanMetrics im private Counter startScanCalls; private Counter continueScanCalls; private Counter closeScanCalls; - private Counter busyTimeoutReturned; + private Counter busyTimeoutCount; + private Counter pausedForMemory; + private Counter earlyReturnForMemory; private final LongAdder lookupCount = new LongAdder(); private final LongAdder queryResultCount = new LongAdder(); @@@ -72,10 -70,10 +72,6 @@@ return this.queryResultBytes.sum(); } -- public void incrementScannedCount(long amount) { -- this.scannedCount.add(amount); -- } -- public LongAdder getScannedCounter() { return this.scannedCount; } @@@ -116,18 -114,10 +112,18 @@@ closeScanCalls.increment(value); } - public void incrementScanBusyTimeout(double value) { - busyTimeoutReturned.increment(value); + public void incrementBusy(double value) { + busyTimeoutCount.increment(value); } + public void incrementScanPausedForLowMemory() { + pausedForMemory.increment(); + } + + public void incrementEarlyReturnForLowMemory() { + 
earlyReturnForMemory.increment(); + } + @Override public void registerMetrics(MeterRegistry registry) { Gauge.builder(METRICS_SCAN_OPEN_FILES, openFiles::get) @@@ -143,23 -133,20 +139,25 @@@ .description("calls to continue a scan / multiscan").register(registry); closeScanCalls = Counter.builder(METRICS_SCAN_CLOSE) .description("calls to close a scan / multiscan").register(registry); - busyTimeoutReturned = Counter.builder(METRICS_SCAN_BUSY_TIMEOUT) - .description("times that a scan has timed out in the queue").register(registry); - Gauge.builder(METRICS_TSERVER_QUERIES, this, TabletServerScanMetrics::getLookupCount) + busyTimeoutCount = Counter.builder(METRICS_SCAN_BUSY_TIMEOUT_COUNTER) + .description("The number of scans where a busy timeout happened").register(registry); + Gauge.builder(METRICS_SCAN_QUERIES, this, TabletServerScanMetrics::getLookupCount) .description("Number of queries").register(registry); - Gauge.builder(METRICS_TSERVER_SCAN_RESULTS, this, TabletServerScanMetrics::getQueryResultCount) + Gauge + .builder(METRICS_SCAN_QUERY_SCAN_RESULTS, this, + TabletServerScanMetrics::getQueryResultCount) .description("Query rate (entries/sec)").register(registry); Gauge - .builder(METRICS_TSERVER_SCAN_RESULTS_BYTES, this, + .builder(METRICS_SCAN_QUERY_SCAN_RESULTS_BYTES, this, TabletServerScanMetrics::getQueryByteCount) .description("Query rate (bytes/sec)").register(registry); - Gauge.builder(METRICS_TSERVER_SCANNED_ENTRIES, this, TabletServerScanMetrics::getScannedCount) + Gauge.builder(METRICS_SCAN_SCANNED_ENTRIES, this, TabletServerScanMetrics::getScannedCount) .description("Scanned rate").register(registry); + pausedForMemory = Counter.builder(METRICS_SCAN_PAUSED_FOR_MEM) + .description("scan paused due to server being low on memory").register(registry); + earlyReturnForMemory = Counter.builder(METRICS_SCAN_RETURN_FOR_MEM) + .description("scan returned results early due to server being low on memory") + .register(registry); } } diff --cc 
test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java index 4b92480f19,55622d0793..f2be6e71a5 --- a/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java +++ b/test/src/main/java/org/apache/accumulo/test/metrics/MetricsIT.java @@@ -99,10 -99,12 +99,13 @@@ public class MetricsIT extends Configur doWorkToGenerateMetrics(); cluster.stop(); -- Set<String> unexpectedMetrics = Set.of(METRICS_SCAN_YIELDS, METRICS_UPDATE_ERRORS, - METRICS_COMPACTOR_MAJC_STUCK, METRICS_SCAN_BUSY_TIMEOUT, METRICS_SCAN_PAUSED_FOR_MEM, - METRICS_SCAN_RETURN_FOR_MEM, METRICS_MINC_PAUSED, METRICS_MAJC_PAUSED); - Set<String> flakyMetrics = Set.of(METRICS_GC_WAL_ERRORS, METRICS_FATE_TYPE_IN_PROGRESS); - METRICS_REPLICATION_QUEUE, METRICS_COMPACTOR_MAJC_STUCK, METRICS_SCAN_BUSY_TIMEOUT_COUNTER); - // add sserver as flaky until scan server included in mini tests. ++ Set<String> unexpectedMetrics = ++ Set.of(METRICS_SCAN_YIELDS, METRICS_UPDATE_ERRORS, METRICS_COMPACTOR_MAJC_STUCK, ++ METRICS_SCAN_BUSY_TIMEOUT_COUNTER, METRICS_SCAN_PAUSED_FOR_MEM, ++ METRICS_SCAN_RETURN_FOR_MEM, METRICS_MINC_PAUSED, METRICS_MAJC_PAUSED); + Set<String> flakyMetrics = Set.of(METRICS_GC_WAL_ERRORS, METRICS_FATE_TYPE_IN_PROGRESS, + METRICS_SCAN_BUSY_TIMEOUT_COUNTER, METRICS_SCAN_RESERVATION_TIMER, + METRICS_SCAN_TABLET_METADATA_CACHE); Map<String,String> expectedMetricNames = this.getMetricFields(); flakyMetrics.forEach(expectedMetricNames::remove); // might not see these