This is an automated email from the ASF dual-hosted git repository. lahirujayathilake pushed a commit to branch access-integration in repository https://gitbox.apache.org/repos/asf/airavata-custos.git
commit fb429331976b840cd106a841b2f683e55ca42dff Author: lahiruj <[email protected]> AuthorDate: Fri Mar 27 14:19:17 2026 -0400 add Prometheus metrics, health indicator, instrument poller/worker, and structured JSON logging for prod and MDC packet correlation --- allocations/access-ci-service/pom.xml | 14 +- .../custos/access/ci/service/AmiePoller.java | 23 +++- .../ci/service/metrics/AmieHealthIndicator.java | 78 +++++++++++ .../access/ci/service/metrics/AmieMetrics.java | 124 +++++++++++++++++ .../service/worker/amie/ProcessingEventWorker.java | 28 +++- .../src/main/resources/application.yml | 15 ++ .../resources/distribution/conf/application.yml | 15 ++ .../resources/distribution/conf/logback-spring.xml | 104 ++++++++++---- .../src/main/resources/logback-spring.xml | 104 ++++++++++---- .../service/metrics/AmieHealthIndicatorTest.java | 118 ++++++++++++++++ .../access/ci/service/metrics/AmieMetricsTest.java | 129 +++++++++++++++++ .../worker/amie/ProcessingEventWorkerTest.java | 16 ++- compose/docker-compose.yml | 25 ++++ compose/grafana/dashboards/amie-service.json | 152 +++++++++++++++++++++ .../grafana/provisioning/dashboards/dashboards.yml | 12 ++ .../provisioning/datasources/prometheus.yml | 9 ++ compose/prometheus/prometheus.yml | 18 +++ pom.xml | 6 + 18 files changed, 925 insertions(+), 65 deletions(-) diff --git a/allocations/access-ci-service/pom.xml b/allocations/access-ci-service/pom.xml index 0b0f10b98..528083099 100644 --- a/allocations/access-ci-service/pom.xml +++ b/allocations/access-ci-service/pom.xml @@ -77,7 +77,19 @@ <groupId>com.google.protobuf</groupId> <artifactId>protobuf-java</artifactId> </dependency> - + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-actuator</artifactId> + </dependency> + <dependency> + <groupId>io.micrometer</groupId> + <artifactId>micrometer-registry-prometheus</artifactId> + </dependency> + <dependency> + <groupId>net.logstash.logback</groupId> + <artifactId>logstash-logback-encoder</artifactId> + </dependency> + <!-- Test dependencies --> <dependency> <groupId>org.springframework.boot</groupId> diff --git a/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/AmiePoller.java b/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/AmiePoller.java index 7a179aff8..ca3d07e50 100644 --- a/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/AmiePoller.java +++ b/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/AmiePoller.java @@ -20,6 +20,7 @@ package org.apache.custos.access.ci.service; import com.fasterxml.jackson.databind.JsonNode; import org.apache.custos.access.ci.service.client.amie.AmieClient; +import org.apache.custos.access.ci.service.metrics.AmieMetrics; import org.apache.custos.access.ci.service.model.amie.PacketEntity; import org.apache.custos.access.ci.service.model.amie.PacketStatus; import org.apache.custos.access.ci.service.model.amie.ProcessingEventEntity; @@ -30,6 +31,7 @@ import org.apache.custos.access.ci.service.repo.amie.ProcessingEventRepository; import org.apache.custos.access.ci.service.util.ProtoUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.slf4j.MDC; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Component; @@ -51,11 +53,14 @@ public class AmiePoller { private final AmieClient client; private final PacketRepository packetRepo; private final ProcessingEventRepository eventRepo; + private final AmieMetrics amieMetrics; - public AmiePoller(AmieClient client, PacketRepository packetRepo, ProcessingEventRepository eventRepo) { + public AmiePoller(AmieClient client, PacketRepository packetRepo, ProcessingEventRepository eventRepo, + AmieMetrics amieMetrics) { this.client = client; this.packetRepo = packetRepo; this.eventRepo = eventRepo; + this.amieMetrics = amieMetrics; } @@ -71,12 +76,24 @@ public class AmiePoller { } LOGGER.info("Found {} packets to process.", packets.size()); + amieMetrics.recordPollerFetch(packets.size()); + for (JsonNode packetNode : packets) { + String packetType = packetNode.path("type").asText(null); + long amiePacketRecId = packetNode.at("/header/packet_rec_id").asLong(-1); + + MDC.put("amieId", String.valueOf(amiePacketRecId)); + if (packetType != null) { + MDC.put("packetType", packetType); + } try { processIndividualPacket(packetNode); } catch (Exception e) { // If a malformed packet is found LOGGER.error("An unexpected error occurred while processing a packet. Raw packet: {}", packetNode.toString(), e); + } finally { + MDC.remove("amieId"); + MDC.remove("packetType"); } } } @@ -109,6 +126,8 @@ public class AmiePoller { newPacket.setReceivedAt(Instant.now()); packetRepo.save(newPacket); + amieMetrics.recordPacketReceived(packetType); + ProcessingEventEntity decodeEvent = new ProcessingEventEntity(); decodeEvent.setPacket(newPacket); decodeEvent.setType(ProcessingEventType.DECODE_PACKET); @@ -122,4 +141,4 @@ public class AmiePoller { } ); } -} \ No newline at end of file +} diff --git a/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/metrics/AmieHealthIndicator.java b/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/metrics/AmieHealthIndicator.java new file mode 100644 index 000000000..cb97d28b8 --- /dev/null +++ b/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/metrics/AmieHealthIndicator.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.custos.access.ci.service.metrics; + +import org.apache.custos.access.ci.service.config.AmieProperties; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.boot.actuate.health.AbstractHealthIndicator; +import org.springframework.boot.actuate.health.Health; +import org.springframework.http.ResponseEntity; +import org.springframework.stereotype.Component; +import org.springframework.web.client.RestClientException; +import org.springframework.web.client.RestTemplate; + +/** + * Actuator health indicator for the upstream AMIE API. + * + * <p>Performs an HTTP GET to the configured AMIE base URL and reports + * {@link Health#up()} when the endpoint responds, or {@link Health#down()} when + * a network or HTTP error occurs. + */ +@Component +public class AmieHealthIndicator extends AbstractHealthIndicator { + + private static final Logger LOGGER = LoggerFactory.getLogger(AmieHealthIndicator.class); + + private final AmieProperties amieProperties; + private final RestTemplate restTemplate; + + public AmieHealthIndicator(AmieProperties amieProperties, + @Qualifier("amieRestTemplate") RestTemplate restTemplate) { + super("AMIE API health check failed"); + this.amieProperties = amieProperties; + this.restTemplate = restTemplate; + } + + @Override + protected void doHealthCheck(Health.Builder builder) { + String baseUrl = amieProperties.getBaseUrl(); + String siteCode = amieProperties.getSiteCode(); + try { + ResponseEntity<String> response = restTemplate.getForEntity(baseUrl, String.class); + if (response.getStatusCode().is2xxSuccessful() || response.getStatusCode().is3xxRedirection()) { + builder.up() + .withDetail("url", baseUrl) + .withDetail("siteCode", siteCode) + .withDetail("httpStatus", response.getStatusCode().value()); + } else { + builder.down() + .withDetail("url", baseUrl) + .withDetail("siteCode", siteCode) + .withDetail("httpStatus", response.getStatusCode().value()); + } + } catch (RestClientException ex) { + LOGGER.warn("AMIE API health check failed for URL [{}]: {}", baseUrl, ex.getMessage()); + builder.down(ex) + .withDetail("url", baseUrl) + .withDetail("siteCode", siteCode); + } + } +} diff --git a/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/metrics/AmieMetrics.java b/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/metrics/AmieMetrics.java new file mode 100644 index 000000000..981775687 --- /dev/null +++ b/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/metrics/AmieMetrics.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.custos.access.ci.service.metrics; + +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Timer; +import org.springframework.stereotype.Component; + +/** + * Metrics for the AMIE packet processing pipeline. + * + * <p>All metric names follow the Prometheus naming convention: + * {@code amie_<subsystem>_<measurement>_<unit>}. + */ +@Component +public class AmieMetrics { + + private static final String PACKETS_RECEIVED_TOTAL = "amie_packets_received_total"; + private static final String PACKETS_PROCESSED_TOTAL = "amie_packets_processed_total"; + private static final String EVENTS_RETRY_TOTAL = "amie_events_retry_total"; + private static final String PROCESSING_DURATION_SECONDS = "amie_packet_processing_duration_seconds"; + private static final String POLLER_PACKETS_FETCHED = "amie_poller_packets_fetched"; + + private static final String TAG_TYPE = "type"; + private static final String TAG_OUTCOME = "outcome"; + private static final String TAG_HANDLER = "handler"; + + private final MeterRegistry meterRegistry; + + public AmieMetrics(MeterRegistry meterRegistry) { + this.meterRegistry = meterRegistry; + } + + /** + * Increments the counter tracking raw packets received from the AMIE API, tagged by packet type. + * + * @param packetType the AMIE packet type (e.g., "request_project_create") + */ + public void recordPacketReceived(String packetType) { + Counter.builder(PACKETS_RECEIVED_TOTAL) + .tag(TAG_TYPE, packetType) + .description("Total number of AMIE packets received from the API") + .register(meterRegistry) + .increment(); + } + + /** + * Increments the counter tracking processed packets, tagged by packet type and outcome. + * + * @param packetType the AMIE packet type (e.g., "request_project_create") + * @param outcome the processing outcome (e.g., "success", "failure") + */ + public void recordPacketProcessed(String packetType, String outcome) { + Counter.builder(PACKETS_PROCESSED_TOTAL) + .tag(TAG_TYPE, packetType) + .tag(TAG_OUTCOME, outcome) + .description("Total number of AMIE packets that completed processing") + .register(meterRegistry) + .increment(); + } + + /** + * Increments the counter tracking event retry attempts. + */ + public void recordRetry() { + Counter.builder(EVENTS_RETRY_TOTAL) + .description("Total number of AMIE processing event retry attempts") + .register(meterRegistry) + .increment(); + } + + /** + * Starts a timer sample for measuring packet processing duration. + * + * @return a {@link Timer.Sample} that must be stopped via {@link #stopProcessingTimer} + */ + public Timer.Sample startProcessingTimer() { + return Timer.start(meterRegistry); + } + + /** + * Stops a previously started timer sample and records the duration against the + * {@code amie_packet_processing_duration_seconds} timer, tagged by handler type. + * + * @param sample the sample returned by {@link #startProcessingTimer()} + * @param handlerType the name of the handler that processed the packet + */ + public void stopProcessingTimer(Timer.Sample sample, String handlerType) { + Timer timer = Timer.builder(PROCESSING_DURATION_SECONDS) + .tag(TAG_HANDLER, handlerType) + .description("Time taken to process an AMIE packet by handler type") + .register(meterRegistry); + sample.stop(timer); + } + + /** + * Increments the counter tracking the number of packets fetched during a poller run. + * + * @param count the number of packets fetched + */ + public void recordPollerFetch(int count) { + Counter.builder(POLLER_PACKETS_FETCHED) + .description("Total number of AMIE packets fetched by the poller") + .register(meterRegistry) + .increment(count); + } +} diff --git a/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/worker/amie/ProcessingEventWorker.java b/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/worker/amie/ProcessingEventWorker.java index bdc2d105b..9e5b5d268 100644 --- a/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/worker/amie/ProcessingEventWorker.java +++ b/allocations/access-ci-service/src/main/java/org/apache/custos/access/ci/service/worker/amie/ProcessingEventWorker.java @@ -19,7 +19,9 @@ package org.apache.custos.access.ci.service.worker.amie; import com.fasterxml.jackson.databind.ObjectMapper; +import io.micrometer.core.instrument.Timer; import org.apache.custos.access.ci.service.handler.amie.PacketRouter; +import org.apache.custos.access.ci.service.metrics.AmieMetrics; import org.apache.custos.access.ci.service.model.amie.PacketEntity; import org.apache.custos.access.ci.service.model.amie.PacketStatus; import org.apache.custos.access.ci.service.model.amie.ProcessingErrorEntity; @@ -31,6 +33,7 @@ import org.apache.custos.access.ci.service.repo.amie.ProcessingErrorRepository; import org.apache.custos.access.ci.service.repo.amie.ProcessingEventRepository; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.slf4j.MDC; import org.springframework.context.annotation.Lazy; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Component; @@ -76,6 +79,7 @@ public class ProcessingEventWorker { private final PacketRepository packetRepo; private final ProcessingErrorRepository errorRepo; private final PacketRouter router; + private final AmieMetrics amieMetrics; private final ObjectMapper objectMapper = new ObjectMapper(); private final ProcessingEventWorker self; @@ -83,11 +87,13 @@ public class ProcessingEventWorker { PacketRepository packetRepo, ProcessingErrorRepository errorRepo, PacketRouter router, + AmieMetrics amieMetrics, @Lazy ProcessingEventWorker self) { this.eventRepo = eventRepo; this.packetRepo = packetRepo; this.errorRepo = errorRepo; this.router = router; + this.amieMetrics = amieMetrics; this.self = self; } @@ -109,18 +115,33 @@ public class ProcessingEventWorker { for (ProcessingEventEntity event : eventsToProcess) { String eventId = event.getId(); + PacketEntity packet = event.getPacket(); + + MDC.put("packetId", packet.getId()); + MDC.put("amieId", String.valueOf(packet.getAmieId())); + MDC.put("packetType", packet.getType()); + + Timer.Sample timerSample = amieMetrics.startProcessingTimer(); try { self.executeEventInTransaction(event); } catch (Exception e) { LOGGER.error("Transaction failed for eventId [{}]. Opening recovery transaction to record failure.", eventId, e); + amieMetrics.stopProcessingTimer(timerSample, packet.getType()); try { self.recordFailureInNewTransaction(eventId, e); } catch (Exception recoveryEx) { LOGGER.error("CRITICAL: Recovery transaction also failed for eventId [{}]. " + "Event may remain stuck until the next worker cycle.", eventId, recoveryEx); } + continue; + } finally { + MDC.remove("packetId"); + MDC.remove("amieId"); + MDC.remove("packetType"); + MDC.remove("handler"); } + amieMetrics.stopProcessingTimer(timerSample, packet.getType()); } } @@ -136,7 +157,7 @@ public class ProcessingEventWorker { eventRepo.saveAndFlush(event); var packetJson = objectMapper.readTree(packet.getRawJson()); - router.route(packetJson, packet); + router.route(packetJson, packet, event.getId()); handleSuccess(event, packet); } @@ -166,10 +187,13 @@ public class ProcessingEventWorker { if (isRetryable) { Instant nextRetryAt = computeNextRetryAt(effectiveAttempts); event.setNextRetryAt(nextRetryAt); + amieMetrics.recordRetry(); + amieMetrics.recordPacketProcessed(packet.getType(), "retry_scheduled"); LOGGER.warn("Event [{}] for packet amie_id [{}] failed on attempt {}/{}. Scheduled for retry after [{}].", eventId, packet.getAmieId(), effectiveAttempts, MAX_ATTEMPTS, nextRetryAt); } else { event.setNextRetryAt(null); + amieMetrics.recordPacketProcessed(packet.getType(), "permanently_failed"); LOGGER.error("Event [{}] for packet amie_id [{}] is PERMANENTLY_FAILED after {} attempt(s). Manual intervention required.", eventId, packet.getAmieId(), effectiveAttempts); packet.setStatus(PacketStatus.FAILED); @@ -203,6 +227,8 @@ public class ProcessingEventWorker { packetRepo.save(packet); } + amieMetrics.recordPacketProcessed(packet.getType(), "succeeded"); + LOGGER.info("Successfully processed event [{}] for packet amie_id [{}].", event.getType(), packet.getAmieId()); } diff --git a/allocations/access-ci-service/src/main/resources/application.yml b/allocations/access-ci-service/src/main/resources/application.yml index 49506f8c6..0156e6caf 100644 --- a/allocations/access-ci-service/src/main/resources/application.yml +++ b/allocations/access-ci-service/src/main/resources/application.yml @@ -18,6 +18,8 @@ server: port: 8083 spring: + profiles: + active: dev application: name: access-ci-service datasource: @@ -55,6 +57,19 @@ logging: level: root: info +management: + endpoints: + web: + exposure: + include: health, prometheus, info + endpoint: + health: + show-details: when-authorized + prometheus: + metrics: + export: + enabled: true + springdoc: swagger-ui: title: CUSTOS ACCESS CI SERVICE API diff --git a/allocations/access-ci-service/src/main/resources/distribution/conf/application.yml b/allocations/access-ci-service/src/main/resources/distribution/conf/application.yml index 49506f8c6..fbd9d6ca4 100644 --- a/allocations/access-ci-service/src/main/resources/distribution/conf/application.yml +++ b/allocations/access-ci-service/src/main/resources/distribution/conf/application.yml @@ -18,6 +18,8 @@ server: port: 8083 spring: + profiles: + active: prod application: name: access-ci-service datasource: @@ -55,6 +57,19 @@ logging: level: root: info +management: + endpoints: + web: + exposure: + include: health, prometheus, info + endpoint: + health: + show-details: when-authorized + prometheus: + metrics: + export: + enabled: true + springdoc: swagger-ui: title: CUSTOS ACCESS CI SERVICE API diff --git a/allocations/access-ci-service/src/main/resources/distribution/conf/logback-spring.xml b/allocations/access-ci-service/src/main/resources/distribution/conf/logback-spring.xml index 295c31dae..42794e77f 100644 --- a/allocations/access-ci-service/src/main/resources/distribution/conf/logback-spring.xml +++ b/allocations/access-ci-service/src/main/resources/distribution/conf/logback-spring.xml @@ -23,33 +23,77 @@ <configuration> <include resource="org/springframework/boot/logging/logback/base.xml"/> <springProperty scope="context" name="appName" source="spring.application.name"/> - <!-- Rolling File Appender --> - <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender"> - <file>logs/custos-amie-decoder.log</file> - <encoder> - <pattern>%date{YYYY-MM-dd HH:mm:ss.SSS} app=${appName}, host=${HOSTNAME}, traceID=%X{traceId:-NONE}, - level=%-5level, [%thread] %logger{36} - %msg%n - </pattern> - </encoder> - <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy"> - <fileNamePattern>logs/%d{yyyy-MM}/custos-amie-decoder-%d{MM-dd-yyyy}-%i.log.gz</fileNamePattern> - <maxFileSize>10MB</maxFileSize> - <maxHistory>30</maxHistory> - <totalSizeCap>1GB</totalSizeCap> - </rollingPolicy> - </appender> - - <appender name="ASYNC_FILE" class="ch.qos.logback.classic.AsyncAppender"> - <appender-ref ref="FILE"/> - <queueSize>500</queueSize> - <discardingThreshold>0</discardingThreshold> - <includeCallerData>true</includeCallerData> - </appender> - - <logger name="org.hibernate" level="ERROR"/> - <logger name="org.springframework" level="INFO"/> - - <root level="INFO"> - <appender-ref ref="ASYNC_FILE"/> - </root> -</configuration> \ No newline at end of file + + <!-- ================================================================ --> + <!-- DEV / DEFAULT PROFILE — human-readable text format with MDC --> + <!-- ================================================================ --> + <springProfile name="default,dev"> + + <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender"> + <file>logs/custos-amie-decoder.log</file> + <encoder> + <pattern>%date{yyyy-MM-dd HH:mm:ss.SSS} app=${appName}, host=${HOSTNAME}, traceId=%X{traceId:-}, packetId=%X{packetId:-}, amieId=%X{amieId:-}, packetType=%X{packetType:-}, handler=%X{handler:-}, level=%-5level, [%thread] %logger{36} - %msg%n</pattern> + </encoder> + <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy"> + <fileNamePattern>logs/%d{yyyy-MM}/custos-amie-decoder-%d{MM-dd-yyyy}-%i.log.gz</fileNamePattern> + <maxFileSize>10MB</maxFileSize> + <maxHistory>30</maxHistory> + <totalSizeCap>1GB</totalSizeCap> + </rollingPolicy> + </appender> + + <appender name="ASYNC_FILE" class="ch.qos.logback.classic.AsyncAppender"> + <appender-ref ref="FILE"/> + <queueSize>500</queueSize> + <discardingThreshold>0</discardingThreshold> + <includeCallerData>true</includeCallerData> + </appender> + + <logger name="org.hibernate" level="ERROR"/> + <logger name="org.springframework" level="INFO"/> + + <root level="INFO"> + <appender-ref ref="ASYNC_FILE"/> + </root> + + </springProfile> + + <!-- ================================================================ --> + <!-- PROD PROFILE — structured JSON format for log aggregation --> + <!-- ================================================================ --> + <springProfile name="prod"> + + <appender name="FILE_JSON" class="ch.qos.logback.core.rolling.RollingFileAppender"> + <file>logs/custos-amie-decoder.log</file> + <encoder class="net.logstash.logback.encoder.LogstashEncoder"> + <includeMdcKeyName>packetId</includeMdcKeyName> + <includeMdcKeyName>amieId</includeMdcKeyName> + <includeMdcKeyName>packetType</includeMdcKeyName> + <includeMdcKeyName>handler</includeMdcKeyName> + <includeMdcKeyName>traceId</includeMdcKeyName> + </encoder> + <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy"> + <fileNamePattern>logs/%d{yyyy-MM}/custos-amie-decoder-%d{MM-dd-yyyy}-%i.log.gz</fileNamePattern> + <maxFileSize>10MB</maxFileSize> + <maxHistory>30</maxHistory> + <totalSizeCap>1GB</totalSizeCap> + </rollingPolicy> + </appender> + + <appender name="ASYNC_FILE" class="ch.qos.logback.classic.AsyncAppender"> + <appender-ref ref="FILE_JSON"/> + <queueSize>500</queueSize> + <discardingThreshold>0</discardingThreshold> + <includeCallerData>true</includeCallerData> + </appender> + + <logger name="org.hibernate" level="ERROR"/> + <logger name="org.springframework" level="INFO"/> + + <root level="INFO"> + <appender-ref ref="ASYNC_FILE"/> + </root> + + </springProfile> + +</configuration> diff --git a/allocations/access-ci-service/src/main/resources/logback-spring.xml b/allocations/access-ci-service/src/main/resources/logback-spring.xml index 295c31dae..42794e77f 100644 --- a/allocations/access-ci-service/src/main/resources/logback-spring.xml +++ b/allocations/access-ci-service/src/main/resources/logback-spring.xml @@ -23,33 +23,77 @@ <configuration> <include resource="org/springframework/boot/logging/logback/base.xml"/> <springProperty scope="context" name="appName" source="spring.application.name"/> - <!-- Rolling File Appender --> - <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender"> - <file>logs/custos-amie-decoder.log</file> - <encoder> - <pattern>%date{YYYY-MM-dd HH:mm:ss.SSS} app=${appName}, host=${HOSTNAME}, traceID=%X{traceId:-NONE}, - level=%-5level, [%thread] %logger{36} - %msg%n - </pattern> - </encoder> - <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy"> - <fileNamePattern>logs/%d{yyyy-MM}/custos-amie-decoder-%d{MM-dd-yyyy}-%i.log.gz</fileNamePattern> - <maxFileSize>10MB</maxFileSize> - <maxHistory>30</maxHistory> - <totalSizeCap>1GB</totalSizeCap> - </rollingPolicy> - </appender> - - <appender name="ASYNC_FILE" class="ch.qos.logback.classic.AsyncAppender"> - <appender-ref ref="FILE"/> - <queueSize>500</queueSize> - <discardingThreshold>0</discardingThreshold> - <includeCallerData>true</includeCallerData> - </appender> - - <logger name="org.hibernate" level="ERROR"/> - <logger name="org.springframework" level="INFO"/> - - <root level="INFO"> - <appender-ref ref="ASYNC_FILE"/> - </root> -</configuration> \ No newline at end of file + + <!-- ================================================================ --> + <!-- DEV / DEFAULT PROFILE — human-readable text format with MDC --> + <!-- ================================================================ --> + <springProfile name="default,dev"> + + <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender"> + <file>logs/custos-amie-decoder.log</file> + <encoder> + <pattern>%date{yyyy-MM-dd HH:mm:ss.SSS} app=${appName}, host=${HOSTNAME}, traceId=%X{traceId:-}, packetId=%X{packetId:-}, amieId=%X{amieId:-}, packetType=%X{packetType:-}, handler=%X{handler:-}, level=%-5level, [%thread] %logger{36} - %msg%n</pattern> + </encoder> + <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy"> + <fileNamePattern>logs/%d{yyyy-MM}/custos-amie-decoder-%d{MM-dd-yyyy}-%i.log.gz</fileNamePattern> + <maxFileSize>10MB</maxFileSize> + <maxHistory>30</maxHistory> + <totalSizeCap>1GB</totalSizeCap> + </rollingPolicy> + </appender> + + <appender name="ASYNC_FILE" class="ch.qos.logback.classic.AsyncAppender"> + <appender-ref ref="FILE"/> + <queueSize>500</queueSize> + <discardingThreshold>0</discardingThreshold> + <includeCallerData>true</includeCallerData> + </appender> + + <logger name="org.hibernate" level="ERROR"/> + <logger name="org.springframework" level="INFO"/> + + <root level="INFO"> + <appender-ref ref="ASYNC_FILE"/> + </root> + + </springProfile> + + <!-- ================================================================ --> + <!-- PROD PROFILE — structured JSON format for log aggregation --> + <!-- ================================================================ --> + <springProfile name="prod"> + + <appender name="FILE_JSON" class="ch.qos.logback.core.rolling.RollingFileAppender"> + <file>logs/custos-amie-decoder.log</file> + <encoder class="net.logstash.logback.encoder.LogstashEncoder"> + <includeMdcKeyName>packetId</includeMdcKeyName> + <includeMdcKeyName>amieId</includeMdcKeyName> + <includeMdcKeyName>packetType</includeMdcKeyName> + <includeMdcKeyName>handler</includeMdcKeyName> + <includeMdcKeyName>traceId</includeMdcKeyName> + </encoder> + <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy"> + <fileNamePattern>logs/%d{yyyy-MM}/custos-amie-decoder-%d{MM-dd-yyyy}-%i.log.gz</fileNamePattern> + <maxFileSize>10MB</maxFileSize> + <maxHistory>30</maxHistory> + <totalSizeCap>1GB</totalSizeCap> + </rollingPolicy> + </appender> + + <appender name="ASYNC_FILE" class="ch.qos.logback.classic.AsyncAppender"> + <appender-ref ref="FILE_JSON"/> + <queueSize>500</queueSize> + <discardingThreshold>0</discardingThreshold> + <includeCallerData>true</includeCallerData> + </appender> + + <logger name="org.hibernate" level="ERROR"/> + <logger name="org.springframework" level="INFO"/> + + <root level="INFO"> + <appender-ref ref="ASYNC_FILE"/> + </root> + + </springProfile> + +</configuration> diff --git a/allocations/access-ci-service/src/test/java/org/apache/custos/access/ci/service/metrics/AmieHealthIndicatorTest.java b/allocations/access-ci-service/src/test/java/org/apache/custos/access/ci/service/metrics/AmieHealthIndicatorTest.java new file mode 100644 index 000000000..2ec32e14c --- /dev/null +++ b/allocations/access-ci-service/src/test/java/org/apache/custos/access/ci/service/metrics/AmieHealthIndicatorTest.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.custos.access.ci.service.metrics; + +import org.apache.custos.access.ci.service.config.AmieProperties; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.springframework.boot.actuate.health.Health; +import org.springframework.boot.actuate.health.Status; +import org.springframework.http.HttpStatus; +import org.springframework.http.ResponseEntity; +import org.springframework.web.client.ResourceAccessException; +import org.springframework.web.client.RestTemplate; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +@Tag("unit") +class AmieHealthIndicatorTest { + + private static final String BASE_URL = "https://a3mdev.xsede.org/amie-api-test"; + private static final String SITE_CODE = "NEXUS"; + + @Mock + private RestTemplate restTemplate; + + private AmieProperties amieProperties; + private AmieHealthIndicator healthIndicator; + + @BeforeEach + void setUp() { + amieProperties = new AmieProperties(); + amieProperties.setBaseUrl(BASE_URL); + amieProperties.setSiteCode(SITE_CODE); + healthIndicator = new AmieHealthIndicator(amieProperties, restTemplate); + } + + @Test + void health_whenAmieApiReturns200_shouldBeUp() { + when(restTemplate.getForEntity(BASE_URL, String.class)) + .thenReturn(ResponseEntity.ok("OK")); + + Health health = healthIndicator.health(); + + assertThat(health.getStatus()).isEqualTo(Status.UP); + assertThat(health.getDetails()).containsEntry("url", BASE_URL); + assertThat(health.getDetails()).containsEntry("siteCode", SITE_CODE); + assertThat(health.getDetails()).containsEntry("httpStatus", 200); + } + + @Test + void health_whenAmieApiReturns302_shouldBeUp() { + when(restTemplate.getForEntity(BASE_URL, String.class)) + .thenReturn(ResponseEntity.status(HttpStatus.FOUND).build()); + + Health health = healthIndicator.health(); + + assertThat(health.getStatus()).isEqualTo(Status.UP); + assertThat(health.getDetails()).containsEntry("httpStatus", 302); + } + + @Test + void health_whenAmieApiReturns500_shouldBeDown() { + when(restTemplate.getForEntity(BASE_URL, String.class)) + .thenReturn(ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("error")); + + Health health = healthIndicator.health(); + + assertThat(health.getStatus()).isEqualTo(Status.DOWN); + assertThat(health.getDetails()).containsEntry("url", BASE_URL); + assertThat(health.getDetails()).containsEntry("httpStatus", 500); + } + + @Test + void health_whenRestClientExceptionThrown_shouldBeDown() { + when(restTemplate.getForEntity(BASE_URL, String.class)) + .thenThrow(new ResourceAccessException("Connection refused")); + + Health health = healthIndicator.health(); + + assertThat(health.getStatus()).isEqualTo(Status.DOWN); + assertThat(health.getDetails()).containsEntry("url", BASE_URL); + assertThat(health.getDetails()).containsEntry("siteCode", SITE_CODE); + } + + @Test + void health_whenNetworkTimeout_shouldIncludeUrlInDownDetails() { + when(restTemplate.getForEntity(BASE_URL, String.class)) + .thenThrow(new ResourceAccessException("Read timed out")); + + Health health = healthIndicator.health(); + + assertThat(health.getStatus()).isEqualTo(Status.DOWN); + assertThat(health.getDetails()).containsKey("url"); + assertThat(health.getDetails().get("url")).isEqualTo(BASE_URL); + } +} diff --git a/allocations/access-ci-service/src/test/java/org/apache/custos/access/ci/service/metrics/AmieMetricsTest.java b/allocations/access-ci-service/src/test/java/org/apache/custos/access/ci/service/metrics/AmieMetricsTest.java new file mode 100644 index 000000000..fe1e10721 --- /dev/null +++ b/allocations/access-ci-service/src/test/java/org/apache/custos/access/ci/service/metrics/AmieMetricsTest.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.custos.access.ci.service.metrics; + +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.Timer; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +@Tag("unit") +class AmieMetricsTest { + + private SimpleMeterRegistry registry; + private AmieMetrics amieMetrics; + + @BeforeEach + void setUp() { + registry = new SimpleMeterRegistry(); + amieMetrics = new AmieMetrics(registry); + } + + @Test + void recordPacketReceived_shouldIncrementCounterWithTypeTag() { + amieMetrics.recordPacketReceived("request_project_create"); + amieMetrics.recordPacketReceived("request_project_create"); + amieMetrics.recordPacketReceived("request_user_modify"); + + Counter createCounter = registry.find("amie_packets_received_total") + .tag("type", "request_project_create") + .counter(); + Counter modifyCounter = registry.find("amie_packets_received_total") + .tag("type", "request_user_modify") + .counter(); + + assertThat(createCounter).isNotNull(); + assertThat(createCounter.count()).isEqualTo(2.0); + assertThat(modifyCounter).isNotNull(); + assertThat(modifyCounter.count()).isEqualTo(1.0); + } + + @Test + void recordPacketProcessed_shouldIncrementCounterWithTypeAndOutcomeTags() { + amieMetrics.recordPacketProcessed("request_project_create", "success"); + amieMetrics.recordPacketProcessed("request_project_create", "failure"); + amieMetrics.recordPacketProcessed("request_project_create", "success"); + + Counter successCounter = registry.find("amie_packets_processed_total") + .tag("type", "request_project_create") + .tag("outcome", "success") + .counter(); + Counter failureCounter = registry.find("amie_packets_processed_total") + .tag("type", "request_project_create") + .tag("outcome", "failure") + .counter(); + + assertThat(successCounter).isNotNull(); + assertThat(successCounter.count()).isEqualTo(2.0); + assertThat(failureCounter).isNotNull(); + assertThat(failureCounter.count()).isEqualTo(1.0); + } + + @Test + void recordRetry_shouldIncrementRetryCounter() { + amieMetrics.recordRetry(); + amieMetrics.recordRetry(); + amieMetrics.recordRetry(); + + Counter retryCounter = registry.find("amie_events_retry_total").counter(); + + assertThat(retryCounter).isNotNull(); + assertThat(retryCounter.count()).isEqualTo(3.0); + } + + @Test + void startAndStopProcessingTimer_shouldRecordDurationWithHandlerTag() throws InterruptedException { + Timer.Sample sample = amieMetrics.startProcessingTimer(); + Thread.sleep(5); + amieMetrics.stopProcessingTimer(sample, "RequestProjectCreateHandler"); + + Timer timer = registry.find("amie_packet_processing_duration_seconds") + .tag("handler", "RequestProjectCreateHandler") + .timer(); + + assertThat(timer).isNotNull(); + assertThat(timer.count()).isEqualTo(1); + assertThat(timer.totalTime(java.util.concurrent.TimeUnit.MILLISECONDS)).isGreaterThan(0); + } + + @Test + void recordPollerFetch_shouldIncrementByCount() { + amieMetrics.recordPollerFetch(5); + amieMetrics.recordPollerFetch(3); + + Counter fetchCounter = registry.find("amie_poller_packets_fetched").counter(); + + assertThat(fetchCounter).isNotNull(); + assertThat(fetchCounter.count()).isEqualTo(8.0); + } + + @Test + void recordPollerFetch_withZeroCount_shouldNotChangeCounter() { + amieMetrics.recordPollerFetch(0); + + Counter fetchCounter = registry.find("amie_poller_packets_fetched").counter(); + + assertThat(fetchCounter).isNotNull(); + assertThat(fetchCounter.count()).isEqualTo(0.0); + } +} diff --git a/allocations/access-ci-service/src/test/java/org/apache/custos/access/ci/service/worker/amie/ProcessingEventWorkerTest.java b/allocations/access-ci-service/src/test/java/org/apache/custos/access/ci/service/worker/amie/ProcessingEventWorkerTest.java index 05e7f1df5..9a8ab9a0a 100644 --- a/allocations/access-ci-service/src/test/java/org/apache/custos/access/ci/service/worker/amie/ProcessingEventWorkerTest.java +++ b/allocations/access-ci-service/src/test/java/org/apache/custos/access/ci/service/worker/amie/ProcessingEventWorkerTest.java @@ -19,6 +19,7 @@ package org.apache.custos.access.ci.service.worker.amie; import org.apache.custos.access.ci.service.handler.amie.PacketRouter; +import org.apache.custos.access.ci.service.metrics.AmieMetrics; import org.apache.custos.access.ci.service.model.amie.PacketEntity; import org.apache.custos.access.ci.service.model.amie.PacketStatus; import org.apache.custos.access.ci.service.model.amie.ProcessingErrorEntity; @@ -43,7 +44,9 @@ import java.util.Optional; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyList; +import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -67,6 +70,9 @@ class ProcessingEventWorkerTest { @Mock private PacketRouter router; + @Mock + private AmieMetrics amieMetrics; + @Mock private ProcessingEventWorker self; @@ -74,7 +80,8 @@ class ProcessingEventWorkerTest { @BeforeEach void setUp() { - worker = new ProcessingEventWorker(eventRepo, packetRepo, errorRepo, router, self); + lenient().when(amieMetrics.startProcessingTimer()).thenReturn(null); + worker = new ProcessingEventWorker(eventRepo, packetRepo, errorRepo, router, amieMetrics, self); } // ------------------------------------------------------------------ @@ -103,6 +110,8 @@ class ProcessingEventWorkerTest { verify(self).executeEventInTransaction(event1); verify(self).executeEventInTransaction(event2); verify(self, never()).recordFailureInNewTransaction(any(), any()); + verify(amieMetrics, org.mockito.Mockito.times(2)).startProcessingTimer(); + verify(amieMetrics, org.mockito.Mockito.times(2)).stopProcessingTimer(any(), anyString()); } @Test @@ -164,6 +173,8 @@ class ProcessingEventWorkerTest { verify(eventRepo).save(event); verify(packetRepo, never()).save(any()); + verify(amieMetrics).recordRetry(); + verify(amieMetrics).recordPacketProcessed("request_account_create", "retry_scheduled"); ArgumentCaptor<ProcessingErrorEntity> errorCaptor = ArgumentCaptor.forClass(ProcessingErrorEntity.class); verify(errorRepo).save(errorCaptor.capture()); @@ -219,6 +230,9 @@ class ProcessingEventWorkerTest { assertThat(packet.getStatus()).isEqualTo(PacketStatus.FAILED); assertThat(packet.getLastError()).isEqualTo("final failure"); + verify(amieMetrics).recordPacketProcessed("request_account_create", "permanently_failed"); + verify(amieMetrics, never()).recordRetry(); + verify(eventRepo).save(event); verify(errorRepo).save(any(ProcessingErrorEntity.class)); } diff --git a/compose/docker-compose.yml b/compose/docker-compose.yml index 66cbe537b..7c97e12d8 100644 --- a/compose/docker-compose.yml +++ b/compose/docker-compose.yml @@ -57,6 +57,31 @@ services: ports: - 18080:8080 + prometheus: + image: prom/prometheus:latest + container_name: prometheus + restart: unless-stopped + ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + extra_hosts: + - "host.docker.internal:host-gateway" + + grafana: + image: grafana/grafana:latest + container_name: grafana + restart: unless-stopped + ports: + - "3000:3000" + environment: + GF_SECURITY_ADMIN_USER: admin + GF_SECURITY_ADMIN_PASSWORD: admin + GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH: /var/lib/grafana/dashboards/amie-service.json + volumes: + - ./grafana/provisioning:/etc/grafana/provisioning + - ./grafana/dashboards:/var/lib/grafana/dashboards + vault: image: vault:1.11.0 container_name: vault diff --git a/compose/grafana/dashboards/amie-service.json b/compose/grafana/dashboards/amie-service.json new file mode 100644 index 000000000..53072cb06 --- /dev/null +++ b/compose/grafana/dashboards/amie-service.json @@ -0,0 +1,152 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "type": "stat", + "title": "Total Packets Received", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "targets": [{ "expr": "sum(amie_packets_received_total)", "legendFormat": "Total" }], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{ "color": "blue", "value": null }] } } }, + "options": { "colorMode": "background", "textMode": "value_and_name" }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "stat", + "title": "Packets Succeeded", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "targets": [{ "expr": "sum(amie_packets_processed_total{outcome=\"succeeded\"})", "legendFormat": "Succeeded" }], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{ "color": "green", "value": null }] } } }, + "options": { "colorMode": "background", "textMode": "value_and_name" }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "stat", + "title": "Packets Failed", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "targets": [{ "expr": "sum(amie_packets_processed_total{outcome=~\"permanently_failed|failed\"})", "legendFormat": "Failed" }], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } } }, + "options": { "colorMode": "background", "textMode": "value_and_name" }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "stat", + "title": "Retries", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "targets": [{ "expr": "sum(amie_events_retry_total) or vector(0)", "legendFormat": "Retries" }], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 5 }] } } }, + "options": { "colorMode": "background", "textMode": "value_and_name" }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "timeseries", + "title": "Packets Processed Over Time", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "targets": [ + { "expr": "sum by (type) (rate(amie_packets_processed_total{outcome=\"succeeded\"}[5m]))", "legendFormat": "{{type}}" } + ], + "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "lineWidth": 2, "fillOpacity": 15, "pointSize": 5, "showPoints": "auto" } } }, + "options": { "tooltip": { "mode": "multi" }, "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "timeseries", + "title": "Failures & Retries Over Time", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "targets": [ + { "expr": "sum(rate(amie_packets_processed_total{outcome=~\"permanently_failed|retry_scheduled\"}[5m]))", "legendFormat": "Failures" }, + { "expr": "sum(rate(amie_events_retry_total[5m]))", "legendFormat": "Retries" } + ], + "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "lineWidth": 2, "fillOpacity": 15 }, "color": { "mode": "palette-classic" } } }, + "options": { "tooltip": { "mode": "multi" }, "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["sum"] } }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "piechart", + "title": "Packets by Type", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }, + "targets": [ + { "expr": "sum by (type) (amie_packets_received_total)", "legendFormat": "{{type}}" } + ], + "options": { "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }, "pieType": "donut" }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "piechart", + "title": "Processing Outcomes", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }, + "targets": [ + { "expr": "sum by (outcome) (amie_packets_processed_total)", "legendFormat": "{{outcome}}" } + ], + "fieldConfig": { "overrides": [ + { "matcher": { "id": "byName", "options": "succeeded" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "permanently_failed" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "retry_scheduled" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] }, + "options": { "legend": { "displayMode": "table", "placement": "right", "values": ["value", "percent"] }, "pieType": "donut" }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "bargauge", + "title": "Packets Received by Type", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }, + "targets": [ + { "expr": "sum by (type) (amie_packets_received_total)", "legendFormat": "{{type}}" } + ], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{ "color": "blue", "value": null }] } } }, + "options": { "displayMode": "gradient", "orientation": "horizontal", "showUnfilled": true }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "timeseries", + "title": "Poller: Packets Fetched per Cycle", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, + "targets": [ + { "expr": "rate(amie_poller_packets_fetched_total[5m]) * 60", "legendFormat": "Packets/min" } + ], + "fieldConfig": { "defaults": { "custom": { "drawStyle": "bars", "lineWidth": 1, "fillOpacity": 50 }, "color": { "fixedColor": "purple", "mode": "fixed" } } }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "timeseries", + "title": "Processing Duration (p50 / p95 / p99)", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum by (le) (rate(amie_packet_processing_duration_seconds_bucket[5m])))", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum by (le) (rate(amie_packet_processing_duration_seconds_bucket[5m])))", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum by (le) (rate(amie_packet_processing_duration_seconds_bucket[5m])))", "legendFormat": "p99" } + ], + "fieldConfig": { "defaults": { "unit": "s", "custom": { "drawStyle": "line", "lineWidth": 2, "fillOpacity": 10 } } }, + "options": { "tooltip": { "mode": "multi" }, "legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull"] } }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + }, + { + "type": "table", + "title": "Processed Count by Type & Outcome", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 28 }, + "targets": [ + { "expr": "sum by (type, outcome) (amie_packets_processed_total)", "legendFormat": "{{type}} — {{outcome}}", "format": "table", "instant": true } + ], + "transformations": [ + { "id": "organize", "options": { "excludeByName": { "Time": true }, "renameByName": { "type": "Packet Type", "outcome": "Outcome", "Value": "Count" } } } + ], + "fieldConfig": { "overrides": [ + { "matcher": { "id": "byName", "options": "Outcome" }, "properties": [{ "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "mappings", "value": [{ "type": "value", "options": { "succeeded": { "color": "green", "text": "Succeeded" }, "permanently_failed": { "color": "red", "text": "Failed" }, "retry_scheduled": { "color": "orange", "text": "Retried" } } }] }] } + ] }, + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" } + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": ["custos", "amie", "access-ci"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "ACCESS CI — AMIE Packet Processing", + "uid": "custos-amie-overview", + "version": 1 +} diff --git a/compose/grafana/provisioning/dashboards/dashboards.yml b/compose/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 000000000..3b928441c --- /dev/null +++ b/compose/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: 'Custos' + orgId: 1 + folder: 'Custos' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/compose/grafana/provisioning/datasources/prometheus.yml b/compose/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 000000000..bb009bb21 --- /dev/null +++ b/compose/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false diff --git a/compose/prometheus/prometheus.yml b/compose/prometheus/prometheus.yml new file mode 100644 index 000000000..dbeb9c5d1 --- /dev/null +++ b/compose/prometheus/prometheus.yml @@ -0,0 +1,18 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'access-ci-service' + metrics_path: '/actuator/prometheus' + static_configs: + - targets: ['host.docker.internal:8083'] + labels: + service: 'access-ci-service' + + - job_name: 'custos-signer' + metrics_path: '/metrics' + static_configs: + - targets: ['host.docker.internal:8084'] + labels: + service: 'custos-signer' diff --git a/pom.xml b/pom.xml index 707937e04..a3b6f42cd 100644 --- a/pom.xml +++ b/pom.xml @@ -141,6 +141,11 @@ <artifactId>jakarta.ws.rs-api</artifactId> <version>${jakarta.ws.version}</version> </dependency> + <dependency> + <groupId>net.logstash.logback</groupId> + <artifactId>logstash-logback-encoder</artifactId> + <version>${logstash.logback.version}</version> + </dependency> </dependencies> </dependencyManagement> @@ -204,6 +209,7 @@ <ssh.username>ubuntu</ssh.username> <jakarta.ws.version>4.0.0</jakarta.ws.version> + <logstash.logback.version>7.4</logstash.logback.version> <custos.dist.name>apache-airavata-custos-${project.version}</custos.dist.name> <access.service.dist.name>apache-airavata-custos-access-ci-service-${project.version}</access.service.dist.name> </properties>
