This is an automated email from the ASF dual-hosted git repository. jerryshao pushed a commit to branch 1.2.0-hotfix in repository https://gitbox.apache.org/repos/asf/gravitino.git
commit 689ad2cc6ffc1ec2295e3c305c367fc3aaf74c2b Author: Mark Hoerth <[email protected]> AuthorDate: Wed Apr 22 06:51:56 2026 -0700 [#10839] feat(server): Add health check endpoints (liveness, readiness, aggregate) (#10840) Adds three health check endpoints following MicroProfile Health semantics, plus root-level aliases for them: - `GET /api/health/live` — liveness, returns 200 when the HTTP thread is responsive - `GET /api/health/ready` — readiness, returns 200 when the entity store is reachable, 503 otherwise - `GET /api/health` — aggregate, returns 200 when both checks pass, 503 otherwise - `GET /health`, `GET /health/live`, `GET /health/ready` — root-level aliases forwarding to the corresponding `/api/health/*` endpoints, for enterprise global traffic manager (GTM) standards that hardcode well-known root paths Also exempts exactly `/health`, `/health/*`, `/api/health`, and `/api/health/*` from `AuthenticationFilter` so Kubernetes probes and load balancers can reach the endpoints without credentials under any configured authenticator. Paths that merely share the prefix (e.g. `/api/healthcheck`) still require authentication. Adds one new optional server configuration key: - `gravitino.server.health.entityStore.probeTimeoutMs` (default: 2000 ms) — timeout for the entity-store readiness probe used by `/api/health/ready` Modern Java services (Apache Polaris, Spring Boot, Quarkus, Micronaut) ship these endpoints by default. Gravitino runs on raw Jetty and does not, which blocks standard Kubernetes probe configuration, load balancer health checks, and enterprise GTM integration. This is a parity gap that surfaces on day one of enterprise deployments. Fix: #10839 User-facing change: yes — adds three new public REST endpoints under `/api/health` (plus the root-level `/health` aliases and the optional configuration key above). No existing endpoint behavior is changed. Testing: - New unit tests in `TestHealthOperations` covering liveness, readiness (happy path, entity store uninitialized, entity store throws, timeout), and aggregate status. - New unit tests in `TestHealthAliasServlet` verifying forward success for `/health`, `/health/live`, `/health/ready` and the null-dispatcher 503 path. 
- New unit tests in `TestAuthenticationFilter` verifying health paths bypass authentication and non-health paths (including paths that merely contain "health", e.g. `/api/metalakes/health_metalake`, or share the prefix, e.g. `/api/healthcheck`) continue to require authentication. - Manual verification against a running Gravitino instance: all three endpoints return 200 with expected JSON bodies; `/api/version` continues to work unchanged. Notes for reviewers: - **Auth filter exemption is hardcoded** to exactly `/health`, `/health/*`, `/api/health`, and `/api/health/*`, matching how Spring Boot and Quarkus hardcode their well-known health paths. Happy to make this config-driven in a follow-up if preferred. - **Bounded timeout on entity store probe.** The readiness check runs `EntityStore.exists()` with a configurable ceiling (default 2 s) via a dedicated `ThreadPoolExecutor` to prevent a hanging JDBC connection from tying up Jetty worker threads. `RejectedExecutionException` is caught and reported as DOWN, so a saturated probe pool never blocks the request thread. - **Response body format.** `HealthResponse` extends `BaseResponse` and keeps `code: 0` even in 503 responses — the HTTP status is the probe signal, the body is diagnostic. This is intentional and differs from `ErrorResponse` usage. - **`/api/version` is conventionally also unauthenticated** across most Java services (used for client discovery before auth negotiation). Not changed in this PR but worth considering as a follow-up. 
--------- Co-authored-by: Mark Hoerth <[email protected]> Co-authored-by: Jerry Shao <[email protected]> Co-authored-by: Claude Sonnet 4.6 <[email protected]> --- .../org/apache/gravitino/dto/HealthCheckDTO.java | 70 ++++++ .../gravitino/dto/responses/HealthResponse.java | 82 +++++++ conf/gravitino.conf.template | 2 + docs/open-api/health.yaml | 168 ++++++++++++++ docs/open-api/openapi.yaml | 9 + .../org/apache/gravitino/server/ServerConfig.java | 9 + .../authentication/AuthenticationFilter.java | 43 ++++ .../authentication/TestAuthenticationFilter.java | 60 +++++ .../apache/gravitino/server/GravitinoServer.java | 6 + .../gravitino/server/web/HealthAliasServlet.java | 49 +++++ .../server/web/rest/HealthOperations.java | 244 +++++++++++++++++++++ .../server/web/TestHealthAliasServlet.java | 66 ++++++ .../server/web/rest/TestHealthOperations.java | 149 +++++++++++++ 13 files changed, 957 insertions(+) diff --git a/common/src/main/java/org/apache/gravitino/dto/HealthCheckDTO.java b/common/src/main/java/org/apache/gravitino/dto/HealthCheckDTO.java new file mode 100644 index 0000000000..9071c36aa1 --- /dev/null +++ b/common/src/main/java/org/apache/gravitino/dto/HealthCheckDTO.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.gravitino.dto; + +import com.fasterxml.jackson.annotation.JsonProperty; +import java.util.Collections; +import java.util.Map; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; + +/** Represents the result of a single health check (e.g. entity store, HTTP server). */ +@Getter +@EqualsAndHashCode +@ToString +public class HealthCheckDTO { + + /** Status of a health check, matching MicroProfile Health vocabulary. */ + public enum Status { + /** The component is healthy and functioning normally. */ + UP, + /** The component is unhealthy or unreachable. */ + DOWN + } + + @JsonProperty("name") + private final String name; + + @JsonProperty("status") + private final Status status; + + @JsonProperty("details") + private final Map<String, String> details; + + /** + * Constructor for HealthCheckDTO. + * + * @param name the name of the check (e.g. "entityStore") + * @param status the status of the check + * @param details optional diagnostic details; null is coerced to empty map + */ + public HealthCheckDTO(String name, Status status, Map<String, String> details) { + this.name = name; + this.status = status; + this.details = details == null ? Collections.emptyMap() : details; + } + + /** Default constructor for HealthCheckDTO. (Used for Jackson deserialization.) */ + public HealthCheckDTO() { + this.name = null; + this.status = null; + this.details = Collections.emptyMap(); + } +} diff --git a/common/src/main/java/org/apache/gravitino/dto/responses/HealthResponse.java b/common/src/main/java/org/apache/gravitino/dto/responses/HealthResponse.java new file mode 100644 index 0000000000..eb1db2d8a1 --- /dev/null +++ b/common/src/main/java/org/apache/gravitino/dto/responses/HealthResponse.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.gravitino.dto.responses; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Preconditions; +import java.util.Collections; +import java.util.List; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; +import org.apache.gravitino.dto.HealthCheckDTO; + +/** Represents a response containing aggregate health status and per-check results. */ +@Getter +@EqualsAndHashCode(callSuper = true) +@ToString +public class HealthResponse extends BaseResponse { + + @JsonProperty("status") + private final HealthCheckDTO.Status status; + + @JsonProperty("checks") + private final List<HealthCheckDTO> checks; + + /** + * Constructor for HealthResponse. + * + * @param status aggregate health status + * @param checks per-check results; null is coerced to empty list + */ + public HealthResponse(HealthCheckDTO.Status status, List<HealthCheckDTO> checks) { + super(0); + this.status = status; + this.checks = checks == null ? Collections.emptyList() : checks; + } + + /** Default constructor for HealthResponse. (Used for Jackson deserialization.) 
*/ + public HealthResponse() { + super(); + this.status = null; + this.checks = Collections.emptyList(); + } + + /** + * Validates the response data. + * + * @throws IllegalArgumentException if status or checks are not set. + */ + @Override + public void validate() throws IllegalArgumentException { + super.validate(); + Preconditions.checkArgument(status != null, "status must be non-null"); + } + + /** + * Returns true if the aggregate status is UP. + * + * @return true if the aggregate status is UP, false otherwise + */ + @JsonIgnore + public boolean isUp() { + return status == HealthCheckDTO.Status.UP; + } +} diff --git a/conf/gravitino.conf.template b/conf/gravitino.conf.template index 9e340f5a6e..e1c9ba5788 100644 --- a/conf/gravitino.conf.template +++ b/conf/gravitino.conf.template @@ -19,6 +19,8 @@ # THE CONFIGURATION FOR Gravitino SERVER gravitino.server.shutdown.timeout = 3000 +# Timeout in milliseconds for the entity-store readiness probe used by /api/health/ready +# gravitino.server.health.entityStore.probeTimeoutMs = 2000 # THE CONFIGURATION FOR Gravitino WEB SERVER # The host name of the built-in web server diff --git a/docs/open-api/health.yaml b/docs/open-api/health.yaml new file mode 100644 index 0000000000..d6956446bd --- /dev/null +++ b/docs/open-api/health.yaml @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +--- + +paths: + /health: + get: + tags: + - health + summary: Aggregate health + operationId: aggregateHealth + description: > + Returns aggregate health status combining liveness and readiness checks. + Returns 200 when all checks pass; 503 when any check fails. + This endpoint is exempt from authentication so probes can reach it without credentials. + security: [] + responses: + "200": + $ref: "#/components/responses/HealthResponse" + "503": + $ref: "#/components/responses/HealthResponse" + "4XX": + $ref: "./openapi.yaml#/components/responses/BadRequestErrorResponse" + "5XX": + $ref: "./openapi.yaml#/components/responses/ServerErrorResponse" + + /health/live: + get: + tags: + - health + summary: Liveness check + operationId: livenessCheck + description: > + Returns 200 as long as the HTTP server thread can respond. + Use this for Kubernetes liveness probes to determine whether to restart a pod. + This endpoint is exempt from authentication so probes can reach it without credentials. + security: [] + responses: + "200": + $ref: "#/components/responses/HealthResponse" + "4XX": + $ref: "./openapi.yaml#/components/responses/BadRequestErrorResponse" + "5XX": + $ref: "./openapi.yaml#/components/responses/ServerErrorResponse" + + /health/ready: + get: + tags: + - health + summary: Readiness check + operationId: readinessCheck + description: > + Returns 200 when the entity store is reachable within the configured probe timeout. + Returns 503 when the entity store is unavailable or slow. + Use this for Kubernetes readiness probes to control traffic routing. 
+ This endpoint is exempt from authentication so probes can reach it without credentials. + security: [] + responses: + "200": + $ref: "#/components/responses/HealthResponse" + "503": + $ref: "#/components/responses/HealthResponse" + "4XX": + $ref: "./openapi.yaml#/components/responses/BadRequestErrorResponse" + "5XX": + $ref: "./openapi.yaml#/components/responses/ServerErrorResponse" + +components: + responses: + HealthResponse: + description: Health check result + content: + application/vnd.gravitino.v1+json: + schema: + $ref: "#/components/schemas/HealthResponse" + examples: + HealthUp: + $ref: "#/components/examples/HealthUp" + HealthDown: + $ref: "#/components/examples/HealthDown" + + schemas: + HealthResponse: + type: object + required: + - code + - status + - checks + properties: + code: + type: integer + description: Response code, 0 for success + example: 0 + status: + type: string + enum: + - UP + - DOWN + description: Aggregate health status + checks: + type: array + description: Per-component check results + items: + $ref: "#/components/schemas/HealthCheck" + + HealthCheck: + type: object + required: + - name + - status + properties: + name: + type: string + description: Name of the component being checked + example: entityStore + status: + type: string + enum: + - UP + - DOWN + description: Status of the component + details: + type: object + additionalProperties: + type: string + description: Optional diagnostic details about the check result + + examples: + HealthUp: + value: + code: 0 + status: UP + checks: + - name: httpServer + status: UP + details: {} + - name: entityStore + status: UP + details: {} + + HealthDown: + value: + code: 0 + status: DOWN + checks: + - name: httpServer + status: UP + details: {} + - name: entityStore + status: DOWN + details: + reason: timeout diff --git a/docs/open-api/openapi.yaml b/docs/open-api/openapi.yaml index 216b8a1733..b498c6409b 100644 --- a/docs/open-api/openapi.yaml +++ b/docs/open-api/openapi.yaml @@ -48,6 
+48,15 @@ security: - BasicAuth: [] paths: + /health: + $ref: "./health.yaml#/paths/~1health" + + /health/live: + $ref: "./health.yaml#/paths/~1health~1live" + + /health/ready: + $ref: "./health.yaml#/paths/~1health~1ready" + /metalakes: # escape the path name by replacing `/` with `~1` $ref: "./metalakes.yaml#/paths/~1metalakes" diff --git a/server-common/src/main/java/org/apache/gravitino/server/ServerConfig.java b/server-common/src/main/java/org/apache/gravitino/server/ServerConfig.java index cb6a97587f..532f334c76 100644 --- a/server-common/src/main/java/org/apache/gravitino/server/ServerConfig.java +++ b/server-common/src/main/java/org/apache/gravitino/server/ServerConfig.java @@ -35,6 +35,15 @@ public class ServerConfig extends Config { .checkValue(value -> value > 0, ConfigConstants.POSITIVE_NUMBER_ERROR_MSG) .createWithDefault(3 * 1000); + public static final ConfigEntry<Long> HEALTH_ENTITY_STORE_PROBE_TIMEOUT_MS = + new ConfigBuilder("gravitino.server.health.entityStore.probeTimeoutMs") + .doc( + "Timeout in milliseconds for the entity-store readiness probe used by /api/health/ready") + .version(ConfigConstants.VERSION_1_3_0) + .longConf() + .checkValue(value -> value > 0, ConfigConstants.POSITIVE_NUMBER_ERROR_MSG) + .createWithDefault(2000L); + public ServerConfig(boolean loadDefaults) { super(loadDefaults); } diff --git a/server-common/src/main/java/org/apache/gravitino/server/authentication/AuthenticationFilter.java b/server-common/src/main/java/org/apache/gravitino/server/authentication/AuthenticationFilter.java index 7df4e68d6a..4919f7963b 100644 --- a/server-common/src/main/java/org/apache/gravitino/server/authentication/AuthenticationFilter.java +++ b/server-common/src/main/java/org/apache/gravitino/server/authentication/AuthenticationFilter.java @@ -55,6 +55,13 @@ public class AuthenticationFilter implements Filter { @Override public void doFilter(ServletRequest request, ServletResponse response, FilterChain chain) throws IOException, 
ServletException { + // Health check endpoints must be reachable without credentials so that Kubernetes + // probes, load balancers, and global traffic managers can monitor server availability. + // See org.apache.gravitino.server.web.rest.HealthOperations. + if (isHealthCheckRequest(request)) { + chain.doFilter(request, response); + return; + } try { List<Authenticator> authenticators; if (filterAuthenticators == null || filterAuthenticators.isEmpty()) { @@ -106,6 +113,42 @@ public class AuthenticationFilter implements Filter { } } + /** + * Sends an error response when authentication fails. Subclasses can override this to customize + * the error response format (e.g., Iceberg REST server returns JSON error bodies). + * + * <p>TODO: Gravitino server should override this method to return a correct JSON response + * following the Gravitino error response spec. + * + * @param response the HTTP servlet response + * @param exception the authentication exception + */ + protected void sendAuthErrorResponse(HttpServletResponse response, Exception exception) + throws IOException { + int status = + exception instanceof UnauthorizedException + ? HttpServletResponse.SC_UNAUTHORIZED + : HttpServletResponse.SC_INTERNAL_SERVER_ERROR; + response.sendError(status, exception.getMessage()); + } + + private static boolean isHealthCheckRequest(ServletRequest request) { + if (!(request instanceof HttpServletRequest)) { + return false; + } + String path = ((HttpServletRequest) request).getRequestURI(); + if (path == null) { + return false; + } + // Also match /health and /health/* — the root-level aliases that forward to /api/health/*. + // During a forward, getRequestURI() returns the original URI, not the forwarded target. 
+ return path.equals("/health") + || path.startsWith("/health/") + || path.equals("/api/health") + || path.startsWith("/api/health/"); + } + + @Override public void destroy() {} } diff --git a/server-common/src/test/java/org/apache/gravitino/server/authentication/TestAuthenticationFilter.java b/server-common/src/test/java/org/apache/gravitino/server/authentication/TestAuthenticationFilter.java index 0ef48c922c..7366b89db6 100644 --- a/server-common/src/test/java/org/apache/gravitino/server/authentication/TestAuthenticationFilter.java +++ b/server-common/src/test/java/org/apache/gravitino/server/authentication/TestAuthenticationFilter.java @@ -122,4 +122,64 @@ public class TestAuthenticationFilter { filter.doFilter(mockRequest, mockResponse, mockChain); verify(mockResponse).sendError(HttpServletResponse.SC_UNAUTHORIZED, "UNAUTHORIZED"); } + + @Test + public void testDoFilterBypassesAuthenticationForHealthEndpoints() + throws ServletException, IOException { + // /health, /health/live, /health/ready are root-level aliases; during a Jetty forward + // getRequestURI() returns the original URI so the bypass must also match /health/* directly. + String[] healthPaths = { + "/health", + "/health/live", + "/health/ready", + "/api/health", + "/api/health/", + "/api/health/live", + "/api/health/ready" + }; + for (String path : healthPaths) { + Authenticator authenticator = mock(Authenticator.class); + AuthenticationFilter filter = new AuthenticationFilter(Lists.newArrayList(authenticator)); + FilterChain mockChain = mock(FilterChain.class); + HttpServletRequest mockRequest = mock(HttpServletRequest.class); + HttpServletResponse mockResponse = mock(HttpServletResponse.class); + when(mockRequest.getRequestURI()).thenReturn(path); + + filter.doFilter(mockRequest, mockResponse, mockChain); + + // Chain proceeds, no error response, and authenticator is never consulted. 
+ verify(mockChain).doFilter(mockRequest, mockResponse); + verify(mockResponse, never()).sendError(anyInt(), anyString()); + verify(authenticator, never()).supportsToken(any()); + } + } + + @Test + public void testDoFilterDoesNotBypassAuthenticationForNonHealthPaths() + throws ServletException, IOException { + // Regression guard against an overly broad exemption. Paths that merely contain + // "health" or share a prefix with "/api/health" must still be authenticated. + String[] nonHealthPaths = { + "/api/metalakes/health_metalake", "/api/healthcheck", "/api/version", "/api/metalakes" + }; + for (String path : nonHealthPaths) { + Authenticator authenticator = mock(Authenticator.class); + AuthenticationFilter filter = new AuthenticationFilter(Lists.newArrayList(authenticator)); + FilterChain mockChain = mock(FilterChain.class); + HttpServletRequest mockRequest = mock(HttpServletRequest.class); + HttpServletResponse mockResponse = mock(HttpServletResponse.class); + when(mockRequest.getRequestURI()).thenReturn(path); + when(mockRequest.getHeaders(AuthConstants.HTTP_HEADER_AUTHORIZATION)) + .thenReturn(new Vector<>(Collections.singletonList("user")).elements()); + when(authenticator.supportsToken(any())).thenReturn(true); + when(authenticator.isDataFromToken()).thenReturn(true); + when(authenticator.authenticateToken(any())) + .thenThrow(new UnauthorizedException("UNAUTHORIZED")); + + filter.doFilter(mockRequest, mockResponse, mockChain); + + // Auth flow ran and rejected — proves these paths are not exempted. 
+ verify(mockResponse).sendError(HttpServletResponse.SC_UNAUTHORIZED, "UNAUTHORIZED"); + } + } } diff --git a/server/src/main/java/org/apache/gravitino/server/GravitinoServer.java b/server/src/main/java/org/apache/gravitino/server/GravitinoServer.java index 90a537f885..6f86755bea 100644 --- a/server/src/main/java/org/apache/gravitino/server/GravitinoServer.java +++ b/server/src/main/java/org/apache/gravitino/server/GravitinoServer.java @@ -46,6 +46,7 @@ import org.apache.gravitino.policy.PolicyDispatcher; import org.apache.gravitino.server.authentication.ServerAuthenticator; import org.apache.gravitino.server.authorization.GravitinoAuthorizerProvider; import org.apache.gravitino.server.web.ConfigServlet; +import org.apache.gravitino.server.web.HealthAliasServlet; import org.apache.gravitino.server.web.HttpServerMetricsSource; import org.apache.gravitino.server.web.JettyServer; import org.apache.gravitino.server.web.JettyServerConfig; @@ -172,6 +173,11 @@ public class GravitinoServer extends ResourceConfig { server.addServlet(servlet, API_ANY_PATH); Servlet configServlet = new ConfigServlet(serverConfig); server.addServlet(configServlet, "/configs"); + + // Root-level alias for enterprise GTMs that require probes at well-known root paths. + // Forwards /health, /health/live, /health/ready to the canonical /api/health/* endpoints. + server.addServlet(new HealthAliasServlet(), "/health/*"); + server.addCustomFilters(API_ANY_PATH); server.addFilter(new VersioningFilter(), API_ANY_PATH); server.addSystemFilters(API_ANY_PATH); diff --git a/server/src/main/java/org/apache/gravitino/server/web/HealthAliasServlet.java b/server/src/main/java/org/apache/gravitino/server/web/HealthAliasServlet.java new file mode 100644 index 0000000000..daf21b9057 --- /dev/null +++ b/server/src/main/java/org/apache/gravitino/server/web/HealthAliasServlet.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.gravitino.server.web; + +import java.io.IOException; +import javax.servlet.RequestDispatcher; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +/** + * Serves root-level health paths ({@code /health}, {@code /health/live}, {@code /health/ready}) by + * forwarding to the canonical {@code /api/health/*} endpoints. + * + * <p>This alias exists for compatibility with enterprise global traffic managers that require + * probes at well-known root paths. The canonical implementation remains at {@code /api/health}. 
+ */ +public class HealthAliasServlet extends HttpServlet { + + @Override + protected void doGet(HttpServletRequest req, HttpServletResponse resp) + throws ServletException, IOException { + // Prepend /api to the incoming path: /health → /api/health, /health/live → /api/health/live + String targetPath = "/api" + req.getRequestURI(); + RequestDispatcher dispatcher = req.getRequestDispatcher(targetPath); + if (dispatcher == null) { + resp.sendError(HttpServletResponse.SC_SERVICE_UNAVAILABLE, "health dispatcher unavailable"); + return; + } + dispatcher.forward(req, resp); + } +} diff --git a/server/src/main/java/org/apache/gravitino/server/web/rest/HealthOperations.java b/server/src/main/java/org/apache/gravitino/server/web/rest/HealthOperations.java new file mode 100644 index 0000000000..60c44aeb58 --- /dev/null +++ b/server/src/main/java/org/apache/gravitino/server/web/rest/HealthOperations.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.gravitino.server.web.rest; + +import com.codahale.metrics.annotation.ResponseMetered; +import com.codahale.metrics.annotation.Timed; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; +import javax.servlet.http.HttpServlet; +import javax.ws.rs.GET; +import javax.ws.rs.Path; +import javax.ws.rs.Produces; +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; +import org.apache.gravitino.Entity.EntityType; +import org.apache.gravitino.EntityStore; +import org.apache.gravitino.GravitinoEnv; +import org.apache.gravitino.NameIdentifier; +import org.apache.gravitino.dto.HealthCheckDTO; +import org.apache.gravitino.dto.responses.HealthResponse; +import org.apache.gravitino.metrics.MetricNames; +import org.apache.gravitino.server.ServerConfig; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Health check endpoints following MicroProfile Health semantics. Exposes separate liveness, + * readiness, and aggregate endpoints so Kubernetes probes, load balancers, and global traffic + * managers can distinguish "restart this pod" from "route traffic elsewhere." 
+ *
+ * <ul>
+ *   <li>{@code GET /api/health/live} — liveness, 200 as long as the HTTP thread can respond
+ *   <li>{@code GET /api/health/ready} — readiness, 200 when entity store is reachable
+ *   <li>{@code GET /api/health} — aggregate, 200 when both pass
+ * </ul>
+ *
+ * All endpoints return 503 with a JSON body describing the failed check(s) when unhealthy.
+ */
+@Path("/health")
+@Produces(MediaType.APPLICATION_JSON)
+public class HealthOperations extends HttpServlet {
+
+  private static final Logger LOG = LoggerFactory.getLogger(HealthOperations.class);
+
+  /** Source of the numeric suffix used in probe thread names. */
+  private static final AtomicInteger PROBE_THREAD_COUNTER = new AtomicInteger();
+
+  /** Maximum number of entity-store probes that may run concurrently. */
+  private static final int PROBE_MAX_THREADS = 4;
+
+  /** Idle probe threads are released after this many seconds. */
+  private static final long PROBE_KEEP_ALIVE_SECONDS = 60L;
+
+  /** Number of probes that may wait for a free thread before new submissions are rejected. */
+  private static final int PROBE_QUEUE_CAPACITY = 20;
+
+  /** Shared pool on which entity-store probes run, so request threads can enforce a timeout. */
+  private static final ExecutorService HEALTH_PROBE_EXECUTOR = newProbeExecutor();
+
+  /** Identifier looked up by the readiness probe; only reachability of the store matters. */
+  private static final String HEALTH_PROBE_SENTINEL = "gravitino_health_probe";
+
+  private static final String CHECK_HTTP_SERVER = "httpServer";
+  private static final String CHECK_ENTITY_STORE = "entityStore";
+
+  /**
+   * Default constructor for Jersey auto-discovery. The entity store is resolved lazily at request
+   * time via {@link #getEntityStore()} so that probes issued before {@link GravitinoEnv} has
+   * finished initializing report DOWN rather than throwing NullPointerException.
+   */
+  public HealthOperations() {}
+
+  /**
+   * Builds the bounded daemon-thread pool used for entity-store probes.
+   *
+   * <p>The core size equals the maximum size because a {@link ThreadPoolExecutor} with a bounded
+   * queue only grows beyond its core size once the queue is full. With a smaller core size a
+   * single hung probe would stall every later probe until the whole queue had filled up. Core
+   * threads are allowed to time out so an idle server does not keep the threads alive.
+   *
+   * @return the executor that runs entity-store probes
+   */
+  private static ExecutorService newProbeExecutor() {
+    ThreadPoolExecutor executor =
+        new ThreadPoolExecutor(
+            PROBE_MAX_THREADS,
+            PROBE_MAX_THREADS,
+            PROBE_KEEP_ALIVE_SECONDS,
+            TimeUnit.SECONDS,
+            new LinkedBlockingQueue<>(PROBE_QUEUE_CAPACITY),
+            r -> {
+              Thread t = new Thread(r, "health-probe-" + PROBE_THREAD_COUNTER.incrementAndGet());
+              t.setDaemon(true);
+              return t;
+            },
+            new ThreadPoolExecutor.AbortPolicy());
+    executor.allowCoreThreadTimeOut(true);
+    return executor;
+  }
+
+  /**
+   * Liveness endpoint. Always answers 200 with a single UP {@code httpServer} check; being able to
+   * produce the response is the check itself.
+   *
+   * @return a 200 response whose body lists the {@code httpServer} check
+   */
+  @GET
+  @Path("/live")
+  @Produces("application/vnd.gravitino.v1+json")
+  @Timed(name = "health.live." + MetricNames.HTTP_PROCESS_DURATION, absolute = true)
+  @ResponseMetered(name = "health.live", absolute = true)
+  public Response live() {
+    HealthCheckDTO check = up(CHECK_HTTP_SERVER, Collections.emptyMap());
+    return ok(new HealthResponse(HealthCheckDTO.Status.UP, Collections.singletonList(check)));
+  }
+
+  /**
+   * Readiness endpoint. Runs the entity-store probe and mirrors its status.
+   *
+   * @return 200 when the entity-store check is UP, 503 otherwise; the body lists that check
+   */
+  @GET
+  @Path("/ready")
+  @Produces("application/vnd.gravitino.v1+json")
+  @Timed(name = "health.ready." + MetricNames.HTTP_PROCESS_DURATION, absolute = true)
+  @ResponseMetered(name = "health.ready", absolute = true)
+  public Response ready() {
+    HealthCheckDTO entityStoreCheck = checkEntityStore();
+    HealthCheckDTO.Status overall = entityStoreCheck.getStatus();
+    HealthResponse body = new HealthResponse(overall, Collections.singletonList(entityStoreCheck));
+    return overall == HealthCheckDTO.Status.UP ? ok(body) : serviceUnavailable(body);
+  }
+
+  /**
+   * Aggregate endpoint. Reports the {@code httpServer} and {@code entityStore} checks together;
+   * the overall status is DOWN as soon as any single check is DOWN.
+   *
+   * @return 200 when every check is UP, 503 otherwise; the body lists both checks
+   */
+  @GET
+  @Produces("application/vnd.gravitino.v1+json")
+  @Timed(name = "health." + MetricNames.HTTP_PROCESS_DURATION, absolute = true)
+  @ResponseMetered(name = "health", absolute = true)
+  public Response health() {
+    List<HealthCheckDTO> checks = new ArrayList<>(2);
+    checks.add(up(CHECK_HTTP_SERVER, Collections.emptyMap()));
+    checks.add(checkEntityStore());
+
+    HealthCheckDTO.Status overall =
+        checks.stream().anyMatch(c -> c.getStatus() == HealthCheckDTO.Status.DOWN)
+            ? HealthCheckDTO.Status.DOWN
+            : HealthCheckDTO.Status.UP;
+
+    HealthResponse body = new HealthResponse(overall, checks);
+    return overall == HealthCheckDTO.Status.UP ? ok(body) : serviceUnavailable(body);
+  }
+
+  /**
+   * Probes the entity store with an {@code exists} lookup of a sentinel metalake identifier,
+   * bounded by {@link #getProbeTimeoutMs()}.
+   *
+   * <p>The lookup result itself is ignored: the check is UP whenever the call returns in time
+   * without throwing. Every failure mode is converted into a DOWN check carrying a {@code reason}
+   * detail; this method does not propagate exceptions.
+   *
+   * @return the {@code entityStore} check, UP or DOWN
+   */
+  private HealthCheckDTO checkEntityStore() {
+    EntityStore entityStore = getEntityStore();
+    if (entityStore == null) {
+      return down(CHECK_ENTITY_STORE, "reason", "entity store not initialized");
+    }
+
+    long timeoutMs = getProbeTimeoutMs();
+    // Submit a Callable directly instead of going through CompletableFuture.supplyAsync: the
+    // Future returned by the executor interrupts the worker on cancel(true), whereas
+    // CompletableFuture.cancel never interrupts the running task. A Callable may also throw
+    // IOException as-is, so no wrapper exception is needed.
+    java.util.concurrent.Future<Boolean> future;
+    try {
+      future =
+          HEALTH_PROBE_EXECUTOR.submit(
+              () ->
+                  entityStore.exists(
+                      NameIdentifier.of(HEALTH_PROBE_SENTINEL), EntityType.METALAKE));
+    } catch (RejectedExecutionException e) {
+      LOG.warn("Entity store probe rejected — health executor queue full");
+      return down(CHECK_ENTITY_STORE, "reason", "probe-rejected");
+    }
+
+    try {
+      future.get(timeoutMs, TimeUnit.MILLISECONDS);
+      return up(CHECK_ENTITY_STORE, Collections.emptyMap());
+
+    } catch (TimeoutException e) {
+      // Interrupt the worker so a stuck lookup does not keep occupying a probe thread.
+      future.cancel(true);
+      LOG.warn("Entity store probe timed out after {}ms", timeoutMs);
+      return down(CHECK_ENTITY_STORE, "reason", "timeout");
+
+    } catch (InterruptedException e) {
+      future.cancel(true);
+      // Restore the flag so the caller of this request thread still sees the interruption.
+      Thread.currentThread().interrupt();
+      LOG.warn("Entity store probe interrupted");
+      return down(CHECK_ENTITY_STORE, "reason", "interrupted");
+
+    } catch (ExecutionException e) {
+      Throwable cause = e.getCause() != null ? e.getCause() : e;
+      // When the failure is a RuntimeException wrapping another exception, report the wrapped
+      // one: it names the underlying problem more precisely than the wrapper does.
+      if (cause instanceof RuntimeException && cause.getCause() != null) {
+        cause = cause.getCause();
+      }
+      // toString() keeps this to one log line; passing the Throwable would log a stack trace.
+      LOG.warn("Entity store probe failed: {}", cause.toString());
+      return down(CHECK_ENTITY_STORE, "reason", cause.getClass().getSimpleName());
+
+    } catch (Exception e) {
+      LOG.warn("Entity store probe encountered unexpected error", e);
+      return down(CHECK_ENTITY_STORE, "reason", e.getClass().getSimpleName());
+    }
+  }
+
+  /**
+   * Visible for testing — subclasses override to inject a mock entity store.
+   *
+   * @return the entity store from {@link GravitinoEnv}, or {@code null} when resolving it throws
+   */
+  EntityStore getEntityStore() {
+    try {
+      return GravitinoEnv.getInstance().entityStore();
+    } catch (Exception e) {
+      LOG.debug("Unable to resolve entity store from GravitinoEnv", e);
+      return null;
+    }
+  }
+
+  /**
+   * Visible for testing — subclasses override to inject a custom timeout.
+   *
+   * @return the configured probe timeout in milliseconds, or the configuration entry's default
+   *     value when the configuration cannot be read
+   */
+  long getProbeTimeoutMs() {
+    try {
+      return GravitinoEnv.getInstance()
+          .config()
+          .get(ServerConfig.HEALTH_ENTITY_STORE_PROBE_TIMEOUT_MS);
+    } catch (Exception e) {
+      // Best effort: a probe must still be answerable when the configuration is unavailable.
+      LOG.debug("Unable to resolve probe timeout from GravitinoEnv, using the default", e);
+      return ServerConfig.HEALTH_ENTITY_STORE_PROBE_TIMEOUT_MS.getDefaultValue();
+    }
+  }
+
+  /** Creates an UP check with the given name and details. */
+  private static HealthCheckDTO up(String name, Map<String, String> details) {
+    return new HealthCheckDTO(name, HealthCheckDTO.Status.UP, details);
+  }
+
+  /** Creates a DOWN check with the given name and a single detail entry. */
+  private static HealthCheckDTO down(String name, String detailKey, String detailValue) {
+    return new HealthCheckDTO(
+        name, HealthCheckDTO.Status.DOWN, Collections.singletonMap(detailKey, detailValue));
+  }
+
+  /** Wraps the body in a 200 response typed as {@code application/json}. */
+  private static Response ok(HealthResponse body) {
+    return Response.status(Response.Status.OK)
+        .entity(body)
+        .type(MediaType.APPLICATION_JSON)
+        .build();
+  }
+
+  /** Wraps the body in a 503 response typed as {@code application/json}. */
+  private static Response serviceUnavailable(HealthResponse body) {
+    return Response.status(Response.Status.SERVICE_UNAVAILABLE)
+        .entity(body)
+        .type(MediaType.APPLICATION_JSON)
+        .build();
+  }
+}
diff --git a/server/src/test/java/org/apache/gravitino/server/web/TestHealthAliasServlet.java b/server/src/test/java/org/apache/gravitino/server/web/TestHealthAliasServlet.java
new file mode 100644
index 0000000000..876af0111b
--- /dev/null
+++ b/server/src/test/java/org/apache/gravitino/server/web/TestHealthAliasServlet.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.server.web;
+
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import javax.servlet.RequestDispatcher;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
+
+/**
+ * Tests {@link HealthAliasServlet#doGet}: requests are forwarded to the dispatcher obtained for
+ * the matching {@code /api/health} path, and a missing dispatcher is answered with a 503 error.
+ */
+public class TestHealthAliasServlet {
+
+  /** Each alias URI must be forwarded through the dispatcher registered for its target path. */
+  @ParameterizedTest
+  @CsvSource({
+    "/health, /api/health",
+    "/health/live, /api/health/live",
+    "/health/ready, /api/health/ready"
+  })
+  public void testDoGetForwardsToCanonicalPath(String incoming, String expected) throws Exception {
+    HealthAliasServlet aliasServlet = new HealthAliasServlet();
+    RequestDispatcher canonicalDispatcher = mock(RequestDispatcher.class);
+    HttpServletRequest request =
+        stubRequest(incoming.strip(), expected.strip(), canonicalDispatcher);
+    HttpServletResponse response = mock(HttpServletResponse.class);
+
+    aliasServlet.doGet(request, response);
+
+    verify(canonicalDispatcher).forward(request, response);
+  }
+
+  /** With no dispatcher available for the target path, the servlet must send a 503 error. */
+  @Test
+  public void testDoGetReturns503WhenDispatcherIsNull() throws Exception {
+    HealthAliasServlet aliasServlet = new HealthAliasServlet();
+    HttpServletRequest request = stubRequest("/health", "/api/health", null);
+    HttpServletResponse response = mock(HttpServletResponse.class);
+
+    aliasServlet.doGet(request, response);
+
+    verify(response)
+        .sendError(HttpServletResponse.SC_SERVICE_UNAVAILABLE, "health dispatcher unavailable");
+  }
+
+  /**
+   * Creates a mocked request that reports {@code requestUri} as its URI and hands out {@code
+   * dispatcher} (possibly {@code null}) when asked for a dispatcher to {@code targetPath}.
+   */
+  private static HttpServletRequest stubRequest(
+      String requestUri, String targetPath, RequestDispatcher dispatcher) {
+    HttpServletRequest request = mock(HttpServletRequest.class);
+    when(request.getRequestURI()).thenReturn(requestUri);
+    when(request.getRequestDispatcher(targetPath)).thenReturn(dispatcher);
+    return request;
+  }
+}
diff --git a/server/src/test/java/org/apache/gravitino/server/web/rest/TestHealthOperations.java b/server/src/test/java/org/apache/gravitino/server/web/rest/TestHealthOperations.java
new file mode 100644
index 0000000000..9f5262a2fb
--- /dev/null
+++ b/server/src/test/java/org/apache/gravitino/server/web/rest/TestHealthOperations.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.server.web.rest;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+import java.util.concurrent.CountDownLatch;
+import javax.ws.rs.core.Response;
+import org.apache.gravitino.EntityStore;
+import org.apache.gravitino.dto.HealthCheckDTO;
+import org.apache.gravitino.dto.responses.HealthResponse;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+/**
+ * Unit tests for {@link HealthOperations}. The resource methods are called directly on a subclass
+ * that overrides the entity-store and timeout lookups, so no running server is needed.
+ */
+public class TestHealthOperations {
+
+  /** Creates a resource backed by {@code store} with a 2000 ms probe timeout. */
+  private HealthOperations newOps(EntityStore store) {
+    return newOps(store, 2000L);
+  }
+
+  /**
+   * Creates a resource whose entity store and probe timeout are fixed to the given values.
+   *
+   * @param store the entity store to probe, or {@code null} to simulate an uninitialized store
+   * @param probeTimeoutMs the probe timeout in milliseconds
+   */
+  private HealthOperations newOps(EntityStore store, long probeTimeoutMs) {
+    return new HealthOperations() {
+      @Override
+      EntityStore getEntityStore() {
+        return store;
+      }
+
+      @Override
+      long getProbeTimeoutMs() {
+        return probeTimeoutMs;
+      }
+    };
+  }
+
+  @Test
+  public void testLiveReturns200WithUpStatus() {
+    HealthOperations ops = newOps(null);
+    Response response = ops.live();
+    assertEquals(200, response.getStatus());
+    HealthResponse body = (HealthResponse) response.getEntity();
+    assertEquals(HealthCheckDTO.Status.UP, body.getStatus());
+    assertEquals(1, body.getChecks().size());
+    assertEquals("httpServer", body.getChecks().get(0).getName());
+  }
+
+  @Test
+  public void testReadyReturns503WhenEntityStoreNotInitialized() {
+    HealthOperations ops = newOps(null);
+    Response response = ops.ready();
+    assertEquals(503, response.getStatus());
+    HealthResponse body = (HealthResponse) response.getEntity();
+    assertEquals(HealthCheckDTO.Status.DOWN, body.getStatus());
+    assertEquals("entityStore", body.getChecks().get(0).getName());
+    assertNotNull(body.getChecks().get(0).getDetails().get("reason"));
+  }
+
+  @Test
+  public void testReadyReturns200WhenEntityStoreReachable() throws IOException {
+    EntityStore store = Mockito.mock(EntityStore.class);
+    Mockito.when(store.exists(Mockito.any(), Mockito.any())).thenReturn(false);
+    HealthOperations ops = newOps(store);
+    Response response = ops.ready();
+    assertEquals(200, response.getStatus());
+    HealthResponse body = (HealthResponse) response.getEntity();
+    assertEquals(HealthCheckDTO.Status.UP, body.getStatus());
+  }
+
+  @Test
+  public void testReadyReturns503WhenEntityStoreThrows() throws IOException {
+    EntityStore store = Mockito.mock(EntityStore.class);
+    Mockito.when(store.exists(Mockito.any(), Mockito.any()))
+        .thenThrow(new IOException("connection refused"));
+    HealthOperations ops = newOps(store);
+    Response response = ops.ready();
+    assertEquals(503, response.getStatus());
+    HealthResponse body = (HealthResponse) response.getEntity();
+    assertEquals(HealthCheckDTO.Status.DOWN, body.getStatus());
+    assertEquals("IOException", body.getChecks().get(0).getDetails().get("reason"));
+  }
+
+  @Test
+  public void testAggregateReturns200WhenAllChecksPass() throws IOException {
+    EntityStore store = Mockito.mock(EntityStore.class);
+    Mockito.when(store.exists(Mockito.any(), Mockito.any())).thenReturn(false);
+    HealthOperations ops = newOps(store);
+    Response response = ops.health();
+    assertEquals(200, response.getStatus());
+    HealthResponse body = (HealthResponse) response.getEntity();
+    assertEquals(2, body.getChecks().size());
+    assertTrue(body.isUp());
+  }
+
+  @Test
+  public void testAggregateReturns503WhenEntityStoreDown() {
+    HealthOperations ops = newOps(null);
+    Response response = ops.health();
+    assertEquals(503, response.getStatus());
+    HealthResponse body = (HealthResponse) response.getEntity();
+    assertEquals(HealthCheckDTO.Status.DOWN, body.getStatus());
+    assertEquals(2, body.getChecks().size());
+  }
+
+  @Test
+  public void testReadyReturns503WhenEntityStoreTimesOut() throws IOException {
+    // Block the probe on a latch instead of a fixed sleep: the probe cannot finish before the
+    // 100 ms timeout no matter how loaded the CI host is.
+    CountDownLatch probeBlocker = new CountDownLatch(1);
+    EntityStore store = Mockito.mock(EntityStore.class);
+    Mockito.when(store.exists(Mockito.any(), Mockito.any()))
+        .thenAnswer(
+            invocation -> {
+              probeBlocker.await();
+              return false;
+            });
+    HealthOperations ops = newOps(store, 100L);
+    try {
+      Response response = ops.ready();
+      assertEquals(503, response.getStatus());
+      HealthResponse body = (HealthResponse) response.getEntity();
+      assertEquals(HealthCheckDTO.Status.DOWN, body.getStatus());
+      assertEquals("timeout", body.getChecks().get(0).getDetails().get("reason"));
+    } finally {
+      // Release the blocked probe even when an assertion fails; otherwise its worker thread stays
+      // parked on the executor shared by all HealthOperations instances and can stall the probes
+      // issued by later tests.
+      probeBlocker.countDown();
+    }
+  }
+
+  @Test
+  public void testValidateThrowsWhenStatusIsNull() {
+    HealthResponse response = new HealthResponse();
+    assertThrows(IllegalArgumentException.class, response::validate);
+  }
+}
