diff --git a/changelog/unreleased/SOLR-16458-migrate-node-health-api.yml b/changelog/unreleased/SOLR-16458-migrate-node-health-api.yml
new file mode 100644
index 000000000000..59dc710934dd
--- /dev/null
+++ b/changelog/unreleased/SOLR-16458-migrate-node-health-api.yml
@@ -0,0 +1,8 @@
+title: "SolrJ now offers a SolrRequest class allowing users to perform v2 single-node healthchecks: NodeApi.Healthcheck"
+type: added
+authors:
+ - name: Eric Pugh
+ - name: Jason Gerlowski
+links:
+ - name: SOLR-16458
+ url: https://issues.apache.org/jira/browse/SOLR-16458
diff --git a/solr/api/src/java/org/apache/solr/client/api/endpoint/NodeHealthApi.java b/solr/api/src/java/org/apache/solr/client/api/endpoint/NodeHealthApi.java
new file mode 100644
index 000000000000..38ce0a20c9b8
--- /dev/null
+++ b/solr/api/src/java/org/apache/solr/client/api/endpoint/NodeHealthApi.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.client.api.endpoint;
+
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.Parameter;
+import jakarta.ws.rs.GET;
+import jakarta.ws.rs.Path;
+import jakarta.ws.rs.QueryParam;
+import org.apache.solr.client.api.model.NodeHealthResponse;
+
+/**
+ * V2 API definition for checking the health of a Solr node.
+ *
+ * <p>Declares a single {@code GET /node/health} operation whose outcome is reported as a {@link
+ * NodeHealthResponse}.
+ */
+@Path("/node/health")
+public interface NodeHealthApi {
+
+  /**
+   * Reports whether the receiving node considers itself healthy.
+   *
+   * @param requireHealthyCores optional flag; when {@code true} the check additionally requires the
+   *     cores hosted on the node to be healthy
+   * @param maxGenerationLag optional maximum number of index generations a follower may lag behind
+   *     its leader
+   * @return the node's health status, with an optional explanatory message
+   */
+  @GET
+  @Operation(
+      summary = "Determine the health of a Solr node.",
+      tags = {"node"})
+  NodeHealthResponse healthcheck(
+      @Parameter(
+              description =
+                  "Whether the health check should also require every core hosted on the node to"
+                      + " be healthy. Only relevant when running in SolrCloud mode.")
+          @QueryParam("requireHealthyCores")
+          Boolean requireHealthyCores,
+      @Parameter(
+              description =
+                  "Maximum number of index generations a follower replica may lag behind its"
+                      + " leader before the health check reports FAILURE. Only relevant when"
+                      + " running in Standalone mode with leader/follower replication.")
+          @QueryParam("maxGenerationLag")
+          Integer maxGenerationLag);
+}
diff --git a/solr/api/src/java/org/apache/solr/client/api/model/NodeHealthResponse.java b/solr/api/src/java/org/apache/solr/client/api/model/NodeHealthResponse.java
new file mode 100644
index 000000000000..a0be8723b98a
--- /dev/null
+++ b/solr/api/src/java/org/apache/solr/client/api/model/NodeHealthResponse.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.client.api.model;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * Response body for the '/api/node/health' endpoint.
+ *
+ * <p>Carries the overall node status plus optional detail fields. All fields are public and
+ * annotated with {@code @JsonProperty}.
+ */
+public class NodeHealthResponse extends SolrJerseyResponse {
+
+ /** The possible health statuses for a Solr node. */
+ public enum NodeStatus {
+ OK,
+ FAILURE
+ }
+
+ /** Overall health verdict for the node. */
+ @JsonProperty public NodeStatus status;
+
+ /** Human-readable explanation accompanying {@link #status}; may be left unset. */
+ @JsonProperty public String message;
+
+ /** Number of unhealthy cores on the node; uses the JSON name {@code num_cores_unhealthy}. */
+ @JsonProperty("num_cores_unhealthy")
+ public Integer numCoresUnhealthy;
+}
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java
index 1ecf959e49ed..1dab5d1d9778 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java
@@ -17,39 +17,21 @@
package org.apache.solr.handler.admin;
-import static org.apache.solr.common.params.CommonParams.FAILURE;
-import static org.apache.solr.common.params.CommonParams.OK;
-import static org.apache.solr.common.params.CommonParams.STATUS;
-import static org.apache.solr.handler.admin.api.ReplicationAPIBase.GENERATION;
-
-import java.lang.invoke.MethodHandles;
-import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.List;
-import java.util.Locale;
-import java.util.stream.Collectors;
-import org.apache.lucene.index.IndexCommit;
-import org.apache.solr.api.AnnotatedApi;
import org.apache.solr.api.Api;
+import org.apache.solr.api.JerseyResource;
+import org.apache.solr.client.api.model.NodeHealthResponse;
import org.apache.solr.client.solrj.request.HealthCheckRequest;
-import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.common.SolrException;
-import org.apache.solr.common.cloud.ClusterState;
-import org.apache.solr.common.cloud.Replica.State;
-import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.CoreContainer;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.handler.IndexFetcher;
-import org.apache.solr.handler.ReplicationHandler;
import org.apache.solr.handler.RequestHandlerBase;
-import org.apache.solr.handler.admin.api.NodeHealthAPI;
+import org.apache.solr.handler.admin.api.NodeHealth;
+import org.apache.solr.handler.api.V2ApiUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.security.AuthorizationContext;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
/**
* Health Check Handler for reporting the health of a specific node.
@@ -77,12 +59,13 @@
* specify the acceptable generation lag follower should be with respect to its leader using the
* maxGenerationLag=<max_generation_lag> request parameter. If
* maxGenerationLag is not provided then health check would simply return OK.
+ *
+ * <p>All health-check logic lives in the v2 {@link NodeHealth}; this handler is a thin v1 bridge
+ * that extracts request parameters and delegates.
*/
public class HealthCheckHandler extends RequestHandlerBase {
- private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final String PARAM_REQUIRE_HEALTHY_CORES = "requireHealthyCores";
- private static final List UNHEALTHY_STATES = Arrays.asList(State.DOWN, State.RECOVERING);
CoreContainer coreContainer;
@@ -100,224 +83,18 @@ public CoreContainer getCoreContainer() {
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
rsp.setHttpCaching(false);
-
- // Core container should not be null and active (redundant check)
- if (coreContainer == null || coreContainer.isShutDown()) {
- rsp.setException(
- new SolrException(
- SolrException.ErrorCode.SERVER_ERROR,
- "CoreContainer is either not initialized or shutting down"));
- return;
- }
- if (!coreContainer.isZooKeeperAware()) {
- if (log.isDebugEnabled()) {
- log.debug("Invoked HealthCheckHandler in legacy mode.");
- }
- healthCheckLegacyMode(req, rsp);
- } else {
- if (log.isDebugEnabled()) {
- log.debug(
- "Invoked HealthCheckHandler in cloud mode on [{}]",
- this.coreContainer.getZkController().getNodeName());
- }
- healthCheckCloudMode(req, rsp);
- }
- }
-
- private void healthCheckCloudMode(SolrQueryRequest req, SolrQueryResponse rsp) {
- ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader();
- ClusterState clusterState = zkStateReader.getClusterState();
- // Check for isConnected and isClosed
- if (zkStateReader.getZkClient().isClosed() || !zkStateReader.getZkClient().isConnected()) {
- rsp.add(STATUS, FAILURE);
- rsp.setException(
- new SolrException(
- SolrException.ErrorCode.SERVICE_UNAVAILABLE,
- "Host Unavailable: Not connected to zk"));
- return;
- }
-
- // Fail if not in live_nodes
- if (!clusterState.getLiveNodes().contains(coreContainer.getZkController().getNodeName())) {
- rsp.add(STATUS, FAILURE);
- rsp.setException(
- new SolrException(
- SolrException.ErrorCode.SERVICE_UNAVAILABLE,
- "Host Unavailable: Not in live nodes as per zk"));
- return;
- }
-
- // Optionally require that all cores on this node are active if param 'requireHealthyCores=true'
- if (req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES, false)) {
- if (!coreContainer.isStatusLoadComplete()) {
- rsp.add(STATUS, FAILURE);
- rsp.setException(
- new SolrException(
- SolrException.ErrorCode.SERVICE_UNAVAILABLE,
- "Host Unavailable: Core Loading not complete"));
- return;
- }
- Collection coreDescriptors =
- coreContainer.getCoreDescriptors().stream()
- .map(cd -> cd.getCloudDescriptor())
- .collect(Collectors.toList());
- long unhealthyCores = findUnhealthyCores(coreDescriptors, clusterState);
- if (unhealthyCores > 0) {
- rsp.add(STATUS, FAILURE);
- rsp.add("num_cores_unhealthy", unhealthyCores);
- rsp.setException(
- new SolrException(
- SolrException.ErrorCode.SERVICE_UNAVAILABLE,
- unhealthyCores
- + " out of "
- + coreContainer.getNumAllCores()
- + " replicas are currently initializing or recovering"));
- return;
- }
- rsp.add("message", "All cores are healthy");
- }
-
- // All lights green, report healthy
- rsp.add(STATUS, OK);
- }
-
- private void healthCheckLegacyMode(SolrQueryRequest req, SolrQueryResponse rsp) {
- Integer maxGenerationLag = req.getParams().getInt(HealthCheckRequest.PARAM_MAX_GENERATION_LAG);
- List laggingCoresInfo = new ArrayList<>();
- boolean allCoresAreInSync = true;
-
- // check only if max generation lag is specified
- if (maxGenerationLag != null) {
- // if is not negative
- if (maxGenerationLag < 0) {
- log.error("Invalid value for maxGenerationLag:[{}]", maxGenerationLag);
- rsp.add(
- "message",
- String.format(Locale.ROOT, "Invalid value of maxGenerationLag:%s", maxGenerationLag));
- rsp.add(STATUS, FAILURE);
- } else {
- for (SolrCore core : coreContainer.getCores()) {
- ReplicationHandler replicationHandler =
- (ReplicationHandler) core.getRequestHandler(ReplicationHandler.PATH);
- if (replicationHandler.isFollower()) {
- boolean isCoreInSync =
- isWithinGenerationLag(core, replicationHandler, maxGenerationLag, laggingCoresInfo);
-
- allCoresAreInSync &= isCoreInSync;
- }
- }
- }
- if (allCoresAreInSync) {
- rsp.add(
- "message",
- String.format(
- Locale.ROOT,
- "All the followers are in sync with leader (within maxGenerationLag: %d) "
- + "or the cores are acting as leader",
- maxGenerationLag));
- rsp.add(STATUS, OK);
- } else {
- rsp.add(
- "message",
- String.format(
- Locale.ROOT,
- "Cores violating maxGenerationLag:%d.%n%s",
- maxGenerationLag,
- String.join(",\n", laggingCoresInfo)));
- rsp.add(STATUS, FAILURE);
- }
- } else { // if maxGeneration lag is not specified (is null) we aren't checking for lag
- rsp.add(
- "message",
- "maxGenerationLag isn't specified. Followers aren't "
- + "checking for the generation lag from the leaders");
- rsp.add(STATUS, OK);
- }
- }
-
- private boolean isWithinGenerationLag(
- final SolrCore core,
- ReplicationHandler replicationHandler,
- int maxGenerationLag,
- List laggingCoresInfo) {
- IndexFetcher indexFetcher = null;
+ final Boolean requireHealthyCores = req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES);
+ final Integer maxGenerationLag =
+ req.getParams().getInt(HealthCheckRequest.PARAM_MAX_GENERATION_LAG);
try {
- // may not be the best way to get leader's replicableCommit
- NamedList> follower = (NamedList>) replicationHandler.getInitArgs().get("follower");
-
- indexFetcher = new IndexFetcher(follower, replicationHandler, core);
-
- NamedList> replicableCommitOnLeader = indexFetcher.getLatestVersion();
- long leaderGeneration = (Long) replicableCommitOnLeader.get(GENERATION);
-
- // Get our own commit and generation from the commit
- IndexCommit commit = core.getDeletionPolicy().getLatestCommit();
- if (commit != null) {
- long followerGeneration = commit.getGeneration();
- long generationDiff = leaderGeneration - followerGeneration;
-
- // generationDiff shouldn't be negative except for some edge cases, log it. Some scenarios
- // are
- // 1) commit generation rolls over Long.MAX_VALUE (really unlikely)
- // 2) Leader's index is wiped clean and the follower is still showing commit generation
- // from the old index
- if (generationDiff < 0) {
- log.warn("core:[{}], generation lag:[{}] is negative.");
- } else if (generationDiff < maxGenerationLag) {
- log.info(
- "core:[{}] generation lag is above acceptable threshold:[{}], "
- + "generation lag:[{}], leader generation:[{}], follower generation:[{}]",
- core,
- maxGenerationLag,
- generationDiff,
- leaderGeneration,
- followerGeneration);
-
- laggingCoresInfo.add(
- String.format(
- Locale.ROOT,
- "Core %s is lagging by %d generations",
- core.getName(),
- generationDiff));
- return true;
- }
- }
- } catch (Exception e) {
- log.error("Failed to check if the follower is in sync with the leader", e);
- } finally {
- if (indexFetcher != null) {
- indexFetcher.destroy();
- }
+ V2ApiUtils.squashIntoSolrResponseWithoutHeader(
+ rsp, new NodeHealth(coreContainer).healthcheck(requireHealthyCores, maxGenerationLag));
+ } catch (SolrException e) {
+ final NodeHealthResponse failureResponse = new NodeHealthResponse();
+ failureResponse.status = NodeHealthResponse.NodeStatus.FAILURE;
+ V2ApiUtils.squashIntoSolrResponseWithoutHeader(rsp, failureResponse);
+ rsp.setException(e);
}
- return false;
- }
-
- /**
- * Find replicas DOWN or RECOVERING, or replicas in clusterstate that do not exist on local node.
- * We first find local cores which are either not registered or unhealthy, and check each of these
- * against the clusterstate, and return a count of unhealthy replicas
- *
- * @param cores list of core cloud descriptors to iterate
- * @param clusterState clusterstate from ZK
- * @return number of unhealthy cores, either in DOWN or RECOVERING state
- */
- static long findUnhealthyCores(Collection cores, ClusterState clusterState) {
- return cores.stream()
- .filter(
- c ->
- !c.hasRegistered()
- || UNHEALTHY_STATES.contains(c.getLastPublished())) // Find candidates locally
- .filter(
- c ->
- clusterState.hasCollection(
- c.getCollectionName())) // Only care about cores for actual collections
- .filter(
- c ->
- clusterState
- .getCollection(c.getCollectionName())
- .getActiveSlicesMap()
- .containsKey(c.getShardId()))
- .count();
}
@Override
@@ -337,7 +114,12 @@ public Boolean registerV2() {
@Override
public Collection getApis() {
- return AnnotatedApi.getApis(new NodeHealthAPI(this));
+ return List.of();
+ }
+
+ /**
+  * Lists the Jersey resource classes that serve this handler's v2 API.
+  *
+  * @return an immutable single-element collection containing {@link NodeHealth}
+  */
+ @Override
+ public Collection<Class<? extends JerseyResource>> getJerseyResources() {
+   return List.of(NodeHealth.class);
+ }
@Override
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealth.java b/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealth.java
new file mode 100644
index 000000000000..de207f334d1b
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealth.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.admin.api;
+
+import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.FAILURE;
+import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.OK;
+import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
+import static org.apache.solr.common.SolrException.ErrorCode.SERVICE_UNAVAILABLE;
+import static org.apache.solr.handler.admin.api.ReplicationAPIBase.GENERATION;
+import static org.apache.solr.security.PermissionNameProvider.Name.HEALTH_PERM;
+
+import jakarta.inject.Inject;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Locale;
+import java.util.stream.Collectors;
+import org.apache.lucene.index.IndexCommit;
+import org.apache.solr.api.JerseyResource;
+import org.apache.solr.client.api.endpoint.NodeHealthApi;
+import org.apache.solr.client.api.model.NodeHealthResponse;
+import org.apache.solr.cloud.CloudDescriptor;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.cloud.ClusterState;
+import org.apache.solr.common.cloud.Replica.State;
+import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.CoreDescriptor;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.IndexFetcher;
+import org.apache.solr.handler.ReplicationHandler;
+import org.apache.solr.jersey.PermissionName;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * V2 API for checking the health of the receiving node.
+ *
+ * <p>This API (GET /v2/node/health) is analogous to the v1 /admin/info/health.
+ *
+ * <p>The v1 {@link org.apache.solr.handler.admin.HealthCheckHandler} delegates to this class.
+ */
+public class NodeHealth extends JerseyResource implements NodeHealthApi {
+
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ /** Last-published replica states that make a core a candidate for being counted as unhealthy. */
+ private static final List<State> UNHEALTHY_STATES = Arrays.asList(State.DOWN, State.RECOVERING);
+
+ /** Container whose state is inspected by the health check; may be null (rejected at call time). */
+ private final CoreContainer coreContainer;
+
+ /**
+  * Creates the health-check resource.
+  *
+  * @param coreContainer the container to inspect; supplied through injection or directly by a
+  *     caller
+  */
+ @Inject
+ public NodeHealth(CoreContainer coreContainer) {
+   this.coreContainer = coreContainer;
+ }
+
+ /**
+  * Checks the health of this node, choosing the cloud or standalone check based on whether the
+  * core container is ZooKeeper-aware.
+  *
+  * @param requireHealthyCores only consulted in cloud mode; when {@code true} the cores on this
+  *     node must also be healthy
+  * @param maxGenerationLag only consulted in standalone mode; maximum tolerated follower lag
+  * @return the populated health response
+  * @throws SolrException with {@code SERVER_ERROR} if the core container is null or shut down;
+  *     the cloud-mode check may also throw {@code SERVICE_UNAVAILABLE}
+  */
+ @Override
+ @PermissionName(HEALTH_PERM)
+ public NodeHealthResponse healthcheck(Boolean requireHealthyCores, Integer maxGenerationLag) {
+   final boolean containerUnavailable = coreContainer == null || coreContainer.isShutDown();
+   if (containerUnavailable) {
+     throw new SolrException(
+         SERVER_ERROR, "CoreContainer is either not initialized or shutting down");
+   }
+
+   final NodeHealthResponse healthResponse = instantiateJerseyResponse(NodeHealthResponse.class);
+
+   if (coreContainer.isZooKeeperAware()) {
+     if (log.isDebugEnabled()) {
+       log.debug(
+           "Invoked HealthCheckHandler in cloud mode on [{}]",
+           coreContainer.getZkController().getNodeName());
+     }
+     healthCheckCloudMode(healthResponse, requireHealthyCores);
+     return healthResponse;
+   }
+
+   // Not ZooKeeper-aware: run the standalone (leader/follower) check instead.
+   if (log.isDebugEnabled()) {
+     log.debug("Invoked HealthCheckHandler in legacy mode.");
+   }
+   healthCheckStandaloneMode(healthResponse, maxGenerationLag);
+   return healthResponse;
+ }
+
+ /**
+  * Runs the SolrCloud health check and records a successful outcome on {@code response}.
+  *
+  * <p>The cluster state is obtained through {@link #getClusterState()}, which itself fails when
+  * the node is disconnected from ZooKeeper or missing from the live nodes. Core health is only
+  * examined when {@code requireHealthyCores} is {@code true}; {@code null} is treated as false.
+  *
+  * @param response response object to populate; its status is set to OK on success
+  * @param requireHealthyCores whether core loading must be complete and no core may be unhealthy
+  * @throws SolrException with {@code SERVICE_UNAVAILABLE} when core loading is incomplete or at
+  *     least one core is unhealthy
+  */
+ private void healthCheckCloudMode(NodeHealthResponse response, Boolean requireHealthyCores) {
+   ClusterState clusterState = getClusterState();
+
+   if (Boolean.TRUE.equals(requireHealthyCores)) {
+     if (!coreContainer.isStatusLoadComplete()) {
+       throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Core Loading not complete");
+     }
+     Collection<CloudDescriptor> coreDescriptors =
+         coreContainer.getCoreDescriptors().stream()
+             .map(CoreDescriptor::getCloudDescriptor)
+             .collect(Collectors.toList());
+     int unhealthyCores = findUnhealthyCores(coreDescriptors, clusterState);
+     if (unhealthyCores > 0) {
+       // NOTE(review): the exception below propagates out of healthcheck(), so this response
+       // object is never returned; confirm whether the count is meant to reach clients.
+       response.numCoresUnhealthy = unhealthyCores;
+       throw new SolrException(
+           SERVICE_UNAVAILABLE,
+           unhealthyCores
+               + " out of "
+               + coreContainer.getNumAllCores()
+               + " replicas are currently initializing or recovering");
+     }
+     response.message = "All cores are healthy";
+   }
+
+   response.status = OK;
+ }
+
+ /**
+  * Returns the cluster state after verifying this node's ZooKeeper connectivity and liveness.
+  *
+  * @return the cluster state read from this node's {@link ZkStateReader}
+  * @throws SolrException with {@code SERVICE_UNAVAILABLE} if the ZooKeeper client is closed or
+  *     disconnected, or if this node is not listed among the live nodes
+  */
+ private ClusterState getClusterState() {
+   final ZkStateReader reader = coreContainer.getZkController().getZkStateReader();
+   final ClusterState state = reader.getClusterState();
+
+   final boolean zkClosed = reader.getZkClient().isClosed();
+   if (zkClosed || !reader.getZkClient().isConnected()) {
+     throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Not connected to zk");
+   }
+
+   final String nodeName = coreContainer.getZkController().getNodeName();
+   final boolean listedAsLive = state.getLiveNodes().contains(nodeName);
+   if (!listedAsLive) {
+     throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Not in live nodes as per zk");
+   }
+   return state;
+ }
+
+ /**
+  * Runs the standalone (non-ZooKeeper) health check and records the outcome on {@code response}.
+  *
+  * <p>Without a {@code maxGenerationLag} the node is reported OK and no lag check is performed. A
+  * negative value is reported as FAILURE. Otherwise every core whose replication handler is a
+  * follower is compared against its leader, and the node is OK only if all of them are in sync.
+  *
+  * @param response response object whose status and message are populated
+  * @param maxGenerationLag maximum tolerated generation lag, or {@code null} to skip the check
+  */
+ private void healthCheckStandaloneMode(NodeHealthResponse response, Integer maxGenerationLag) {
+   if (maxGenerationLag == null) {
+     // No threshold supplied: generation lag is not checked at all.
+     response.message =
+         "maxGenerationLag isn't specified. Followers aren't "
+             + "checking for the generation lag from the leaders";
+     response.status = OK;
+     return;
+   }
+
+   if (maxGenerationLag < 0) {
+     log.error("Invalid value for maxGenerationLag:[{}]", maxGenerationLag);
+     response.message =
+         String.format(Locale.ROOT, "Invalid value of maxGenerationLag:%s", maxGenerationLag);
+     response.status = FAILURE;
+     return;
+   }
+
+   List<String> laggingCoresInfo = new ArrayList<>();
+   boolean allCoresAreInSync = true;
+   for (SolrCore core : coreContainer.getCores()) {
+     ReplicationHandler replicationHandler =
+         (ReplicationHandler) core.getRequestHandler(ReplicationHandler.PATH);
+     if (replicationHandler.isFollower()) {
+       // Non-short-circuit accumulation: every follower is checked so all laggards are reported.
+       boolean isCoreInSync =
+           isWithinGenerationLag(core, replicationHandler, maxGenerationLag, laggingCoresInfo);
+       allCoresAreInSync &= isCoreInSync;
+     }
+   }
+
+   if (allCoresAreInSync) {
+     response.message =
+         String.format(
+             Locale.ROOT,
+             "All the followers are in sync with leader (within maxGenerationLag: %d) "
+                 + "or the cores are acting as leader",
+             maxGenerationLag);
+     response.status = OK;
+   } else {
+     response.message =
+         String.format(
+             Locale.ROOT,
+             "Cores violating maxGenerationLag:%d.%n%s",
+             maxGenerationLag,
+             String.join(",\n", laggingCoresInfo));
+     response.status = FAILURE;
+   }
+ }
+
+ /**
+  * Decides whether a follower core is within {@code maxGenerationLag} generations of its leader.
+  *
+  * <p>The leader's generation is fetched through a temporary {@link IndexFetcher}, which is always
+  * destroyed before returning. A negative lag or any exception during the check counts as "not in
+  * sync"; in those two cases nothing is added to {@code laggingCoresInfo}.
+  *
+  * @param core the follower core to inspect
+  * @param replicationHandler the core's replication handler, source of the "follower" init args
+  * @param maxGenerationLag maximum tolerated difference between leader and follower generations
+  * @param laggingCoresInfo receives a description of the core when it exceeds the allowed lag
+  * @return {@code true} if the lag is between 0 and {@code maxGenerationLag} inclusive, or if the
+  *     core has no latest commit; {@code false} otherwise
+  */
+ private boolean isWithinGenerationLag(
+     final SolrCore core,
+     ReplicationHandler replicationHandler,
+     int maxGenerationLag,
+     List<String> laggingCoresInfo) {
+   IndexFetcher indexFetcher = null;
+   try {
+     // may not be the best way to get leader's replicableCommit; NamedList is unavoidable here
+     // as it is the init-args format used by ReplicationHandler
+     NamedList<?> follower = (NamedList<?>) replicationHandler.getInitArgs().get("follower");
+     indexFetcher = new IndexFetcher(follower, replicationHandler, core);
+     // getLatestVersion() returns a NamedList from the IndexFetcher network API
+     NamedList<?> replicableCommitOnLeader = indexFetcher.getLatestVersion();
+     long leaderGeneration = (Long) replicableCommitOnLeader.get(GENERATION);
+
+     // Get our own commit and generation from the commit
+     IndexCommit commit = core.getDeletionPolicy().getLatestCommit();
+     if (commit != null) {
+       long followerGeneration = commit.getGeneration();
+       long generationDiff = leaderGeneration - followerGeneration;
+
+       // generationDiff shouldn't be negative except for some edge cases, log it. Some scenarios
+       // are:
+       // 1) commit generation rolls over Long.MAX_VALUE (really unlikely)
+       // 2) Leader's index is wiped clean and the follower is still showing commit generation
+       // from the old index
+       if (generationDiff < 0) {
+         log.warn("core:[{}], generation lag:[{}] is negative.", core, generationDiff);
+         return false;
+       } else if (generationDiff > maxGenerationLag) {
+         log.info(
+             "core:[{}] generation lag is above acceptable threshold:[{}], "
+                 + "generation lag:[{}], leader generation:[{}], follower generation:[{}]",
+             core,
+             maxGenerationLag,
+             generationDiff,
+             leaderGeneration,
+             followerGeneration);
+         laggingCoresInfo.add(
+             String.format(
+                 Locale.ROOT,
+                 "Core %s is lagging by %d generations",
+                 core.getName(),
+                 generationDiff));
+         return false;
+       }
+     }
+   } catch (Exception e) {
+     // A failed comparison is treated as out of sync rather than propagated to the caller.
+     log.error("Failed to check if the follower is in sync with the leader", e);
+     return false;
+   } finally {
+     if (indexFetcher != null) {
+       indexFetcher.destroy();
+     }
+   }
+   return true;
+ }
+
+ /**
+  * Counts the local cores that are considered unhealthy with respect to the cluster state.
+  *
+  * <p>A core is counted when it has not registered or its last published state is DOWN or
+  * RECOVERING, and, in addition, its collection exists in the cluster state and its shard is one
+  * of that collection's active slices.
+  *
+  * @param cores list of core cloud descriptors to iterate
+  * @param clusterState clusterstate from ZK
+  * @return number of unhealthy cores
+  * @throws ArithmeticException if the count does not fit in an {@code int}
+  */
+ public static int findUnhealthyCores(
+     Collection<CloudDescriptor> cores, ClusterState clusterState) {
+   return Math.toIntExact(
+       cores.stream()
+           .filter(
+               c ->
+                   !c.hasRegistered()
+                       || UNHEALTHY_STATES.contains(
+                           c.getLastPublished())) // Find candidates locally
+           .filter(
+               c ->
+                   clusterState.hasCollection(
+                       c.getCollectionName())) // Only care about cores for actual collections
+           .filter(
+               c ->
+                   clusterState
+                       .getCollection(c.getCollectionName())
+                       .getActiveSlicesMap()
+                       .containsKey(c.getShardId()))
+           .count());
+ }
+}
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealthAPI.java b/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealthAPI.java
deleted file mode 100644
index df5f64900f03..000000000000
--- a/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealthAPI.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.solr.handler.admin.api;
-
-import static org.apache.solr.client.solrj.SolrRequest.METHOD.GET;
-import static org.apache.solr.security.PermissionNameProvider.Name.HEALTH_PERM;
-
-import org.apache.solr.api.EndPoint;
-import org.apache.solr.handler.admin.HealthCheckHandler;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.response.SolrQueryResponse;
-
-/**
- * V2 API for checking the health of the receiving node.
- *
- * This API (GET /v2/node/health) is analogous to the v1 /admin/info/health.
- */
-public class NodeHealthAPI {
- private final HealthCheckHandler handler;
-
- public NodeHealthAPI(HealthCheckHandler handler) {
- this.handler = handler;
- }
-
- // TODO Update permission here once SOLR-11623 lands.
- @EndPoint(
- path = {"/node/health"},
- method = GET,
- permission = HEALTH_PERM)
- public void getSystemInformation(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
- handler.handleRequestBody(req, rsp);
- }
-}
diff --git a/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java
index 43838707d057..79036e5c16ed 100644
--- a/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java
@@ -18,11 +18,17 @@
package org.apache.solr.handler.admin;
import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH;
+import static org.hamcrest.Matchers.containsString;
import java.io.IOException;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
import java.util.Arrays;
import java.util.Collection;
import java.util.Properties;
+import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.RemoteSolrException;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
@@ -30,10 +36,8 @@
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.GenericSolrRequest;
import org.apache.solr.client.solrj.request.HealthCheckRequest;
-import org.apache.solr.client.solrj.request.V2Request;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.HealthCheckResponse;
-import org.apache.solr.client.solrj.response.V2Response;
import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.cloud.ClusterStateMockUtil;
import org.apache.solr.cloud.SolrCloudTestCase;
@@ -44,6 +48,7 @@
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.embedded.JettySolrRunner;
+import org.apache.solr.handler.admin.api.NodeHealth;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -104,12 +109,8 @@ public void testHealthCheckHandler() throws Exception {
// negative check of our (new) "broken" node that we deliberately put into an unhealthy state
RemoteSolrException e =
- expectThrows(
- RemoteSolrException.class,
- () -> {
- runHealthcheckWithClient(solrClient);
- });
- assertTrue(e.getMessage(), e.getMessage().contains("Host Unavailable"));
+ expectThrows(RemoteSolrException.class, () -> runHealthcheckWithClient(solrClient));
+ assertThat(e.getMessage(), containsString("Host Unavailable"));
assertEquals(SolrException.ErrorCode.SERVICE_UNAVAILABLE.code, e.code());
} finally {
newJetty.stop();
@@ -135,37 +136,56 @@ public void testHealthCheckHandlerSolrJ() throws IOException, SolrServerExceptio
}
}
+ /**
+ * Verifies that the v1 health-check response body contains {@code "status":"FAILURE"} when the
+ * node is absent from ZooKeeper's live-nodes set.
+ *
+ * <p>This is a regression test for the refactoring that delegated health-check logic to {@link
+ * NodeHealth}: after that change, {@link SolrException} thrown by {@link NodeHealth} would escape
+ * {@link HealthCheckHandler#handleRequestBody} before the {@code status} field was written to the
+ * response, leaving callers without a machine-readable failure indicator in the body.
+ *
+ * <p>The node's ZK session is kept alive so that only the live-nodes check fires, not the "not
+ * connected to ZK" check, isolating the specific code path under test.
+ */
@Test
- public void testHealthCheckV2Api() throws Exception {
- V2Response res = new V2Request.Builder("/node/health").build().process(cluster.getSolrClient());
- assertEquals(0, res.getStatus());
- assertEquals(CommonParams.OK, res.getResponse().get(CommonParams.STATUS));
-
- // add a new node for the purpose of negative testing
+ public void testV1FailureResponseIncludesStatusField() throws Exception {
JettySolrRunner newJetty = cluster.startJettySolrRunner();
try (SolrClient solrClient = getHttpSolrClient(newJetty.getBaseUrl().toString())) {
+ // Sanity check: the new node is initially healthy.
+ assertEquals(CommonParams.OK, runHealthcheckWithClient(solrClient).getNodeStatus());
- // positive check that our (new) "healthy" node works with direct http client
- assertEquals(
- CommonParams.OK,
- new V2Request.Builder("/node/health")
- .build()
- .process(solrClient)
- .getResponse()
- .get(CommonParams.STATUS));
-
- // now "break" our (new) node
- newJetty.getCoreContainer().getZkController().getZkClient().close();
-
- // negative check of our (new) "broken" node that we deliberately put into an unhealthy state
- RemoteSolrException e =
- expectThrows(
- RemoteSolrException.class,
- () -> {
- new V2Request.Builder("/node/health").build().process(solrClient);
- });
- assertTrue(e.getMessage(), e.getMessage().contains("Host Unavailable"));
- assertEquals(SolrException.ErrorCode.SERVICE_UNAVAILABLE.code, e.code());
+ String nodeName = newJetty.getCoreContainer().getZkController().getNodeName();
+
+ // Remove the node from ZooKeeper's live_nodes without closing the ZK session.
+ // This ensures the "ZK not connected" check passes and only the "not in live nodes"
+ // check fires, exercising the specific failure branch we fixed.
+ newJetty.getCoreContainer().getZkController().removeEphemeralLiveNode();
+
+ // Wait for the node's own ZkStateReader to reflect the removal before querying.
+ newJetty
+ .getCoreContainer()
+ .getZkController()
+ .getZkStateReader()
+ .waitForLiveNodes(10, TimeUnit.SECONDS, missingLiveNode(nodeName));
+
+ // Use a raw HTTP request so we can inspect the full response body.
+ // SolrJ's HealthCheckRequest throws RemoteSolrException on non-200 responses and does
+ // not expose the response body, so we go below SolrJ here.
+ try (HttpClient httpClient = HttpClient.newHttpClient()) {
+ HttpResponse<String> response =
+ httpClient.send(
+ HttpRequest.newBuilder()
+ .uri(URI.create(newJetty.getBaseUrl() + HEALTH_CHECK_HANDLER_PATH))
+ .build(),
+ HttpResponse.BodyHandlers.ofString());
+
+ assertEquals("Expected 503 SERVICE_UNAVAILABLE", 503, response.statusCode());
+ assertThat(
+ "v1 error response body must contain status=FAILURE so body-inspecting clients get a clear signal",
+ response.body(),
+ containsString("FAILURE"));
+ }
} finally {
newJetty.stop();
}
@@ -193,7 +213,7 @@ public void testFindUnhealthyCores() {
mockCD("invalid", "invalid", "slice1", false, Replica.State.RECOVERING),
// A core for a slice that is not an active slice will not fail the check
mockCD("collection1", "invalid_replica1", "invalid", true, Replica.State.DOWN));
- long unhealthy1 = HealthCheckHandler.findUnhealthyCores(node1Cores, clusterState);
+ long unhealthy1 = NodeHealth.findUnhealthyCores(node1Cores, clusterState);
assertEquals(2, unhealthy1);
// Node 2
@@ -203,7 +223,7 @@ public void testFindUnhealthyCores() {
mockCD("collection1", "slice1_replica4", "slice1", true, Replica.State.DOWN),
mockCD(
"collection2", "slice1_replica1", "slice1", true, Replica.State.RECOVERY_FAILED));
- long unhealthy2 = HealthCheckHandler.findUnhealthyCores(node2Cores, clusterState);
+ long unhealthy2 = NodeHealth.findUnhealthyCores(node2Cores, clusterState);
assertEquals(1, unhealthy2);
}
}
diff --git a/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthSolrCloudTest.java b/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthSolrCloudTest.java
new file mode 100644
index 000000000000..61ab10b4acd3
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthSolrCloudTest.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.admin.api;
+
+import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.OK;
+import static org.hamcrest.Matchers.containsString;
+
+import java.util.concurrent.TimeUnit;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.NodeApi;
+import org.apache.solr.cloud.SolrCloudTestCase;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.embedded.JettySolrRunner;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests for the node-health API, on SolrCloud clusters
+ *
+ * @see NodeHealthStandaloneTest
+ */
+public class NodeHealthSolrCloudTest extends SolrCloudTestCase {
+
+ @BeforeClass
+ public static void setupCluster() throws Exception {
+ configureCluster(1).addConfig("conf", configset("cloud-minimal")).configure();
+
+ CollectionAdminRequest.createCollection(DEFAULT_TEST_COLLECTION_NAME, "conf", 1, 1)
+ .process(cluster.getSolrClient());
+ }
+
+ @Test
+ public void testHealthyNodeReturnsOkStatus() throws Exception {
+ final var request = new NodeApi.Healthcheck();
+ final var response = request.process(cluster.getSolrClient());
+
+ assertNotNull(response);
+ assertEquals(OK, response.status);
+ assertNull("Expected no error on a healthy node", response.error);
+ }
+
+ @Test
+ public void testRequireHealthyCoresReturnOkWhenAllCoresHealthy() throws Exception {
+ final var request = new NodeApi.Healthcheck();
+ request.setRequireHealthyCores(true);
+ final var response = request.process(cluster.getSolrClient());
+
+ assertNotNull(response);
+ assertEquals(OK, response.status);
+ assertEquals("All cores are healthy", response.message);
+ }
+
+ @Test
+ public void testCloudMode_UnhealthyWhenZkClientClosed() throws Exception {
+ // Use a fresh node so closing its ZK client does not break the primary cluster node
+ JettySolrRunner newJetty = cluster.startJettySolrRunner();
+ cluster.waitForNode(newJetty, 30);
+ try (SolrClient nodeClient = newJetty.newClient()) {
+ // Sanity check: the new node should start out healthy
+ assertEquals(OK, new NodeApi.Healthcheck().process(nodeClient).status);
+
+ // Break the ZK connection to put the node into an unhealthy state
+ newJetty.getCoreContainer().getZkController().getZkClient().close();
+
+ SolrException e =
+ assertThrows(SolrException.class, () -> new NodeApi.Healthcheck().process(nodeClient));
+ assertEquals(ErrorCode.SERVICE_UNAVAILABLE.code, e.code());
+ assertThat(e.getMessage(), containsString(("Host Unavailable")));
+ } finally {
+ newJetty.stop();
+ }
+ }
+
+ /**
+ * Verifies that when the node's name is absent from ZooKeeper's live-nodes set (while the ZK
+ * session itself is still connected), the v2 health-check API throws a {@code
+ * SERVICE_UNAVAILABLE} exception with a message identifying the live-nodes check as the cause.
+ *
+ * This specifically exercises the code path at NodeHealth#getClusterState() that checks {@code
+ * clusterState.getLiveNodes().contains(nodeName)}.
+ */
+ @Test
+ public void testNotInLiveNodes_ThrowsServiceUnavailable() throws Exception {
+ JettySolrRunner newJetty = cluster.startJettySolrRunner();
+ cluster.waitForNode(newJetty, 30);
+ try (SolrClient nodeClient = newJetty.newClient()) {
+ // Sanity check: the new node should start out healthy
+ assertEquals(OK, new NodeApi.Healthcheck().process(nodeClient).status);
+
+ String nodeName = newJetty.getCoreContainer().getZkController().getNodeName();
+
+ // Remove the node from ZooKeeper's live_nodes without closing the ZK session.
+ // This ensures the "ZK not connected" check passes and only the "not in live nodes"
+ // check fires, isolating the code path under test.
+ newJetty.getCoreContainer().getZkController().removeEphemeralLiveNode();
+
+ // Wait for the node's own ZkStateReader to reflect the removal before querying it.
+ newJetty
+ .getCoreContainer()
+ .getZkController()
+ .getZkStateReader()
+ .waitForLiveNodes(10, TimeUnit.SECONDS, missingLiveNode(nodeName));
+
+ SolrException e =
+ assertThrows(SolrException.class, () -> new NodeApi.Healthcheck().process(nodeClient));
+ assertEquals(ErrorCode.SERVICE_UNAVAILABLE.code, e.code());
+ assertThat(e.getMessage(), containsString("Not in live nodes"));
+ } finally {
+ newJetty.stop();
+ }
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthStandaloneTest.java b/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthStandaloneTest.java
new file mode 100644
index 000000000000..0e3c2765038d
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthStandaloneTest.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.admin.api;
+
+import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.FAILURE;
+import static org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.OK;
+import static org.hamcrest.Matchers.containsString;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.client.solrj.request.NodeApi;
+import org.apache.solr.util.SolrJettyTestRule;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+
+/**
+ * Tests for the node-health API, on Standalone Solr
+ *
+ * @see NodeHealthSolrCloudTest
+ */
+public class NodeHealthStandaloneTest extends SolrTestCaseJ4 {
+
+ @ClassRule public static SolrJettyTestRule solrTestRule = new SolrJettyTestRule();
+
+ @BeforeClass
+ public static void setupCluster() throws Exception {
+ solrTestRule.startSolr(createTempDir());
+ }
+
+ @Test
+ public void testWithoutMaxGenerationLagReturnsOk() throws Exception {
+
+ final var request = new NodeApi.Healthcheck();
+ final var response = request.process(solrTestRule.getAdminClient());
+
+ assertNotNull(response);
+ assertEquals(OK, response.status);
+ assertThat(response.message, containsString("maxGenerationLag isn't specified"));
+ }
+
+ @Test
+ public void testWithNegativeMaxGenerationLagReturnsFailure() throws Exception {
+ final var request = new NodeApi.Healthcheck();
+ request.setMaxGenerationLag(-1);
+ final var response = request.process(solrTestRule.getAdminClient());
+
+ assertNotNull(response);
+ assertEquals(FAILURE, response.status);
+ assertThat(response.message, containsString("Invalid value of maxGenerationLag"));
+ }
+}
diff --git a/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java b/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java
index 18a09fc75686..6b3c63de45b4 100644
--- a/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java
@@ -34,7 +34,6 @@
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.handler.admin.CoreAdminHandler;
-import org.apache.solr.handler.admin.HealthCheckHandler;
import org.apache.solr.handler.admin.InfoHandler;
import org.apache.solr.handler.admin.LoggingHandler;
import org.apache.solr.handler.admin.PropertiesRequestHandler;
@@ -55,7 +54,6 @@ public class V2NodeAPIMappingTest extends SolrTestCaseJ4 {
private InfoHandler infoHandler;
private LoggingHandler mockLoggingHandler;
private PropertiesRequestHandler mockPropertiesHandler;
- private HealthCheckHandler mockHealthCheckHandler;
private ThreadDumpHandler mockThreadDumpHandler;
@BeforeClass
@@ -69,13 +67,11 @@ public void setupApiBag() {
infoHandler = mock(InfoHandler.class);
mockLoggingHandler = mock(LoggingHandler.class);
mockPropertiesHandler = mock(PropertiesRequestHandler.class);
- mockHealthCheckHandler = mock(HealthCheckHandler.class);
mockThreadDumpHandler = mock(ThreadDumpHandler.class);
queryRequestCaptor = ArgumentCaptor.forClass(SolrQueryRequest.class);
when(infoHandler.getLoggingHandler()).thenReturn(mockLoggingHandler);
when(infoHandler.getPropertiesHandler()).thenReturn(mockPropertiesHandler);
- when(infoHandler.getHealthCheckHandler()).thenReturn(mockHealthCheckHandler);
when(infoHandler.getThreadDumpHandler()).thenReturn(mockThreadDumpHandler);
apiBag = new ApiBag(false);
@@ -141,19 +137,6 @@ public void testThreadDumpApiAllProperties() throws Exception {
assertEquals("anyParamValue", v1Params.get("anyParamName"));
}
- @Test
- public void testHealthCheckApiAllProperties() throws Exception {
- final ModifiableSolrParams solrParams = new ModifiableSolrParams();
- solrParams.add("requireHealthyCores", "true");
- solrParams.add("maxGenerationLag", "123");
- final SolrParams v1Params =
- captureConvertedHealthCheckV1Params("/node/health", "GET", solrParams);
-
- // All parameters are passed through to v1 API as-is.
- assertEquals(true, v1Params.getBool("requireHealthyCores"));
- assertEquals(123, v1Params.getPrimitiveInt("maxGenerationLag"));
- }
-
private SolrParams captureConvertedCoreV1Params(String path, String method, String v2RequestBody)
throws Exception {
return doCaptureParams(
@@ -165,11 +148,6 @@ private SolrParams captureConvertedPropertiesV1Params(
return doCaptureParams(path, method, inputParams, null, mockPropertiesHandler);
}
- private SolrParams captureConvertedHealthCheckV1Params(
- String path, String method, SolrParams inputParams) throws Exception {
- return doCaptureParams(path, method, inputParams, null, mockHealthCheckHandler);
- }
-
private SolrParams captureConvertedThreadDumpV1Params(
String path, String method, SolrParams inputParams) throws Exception {
return doCaptureParams(path, method, inputParams, null, mockThreadDumpHandler);
@@ -212,6 +190,5 @@ private static void registerAllNodeApis(
apiBag.registerObject(new RejoinLeaderElectionAPI(coreHandler));
apiBag.registerObject(new NodePropertiesAPI(infoHandler.getPropertiesHandler()));
apiBag.registerObject(new NodeThreadsAPI(infoHandler.getThreadDumpHandler()));
- apiBag.registerObject(new NodeHealthAPI(infoHandler.getHealthCheckHandler()));
}
}
diff --git a/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc b/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc
index 16b2f691281e..4380337752c9 100644
--- a/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc
+++ b/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc
@@ -40,18 +40,24 @@ This handler must have a collection name in the path to the endpoint.
|`solr/<collection>/admin/file` |{solr-javadocs}/core/org/apache/solr/handler/admin/ShowFileRequestHandler.html[ShowFileRequestHandler] |`_ADMIN_FILE`
|===
-Health:: Report the health of the node (_available only in SolrCloud mode_)
+Health:: Report the health of the node.
+
[cols="3*.",frame=none,grid=cols,options="header"]
|===
|API Endpoints |Class & Javadocs |Paramset
|v1: `solr/admin/info/health`
-v2: `api/node/health` |{solr-javadocs}/core/org/apache/solr/handler/admin/HealthCheckHandler.html[HealthCheckHandler] |
+v2: `api/node/health` |v1: {solr-javadocs}/core/org/apache/solr/handler/admin/HealthCheckHandler.html[HealthCheckHandler]
+
+v2: {solr-javadocs}/core/org/apache/solr/handler/admin/api/NodeHealth.html[NodeHealth] |
|===
+
-This endpoint also accepts additional request parameters.
-Please see {solr-javadocs}/core/org/apache/solr/handler/admin/HealthCheckHandler.html[Javadocs] for details.
+In SolrCloud mode the handler checks that the node is connected to ZooKeeper and is listed in live nodes.
+The optional `requireHealthyCores=true` parameter additionally requires that all local replicas be in an active state, which is useful for rolling-restart probes.
++
+In user-managed (leader-follower) mode the handler checks replication lag.
+The optional `maxGenerationLag=<max_lag>` parameter specifies the maximum number of Lucene commit generations by which a follower is allowed to trail its leader; the endpoint returns HTTP 503 if any core exceeds this threshold.
+See xref:deployment-guide:user-managed-index-replication.adoc#monitoring-follower-replication-lag[Monitoring Follower Replication Lag] for details and examples.
Logging:: Retrieve and modify registered loggers.
+
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/user-managed-index-replication.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/user-managed-index-replication.adoc
index ea3f0f376747..ff3e4421fbb5 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/user-managed-index-replication.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/user-managed-index-replication.adoc
@@ -575,6 +575,63 @@ A snapshot with the name `snapshot._name_` must exist or an error will be return
`location`::: The location where the snapshot is created.
+[[monitoring-follower-replication-lag]]
+== Monitoring Follower Replication Lag
+
+In a leader-follower deployment it is important to know whether followers are keeping pace with the leader.
+Solr's health-check endpoint supports a `maxGenerationLag` request parameter that lets you assert that each follower core is within a specified number of Lucene commit generations of its leader.
+When the follower is lagging more than the allowed number of generations the endpoint returns HTTP 503 (Service Unavailable), making it straightforward to integrate into load-balancer health probes or monitoring systems.
+
+The `maxGenerationLag` parameter is an integer representing the maximum acceptable number of commit generations by which a follower is allowed to trail its leader.
+A value of `0` requires the follower to be fully up to date.
+If the parameter is omitted, the health check returns `OK` regardless of replication lag.
+
+[WARNING]
+====
+Because a follower's generation can only increase when a replication from the leader actually completes, `maxGenerationLag=0` may return `FAILURE` immediately after a follower starts or after a period of network instability even though the follower will catch up on the next poll cycle.
+Use a small positive value (for example `2`) for production monitoring unless you require strict freshness guarantees.
+====
+
+Use the health endpoint as follows:
+
+====
+[.tab-label]*V1 API*
+
+[source,bash]
+----
+http://_follower_host:port_/solr/admin/info/health?maxGenerationLag=<_max_lag_>
+----
+====
+
+====
+[.tab-label]*V2 API*
+
+[source,bash]
+----
+http://_follower_host:port_/api/node/health?maxGenerationLag=<_max_lag_>
+----
+====
+
+A healthy response looks like:
+
+[source,json]
+----
+{
+ "status": "OK",
+ "message": "All the followers are in sync with leader (within maxGenerationLag: 2) or the cores are acting as leader"
+}
+----
+
+When a follower is lagging too far behind, the response is HTTP 503 and the body identifies the lagging cores:
+
+[source,json]
+----
+{
+ "status": "FAILURE",
+ "message": "Cores violating maxGenerationLag:2.\nCore collection1 is lagging by 5 generations"
+}
+----
+
== Optimizing Distributed Indexes
Optimizing an index is not something most users should generally worry about - but in particular users should be aware of the impacts of optimizing an index when using the `ReplicationHandler`.
diff --git a/solr/solrj/src/java/org/apache/solr/common/util/Utils.java b/solr/solrj/src/java/org/apache/solr/common/util/Utils.java
index 164ae8ae7b03..86c96944ace5 100644
--- a/solr/solrj/src/java/org/apache/solr/common/util/Utils.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/Utils.java
@@ -845,6 +845,12 @@ public static void reflectWrite(MapWriter.EntryWriter ew, Object o) {
* @return a serializable version of the object
*/
public static Object getReflectWriter(Object o) {
+ // Enums serialized as their declared name so that javabin/NamedList consumers
+ // (e.g. HealthCheckHandlerTest comparing against CommonParams.OK == "OK") see
+ // a plain string rather than "pkg.EnumClass:NAME".
+ if (o instanceof Enum<?> e) {
+ return e.name();
+ }
List fieldWriters = null;
try {
fieldWriters =