Skip to content
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
7b21fe9
[historyserver] implement grafana health for live session
fscnick Jan 21, 2026
ab79f49
[historyserver] move getGrafanaHealth to router.go
fscnick Jan 23, 2026
31c56c4
Merge remote-tracking branch 'upstream/master' into feat/history-serv…
fscnick Jan 23, 2026
5ab8315
Merge remote-tracking branch 'upstream/master' into feat/history-serv…
fscnick Jan 24, 2026
8f4ea10
Merge remote-tracking branch 'upstream/master' into feat/history-serv…
fscnick Jan 24, 2026
a7a2aba
[Doc][historyserver] add setup for grafana and RayCluster with histor…
fscnick Jan 24, 2026
210a9a5
[Test][historyserver] add e2e test for grafana health
fscnick Jan 24, 2026
123d049
[Test][historyserver] use CombinedOutput to ensure cleanup exits afte…
fscnick Jan 25, 2026
8b44af6
[Test][historyserver] test grafana health response body
fscnick Jan 25, 2026
476f96d
Merge remote-tracking branch 'upstream/master' into feat/history-serv…
fscnick Jan 27, 2026
4231f27
[Test][historyserver] add missing injectCollectorRayClusterID
fscnick Jan 27, 2026
320055e
[historyserver] add prometheus health e2e test for live session
my-vegetable-has-exploded Jan 28, 2026
8f7bbe5
clean code.
my-vegetable-has-exploded Jan 29, 2026
4fc990a
Merge remote-tracking branch 'origin/master' into prometheus-health-e2e
my-vegetable-has-exploded Feb 3, 2026
2664a5e
unify PrepareTestEnv.
my-vegetable-has-exploded Feb 3, 2026
6630e67
Merge remote-tracking branch 'upstream/master' into prometheus-health…
Future-Outlier Feb 5, 2026
b0d3522
nits
Future-Outlier Feb 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion historyserver/docs/set_up_historyserver.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ curl -b ~/cookies.txt "http://localhost:8080/api/jobs/"
curl -b ~/cookies.txt "http://localhost:8080/api/cluster_status"
```

### Live Cluster with grafana
### Live Cluster with prometheus and grafana

```bash
# Install grafana. ref: https://docs.ray.io/en/latest/cluster/kubernetes/k8s-ecosystem/prometheus-grafana.html#step-2-install-kubernetes-prometheus-stack-via-helm-chart
Expand All @@ -183,6 +183,9 @@ kubectl apply -f ray-operator/config/samples/ray-cluster.embed-grafana.yaml
# Get live session cookie. (Port-forward is required)
curl -c ~/cookies.txt "http://localhost:8080/enter_cluster/default/raycluster-embed-grafana/live"

# Request to prometheus health endpoint
curl -b ~/cookies.txt http://localhost:8080/api/prometheus_health

# Request to grafana health endpoint
curl -b ~/cookies.txt http://localhost:8080/api/grafana_health
```
46 changes: 45 additions & 1 deletion historyserver/test/e2e/historyserver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ func TestHistoryServer(t *testing.T) {
name: "Live cluster: grafana health only",
testFunc: testLiveGrafanaHealth,
},
{
name: "Live cluster: prometheus health only",
testFunc: testLivePrometheusHealth,
},
{
name: "/v0/logs/file endpoint (live cluster)",
testFunc: testLogFileEndpointLiveCluster,
Expand Down Expand Up @@ -78,7 +82,7 @@ func testLiveClusters(test Test, g *WithT, namespace *corev1.Namespace, s3Client
}

func testLiveGrafanaHealth(test Test, g *WithT, namespace *corev1.Namespace, s3Client *s3.S3) {
rayCluster := PrepareTestEnvWithGrafana(test, g, namespace, s3Client)
rayCluster := PrepareTestEnvWithPrometheusAndGrafana(test, g, namespace, s3Client)
ApplyRayJobAndWaitForCompletion(test, g, namespace, rayCluster)
ApplyHistoryServer(test, g, namespace)
historyServerURL := GetHistoryServerURL(test, g, namespace)
Expand All @@ -95,6 +99,22 @@ func testLiveGrafanaHealth(test Test, g *WithT, namespace *corev1.Namespace, s3C
LogWithTimestamp(test.T(), "Live clusters grafana health E2E test completed successfully")
}

// testLivePrometheusHealth exercises the prometheus health endpoint of the history server
// for a cluster that is listed with the live session name.
func testLivePrometheusHealth(test Test, g *WithT, namespace *corev1.Namespace, s3Client *s3.S3) {
	// Set up the environment, apply the Ray job, then deploy the history server.
	cluster := PrepareTestEnvWithPrometheusAndGrafana(test, g, namespace, s3Client)
	ApplyRayJobAndWaitForCompletion(test, g, namespace, cluster)
	ApplyHistoryServer(test, g, namespace)
	baseURL := GetHistoryServerURL(test, g, namespace)

	// The cluster must be reported under the live session before it is entered.
	info := getClusterFromList(test, g, baseURL, cluster.Name, namespace.Name)
	session := info.SessionName
	g.Expect(session).To(Equal(LiveSessionName), "Live cluster should have sessionName='live'")

	// The same client is used to set the cluster context and to query the health endpoint.
	httpClient := CreateHTTPClientWithCookieJar(g)
	setClusterContext(test, g, httpClient, baseURL, namespace.Name, cluster.Name, session)
	verifyHistoryServerPrometheusHealthEndpoint(test, g, httpClient, baseURL)

	DeleteS3Bucket(test, g, s3Client)
	LogWithTimestamp(test.T(), "Live clusters prometheus health E2E test completed successfully")
}

// setClusterContext sets the cluster context via /enter_cluster/ endpoint and verifies the response.
func setClusterContext(test Test, g *WithT, client *http.Client, historyServerURL, namespace, clusterName, session string) {
enterURL := fmt.Sprintf("%s/enter_cluster/%s/%s/%s", historyServerURL, namespace, clusterName, session)
Expand Down Expand Up @@ -159,6 +179,30 @@ func verifyHistoryServerGrafanaHealthEndpoint(test Test, g *WithT, client *http.

}

// verifyHistoryServerPrometheusHealthEndpoint retries a GET against the prometheus health
// endpoint (HistoryServerEndpointPrometheusHealth) until one attempt returns status 200 with a
// JSON body whose "result" is true and whose "msg" contains "prometheus running".
func verifyHistoryServerPrometheusHealthEndpoint(test Test, g *WithT, client *http.Client, historyServerURL string) {
	endpoint := HistoryServerEndpointPrometheusHealth
	LogWithTimestamp(test.T(), "Testing history server endpoint: %s", endpoint)

	// probe is a single attempt; Eventually re-runs it until every expectation holds.
	probe := func(gg Gomega) {
		resp, getErr := client.Get(historyServerURL + endpoint)
		gg.Expect(getErr).NotTo(HaveOccurred())
		// Deferred inside the closure so the body is closed at the end of each attempt.
		defer resp.Body.Close()

		raw, readErr := io.ReadAll(resp.Body)
		gg.Expect(readErr).NotTo(HaveOccurred())
		gg.Expect(resp.StatusCode).To(Equal(http.StatusOK),
			"Endpoint %s should return 200, got %d: %s", endpoint, resp.StatusCode, string(raw))

		var payload map[string]any
		gg.Expect(json.Unmarshal(raw, &payload)).NotTo(HaveOccurred())
		gg.Expect(payload["result"]).To(Equal(true), "Response should have result=true")
		gg.Expect(payload["msg"]).To(ContainSubstring("prometheus running"), "Response message should contain 'prometheus running'")
		LogWithTimestamp(test.T(), "Endpoint %s returned status %d with valid response", endpoint, resp.StatusCode)
	}

	g.Eventually(probe, TestTimeoutShort).Should(Succeed())
}

// getClusterFromList retrieves a cluster from the /clusters/ endpoint by name and namespace.
func getClusterFromList(test Test, g *WithT, historyServerURL, clusterName, namespace string) *utils.ClusterInfo {
LogWithTimestamp(test.T(), "Getting cluster %s/%s from /clusters/ endpoint", namespace, clusterName)
Expand Down
9 changes: 5 additions & 4 deletions historyserver/test/support/historyserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ const (
// Excluded endpoints that are not yet implemented:
// - /events
// - /api/cluster_status
// - /api/prometheus_health
// - /api/data/datasets/{job_id}
// - /api/jobs
// - /api/serve/applications
Expand All @@ -68,8 +67,9 @@ var HistoryServerEndpoints = []string{
"/logical/actors",
}

// HistoryServerEndpointGrafanaHealth is a standalone constant
// HistoryServerEndpointPrometheusHealth and HistoryServerEndpointGrafanaHealth are standalone constants
// because they require some additional dependencies.
const HistoryServerEndpointPrometheusHealth = "/api/prometheus_health"
const HistoryServerEndpointGrafanaHealth = "/api/grafana_health"

// ApplyHistoryServer deploys the HistoryServer and RBAC resources.
Expand Down Expand Up @@ -161,15 +161,16 @@ func PrepareTestEnv(test Test, g *WithT, namespace *corev1.Namespace, s3Client *
return rayCluster
}

// PrepareTestEnvWithGrafana prepares test environment with Grafana for each test case, including applying a Ray cluster,
// PrepareTestEnvWithPrometheusAndGrafana prepares test environment with Prometheus and Grafana for each test case, including applying a Ray cluster,
// checking the collector sidecar container exists in the head pod and an empty S3 bucket exists.
func PrepareTestEnvWithGrafana(test Test, g *WithT, namespace *corev1.Namespace, s3Client *s3.S3) *rayv1.RayCluster {
func PrepareTestEnvWithPrometheusAndGrafana(test Test, g *WithT, namespace *corev1.Namespace, s3Client *s3.S3) *rayv1.RayCluster {

InstallGrafanaAndPrometheus(test, g)

additionalEnvs := map[string]string{
"RAY_GRAFANA_IFRAME_HOST": RayGrafanaIframeHost,
"RAY_GRAFANA_HOST": "http://prometheus-grafana.prometheus-system.svc:80",
"RAY_PROMETHEUS_HOST": "http://prometheus-kube-prometheus-prometheus.prometheus-system.svc:9090",
}

// Deploy a Ray cluster with the collector.
Expand Down
Loading