[history server][e2e] Add dead cluster tests for Azure Blob Storage

ikchifo · ikchifo · commit d336ac1c2d5e · 2026-02-05T12:31:14.000-05:00
Add e2e tests for Azure Blob Storage that verify history server
functionality for both live clusters and clusters that have been deleted:
- testAzureDeadClusters: verifies all endpoints work for dead clusters
- testAzureLogFileEndpointLiveCluster: verifies log file access for live clusters
- testAzureLogFileEndpointDeadCluster: verifies log file access for dead clusters

Extract shared test helpers to avoid duplication between S3 and Azure:
- VerifyLogFileEndpointReturnsContent
- VerifyLogFileEndpointRejectsPathTraversal
- DeleteRayClusterAndWait
diff --git a/historyserver/test/e2e/historyserver_azureblob_test.go b/historyserver/test/e2e/historyserver_azureblob_test.go
@@ -22,6 +22,18 @@ func TestAzureHistoryServer(t *testing.T) {
 			name:     "Live cluster: historyserver endpoints should be accessible",
 			testFunc: testAzureLiveClusters,
 		},
+		{
+			name:     "Dead cluster: historyserver endpoints should be accessible",
+			testFunc: testAzureDeadClusters,
+		},
+		{
+			name:     "/v0/logs/file endpoint (live cluster)",
+			testFunc: testAzureLogFileEndpointLiveCluster,
+		},
+		{
+			name:     "/v0/logs/file endpoint (dead cluster)",
+			testFunc: testAzureLogFileEndpointDeadCluster,
+		},
 	}
 
 	for _, tt := range tests {
@@ -51,3 +63,76 @@ func testAzureLiveClusters(test Test, g *WithT, namespace *corev1.Namespace, azu
 	DeleteAzureBlobContainer(test, g, azureClient)
 	LogWithTimestamp(test.T(), "Azure live clusters E2E test completed successfully")
 }
+
+// testAzureDeadClusters checks the history server endpoints for a RayCluster that
+// has already been deleted, using the history server deployed from the Azure manifest.
+//
+// Steps:
+//  1. Prepare the Azure Blob test environment and run a Ray job on the cluster.
+//  2. Delete the RayCluster and wait for the deletion to finish.
+//  3. Apply the history server and fetch its URL.
+//  4. Find the cluster in the history server's list and assert that its session
+//     name is not LiveSessionName.
+//  5. Call setClusterContext on a cookie-jar HTTP client and verify the endpoints.
+//  6. Delete the Azure Blob container.
+func testAzureDeadClusters(test Test, g *WithT, namespace *corev1.Namespace, azureClient *azblob.Client) {
+	cluster := PrepareAzureBlobTestEnv(test, g, namespace, azureClient)
+	ApplyRayJobAndWaitForCompletion(test, g, namespace, cluster)
+
+	// The cluster must be gone before the history server is queried.
+	DeleteRayClusterAndWait(test, g, namespace.Name, cluster.Name)
+
+	ApplyHistoryServer(test, g, namespace, AzureHistoryServerManifestPath)
+	serverURL := GetHistoryServerURL(test, g, namespace)
+
+	info := getClusterFromList(test, g, serverURL, cluster.Name, namespace.Name)
+	sessionName := info.SessionName
+	g.Expect(sessionName).NotTo(Equal(LiveSessionName), "Dead cluster should not have sessionName='live'")
+
+	httpClient := CreateHTTPClientWithCookieJar(g)
+	setClusterContext(test, g, httpClient, serverURL, namespace.Name, cluster.Name, sessionName)
+	verifyHistoryServerEndpoints(test, g, httpClient, serverURL)
+
+	DeleteAzureBlobContainer(test, g, azureClient)
+	LogWithTimestamp(test.T(), "Azure dead clusters E2E test completed successfully")
+}
+
+// testAzureLogFileEndpointLiveCluster runs the shared log file endpoint checks against a
+// RayCluster that this test does not delete, using the history server deployed from the
+// Azure manifest.
+//
+// Flow: prepare the Azure Blob environment, run a Ray job, apply the history server,
+// look the cluster up with getClusterFromList, call setClusterContext on a cookie-jar
+// HTTP client, pick a node ID, run the two log file subtests, then delete the container.
+func testAzureLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Namespace, azureClient *azblob.Client) {
+	cluster := PrepareAzureBlobTestEnv(test, g, namespace, azureClient)
+	ApplyRayJobAndWaitForCompletion(test, g, namespace, cluster)
+	ApplyHistoryServer(test, g, namespace, AzureHistoryServerManifestPath)
+	serverURL := GetHistoryServerURL(test, g, namespace)
+
+	info := getClusterFromList(test, g, serverURL, cluster.Name, namespace.Name)
+	httpClient := CreateHTTPClientWithCookieJar(g)
+	setClusterContext(test, g, httpClient, serverURL, namespace.Name, cluster.Name, info.SessionName)
+
+	// Both subtests query the same node.
+	node := GetOneOfNodeID(g, httpClient, serverURL)
+
+	// Each subtest gets its own Gomega instance bound to the subtest's *testing.T.
+	test.T().Run("should return log content", func(sub *testing.T) {
+		subG := NewWithT(sub)
+		VerifyLogFileEndpointReturnsContent(test, subG, httpClient, serverURL, node)
+	})
+
+	test.T().Run("should reject path traversal", func(sub *testing.T) {
+		subG := NewWithT(sub)
+		VerifyLogFileEndpointRejectsPathTraversal(test, subG, httpClient, serverURL, node)
+	})
+
+	DeleteAzureBlobContainer(test, g, azureClient)
+	LogWithTimestamp(test.T(), "Azure log file endpoint tests completed")
+}
+
+// testAzureLogFileEndpointDeadCluster runs the shared log file endpoint checks against a
+// RayCluster that has been deleted, using the history server deployed from the Azure manifest.
+//
+// Steps:
+//  1. Prepare the Azure Blob test environment and run a Ray job on the cluster.
+//  2. Delete the RayCluster and wait for the deletion to finish.
+//  3. Apply the history server and fetch its URL.
+//  4. Find the cluster in the history server's list and assert that its session
+//     name is not LiveSessionName.
+//  5. Call setClusterContext on a cookie-jar HTTP client and pick a node ID.
+//  6. Run the "returns content" and "rejects path traversal" subtests.
+//  7. Delete the Azure Blob container.
+func testAzureLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Namespace, azureClient *azblob.Client) {
+	rayCluster := PrepareAzureBlobTestEnv(test, g, namespace, azureClient)
+	ApplyRayJobAndWaitForCompletion(test, g, namespace, rayCluster)
+
+	// The cluster must be gone before the history server is queried.
+	DeleteRayClusterAndWait(test, g, namespace.Name, rayCluster.Name)
+
+	ApplyHistoryServer(test, g, namespace, AzureHistoryServerManifestPath)
+	historyServerURL := GetHistoryServerURL(test, g, namespace)
+
+	clusterInfo := getClusterFromList(test, g, historyServerURL, rayCluster.Name, namespace.Name)
+	// Same description as the equivalent assertion in testAzureDeadClusters, so a
+	// failure here explains itself instead of printing only the mismatched values.
+	g.Expect(clusterInfo.SessionName).NotTo(Equal(LiveSessionName), "Dead cluster should not have sessionName='live'")
+
+	client := CreateHTTPClientWithCookieJar(g)
+	setClusterContext(test, g, client, historyServerURL, namespace.Name, rayCluster.Name, clusterInfo.SessionName)
+
+	// Both subtests query the same node.
+	nodeID := GetOneOfNodeID(g, client, historyServerURL)
+
+	// Each subtest gets its own Gomega instance bound to the subtest's *testing.T.
+	test.T().Run("should return log content from Azure Blob", func(t *testing.T) {
+		VerifyLogFileEndpointReturnsContent(test, NewWithT(t), client, historyServerURL, nodeID)
+	})
+
+	test.T().Run("should reject path traversal from Azure Blob", func(t *testing.T) {
+		VerifyLogFileEndpointRejectsPathTraversal(test, NewWithT(t), client, historyServerURL, nodeID)
+	})
+
+	DeleteAzureBlobContainer(test, g, azureClient)
+	LogWithTimestamp(test.T(), "Azure dead cluster log file endpoint tests completed")
+}
diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go
@@ -10,19 +10,14 @@ import (
 	"github.com/aws/aws-sdk-go/service/s3"
 	. "github.com/onsi/gomega"
 	corev1 "k8s.io/api/core/v1"
-	k8serrors "k8s.io/apimachinery/pkg/api/errors"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
 	. "github.com/ray-project/kuberay/ray-operator/test/support"
 
 	"github.com/ray-project/kuberay/historyserver/pkg/utils"
 	. "github.com/ray-project/kuberay/historyserver/test/support"
 )
 
-const (
-	LiveSessionName = "live"
-	EndpointLogFile = "/api/v0/logs/file"
-)
+const LiveSessionName = "live"
 
 func TestHistoryServer(t *testing.T) {
 	// Share a single S3 client among subtests.
@@ -192,15 +187,6 @@ func getClusterFromList(test Test, g *WithT, historyServerURL, clusterName, name
 }
 
 // testLogFileEndpointLiveCluster verifies that the history server can fetch log files from a live cluster.
-//
-// The test case follows these steps:
-// 1. Prepare test environment by applying a Ray cluster
-// 2. Submit a Ray job to the existing cluster
-// 3. Apply History Server and get its URL
-// 4. Get the cluster info from the list
-// 5. Verify that the history server can fetch log content (raylet.out)
-// 6. Verify that the history server rejects path traversal attempts
-// 7. Delete S3 bucket to ensure test isolation
 func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Namespace, s3Client *s3.S3) {
 	rayCluster := PrepareTestEnv(test, g, namespace, s3Client)
 	ApplyRayJobAndWaitForCompletion(test, g, namespace, rayCluster)
@@ -212,70 +198,25 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names
 	setClusterContext(test, g, client, historyServerURL, namespace.Name, rayCluster.Name, clusterInfo.SessionName)
 
 	nodeID := GetOneOfNodeID(g, client, historyServerURL)
-	// Hardcode "raylet.out" for deterministic testing.
-	filename := "raylet.out"
 
 	test.T().Run("should return log content", func(t *testing.T) {
-		g := NewWithT(t)
-		g.Eventually(func(gg Gomega) {
-			logFileURL := fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", historyServerURL, EndpointLogFile, nodeID, filename)
-			resp, err := client.Get(logFileURL)
-			gg.Expect(err).NotTo(HaveOccurred())
-			defer resp.Body.Close()
-			gg.Expect(resp.StatusCode).To(Equal(http.StatusOK))
-
-			body, err := io.ReadAll(resp.Body)
-			gg.Expect(err).NotTo(HaveOccurred())
-			gg.Expect(len(body)).To(BeNumerically(">", 0))
-		}, TestTimeoutShort).Should(Succeed())
+		VerifyLogFileEndpointReturnsContent(test, NewWithT(t), client, historyServerURL, nodeID)
 	})
 
 	test.T().Run("should reject path traversal", func(t *testing.T) {
-		g := NewWithT(t)
-		maliciousPaths := []string{"../etc/passwd", "..", "/etc/passwd", "../../secret"}
-
-		for _, malicious := range maliciousPaths {
-			g.Eventually(func(gg Gomega) {
-				url := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, EndpointLogFile, nodeID, malicious)
-				resp, err := client.Get(url)
-				gg.Expect(err).NotTo(HaveOccurred())
-				defer func() {
-					io.Copy(io.Discard, resp.Body)
-					resp.Body.Close()
-				}()
-				gg.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest))
-			}, TestTimeoutShort).Should(Succeed())
-		}
+		VerifyLogFileEndpointRejectsPathTraversal(test, NewWithT(t), client, historyServerURL, nodeID)
 	})
 
 	DeleteS3Bucket(test, g, s3Client)
 	LogWithTimestamp(test.T(), "Log file endpoint tests completed")
 }
 
 // testLogFileEndpointDeadCluster verifies that the history server can fetch log files from S3 after a cluster is deleted.
-//
-// The test case follows these steps:
-// 1. Prepare test environment by applying a Ray cluster
-// 2. Submit a Ray job to the existing cluster
-// 3. Delete RayCluster to trigger log upload to S3
-// 4. Apply History Server and get its URL
-// 5. Verify that the history server can fetch log content from S3 (raylet.out)
-// 6. Verify that the history server rejects path traversal attempts from S3
-// 7. Delete S3 bucket to ensure test isolation
 func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Namespace, s3Client *s3.S3) {
 	rayCluster := PrepareTestEnv(test, g, namespace, s3Client)
 	ApplyRayJobAndWaitForCompletion(test, g, namespace, rayCluster)
 
-	// Delete RayCluster to trigger log upload
-	err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{})
-	g.Expect(err).NotTo(HaveOccurred())
-	LogWithTimestamp(test.T(), "Deleted RayCluster %s/%s", namespace.Name, rayCluster.Name)
-
-	// Wait for cluster to be fully deleted (ensures logs are uploaded to S3)
-	g.Eventually(func() error {
-		_, err := GetRayCluster(test, namespace.Name, rayCluster.Name)
-		return err
-	}, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue()))
+	DeleteRayClusterAndWait(test, g, namespace.Name, rayCluster.Name)
 
 	ApplyHistoryServer(test, g, namespace, "")
 	historyServerURL := GetHistoryServerURL(test, g, namespace)
@@ -287,40 +228,13 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names
 	setClusterContext(test, g, client, historyServerURL, namespace.Name, rayCluster.Name, clusterInfo.SessionName)
 
 	nodeID := GetOneOfNodeID(g, client, historyServerURL)
-	// Hardcode "raylet.out" for deterministic testing.
-	filename := "raylet.out"
 
 	test.T().Run("should return log content from S3", func(t *testing.T) {
-		g := NewWithT(t)
-		g.Eventually(func(gg Gomega) {
-			logFileURL := fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", historyServerURL, EndpointLogFile, nodeID, filename)
-			resp, err := client.Get(logFileURL)
-			gg.Expect(err).NotTo(HaveOccurred())
-			defer resp.Body.Close()
-			gg.Expect(resp.StatusCode).To(Equal(http.StatusOK))
-
-			body, err := io.ReadAll(resp.Body)
-			gg.Expect(err).NotTo(HaveOccurred())
-			gg.Expect(len(body)).To(BeNumerically(">", 0))
-		}, TestTimeoutShort).Should(Succeed())
+		VerifyLogFileEndpointReturnsContent(test, NewWithT(t), client, historyServerURL, nodeID)
 	})
 
 	test.T().Run("should reject path traversal from S3", func(t *testing.T) {
-		g := NewWithT(t)
-		maliciousPaths := []string{"../etc/passwd", "..", "/etc/passwd", "../../secret"}
-
-		for _, malicious := range maliciousPaths {
-			g.Eventually(func(gg Gomega) {
-				url := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, EndpointLogFile, nodeID, malicious)
-				resp, err := client.Get(url)
-				gg.Expect(err).NotTo(HaveOccurred())
-				defer func() {
-					io.Copy(io.Discard, resp.Body)
-					resp.Body.Close()
-				}()
-				gg.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest))
-			}, TestTimeoutShort).Should(Succeed())
-		}
+		VerifyLogFileEndpointRejectsPathTraversal(test, NewWithT(t), client, historyServerURL, nodeID)
 	})
 
 	DeleteS3Bucket(test, g, s3Client)
diff --git a/historyserver/test/support/historyserver.go b/historyserver/test/support/historyserver.go
@@ -217,3 +217,58 @@ func GetOneOfNodeID(g *WithT, client *http.Client, historyServerURL string) stri
 	nodeInfo := summary[0].(map[string]any)
 	return nodeInfo["raylet"].(map[string]any)["nodeId"].(string)
 }
+
+// VerifyLogFileEndpointReturnsContent verifies that the log file endpoint returns content.
+func VerifyLogFileEndpointReturnsContent(test Test, g *WithT, client *http.Client, historyServerURL, nodeID string) {
+	filename := "raylet.out"
+	endpointLogFile := "/api/v0/logs/file"
+
+	g.Eventually(func(gg Gomega) {
+		logFileURL := fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", historyServerURL, endpointLogFile, nodeID, filename)
+		resp, err := client.Get(logFileURL)
+		gg.Expect(err).NotTo(HaveOccurred())
+		defer resp.Body.Close()
+		gg.Expect(resp.StatusCode).To(Equal(http.StatusOK))
+
+		body, err := io.ReadAll(resp.Body)
+		gg.Expect(err).NotTo(HaveOccurred())
+		gg.Expect(len(body)).To(BeNumerically(">", 0))
+	}, TestTimeoutShort).Should(Succeed())
+
+	LogWithTimestamp(test.T(), "Log file endpoint returned content successfully")
+}
+
+// VerifyLogFileEndpointRejectsPathTraversal verifies that the log file endpoint rejects path traversal attempts.
+//
+// For each malicious filename it polls /api/v0/logs/file with the given node ID until
+// the server answers 400 Bad Request. The assertion on g fails if that does not happen
+// within TestTimeoutShort, and the failure description names the filename that was
+// not rejected. A success line is logged through test.T() once every filename has
+// been rejected.
+func VerifyLogFileEndpointRejectsPathTraversal(test Test, g *WithT, client *http.Client, historyServerURL, nodeID string) {
+	endpointLogFile := "/api/v0/logs/file"
+	maliciousPaths := []string{"../etc/passwd", "..", "/etc/passwd", "../../secret"}
+
+	for _, malicious := range maliciousPaths {
+		// Built once per filename; named requestURL so it cannot shadow the net/url package.
+		requestURL := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, endpointLogFile, nodeID, malicious)
+
+		g.Eventually(func(gg Gomega) {
+			resp, err := client.Get(requestURL)
+			gg.Expect(err).NotTo(HaveOccurred())
+			defer func() {
+				// Best-effort drain before closing; the outcome of the copy does not
+				// affect the check, so its result is discarded explicitly.
+				_, _ = io.Copy(io.Discard, resp.Body)
+				resp.Body.Close()
+			}()
+			gg.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest))
+		}, TestTimeoutShort).Should(Succeed(), "filename %q should be rejected with 400 Bad Request", malicious)
+	}
+
+	LogWithTimestamp(test.T(), "Log file endpoint correctly rejected path traversal attempts")
+}
+
+// DeleteRayClusterAndWait issues a delete for the RayCluster clusterName in namespace and
+// then blocks until looking the cluster up returns a Kubernetes NotFound error. The
+// assertion on g fails if the delete call errors or if the cluster is still found when
+// TestTimeoutMedium elapses. Progress is logged through test.T().
+func DeleteRayClusterAndWait(test Test, g *WithT, namespace string, clusterName string) {
+	rayClusters := test.Client().Ray().RayV1().RayClusters(namespace)
+	deleteErr := rayClusters.Delete(test.Ctx(), clusterName, metav1.DeleteOptions{})
+	g.Expect(deleteErr).NotTo(HaveOccurred())
+	LogWithTimestamp(test.T(), "Deleted RayCluster %s/%s", namespace, clusterName)
+
+	// Only a NotFound lookup error ends the wait; any other result, including a
+	// successful Get, keeps polling.
+	lookupErr := func() error {
+		_, getErr := GetRayCluster(test, namespace, clusterName)
+		return getErr
+	}
+	isGone := WithTransform(k8serrors.IsNotFound, BeTrue())
+	g.Eventually(lookupErr, TestTimeoutMedium).Should(isGone)
+
+	LogWithTimestamp(test.T(), "RayCluster %s/%s fully deleted", namespace, clusterName)
+}