From 1eb0afb49288fc4a5f878a4215607026101773c1 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 29 Jan 2026 19:14:39 +0800 Subject: [PATCH 01/37] test: add more test case for live cluster e2e test Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 88 +++++++++++++++----- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index e2cb61afd3c..00ffc59816b 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -171,38 +171,80 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names // Hardcode "raylet.out" for deterministic testing. filename := "raylet.out" - test.T().Run("should return log content", func(t *testing.T) { - g := NewWithT(t) - g.Eventually(func(gg Gomega) { - logFileURL := fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) - resp, err := client.Get(logFileURL) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) - - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(body)).To(BeNumerically(">", 0)) - }, TestTimeoutShort).Should(Succeed()) - }) - - test.T().Run("should reject path traversal", func(t *testing.T) { - g := NewWithT(t) - maliciousPaths := []string{"../etc/passwd", "..", "/etc/passwd", "../../secret"} + // Define test cases covering different parameters + logFileTestCases := []struct { + name string + buildURL func(baseURL, nodeID string) string + expectedStatus int + }{ + // lines parameter + {"lines=100", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"lines=0 (default 1000)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s", u, EndpointLogFile, n, 
filename) }, http.StatusOK}, + {"lines=-1 (all)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=-1", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // timeout parameter + {"timeout=5", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=5", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"timeout=30", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=30", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // attempt_number parameter + {"attempt_number=0", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"attempt_number=1", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // download_filename parameter + {"download_filename=custom.log", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_filename=custom.log", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // filter_ansi_code parameter + {"filter_ansi_code=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"filter_ansi_code=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // Combined parameters + {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // Missing mandatory parameters + {"missing node_id", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + {"missing filename", func(u, n string) string { return 
fmt.Sprintf("%s%s?node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"missing both", func(u, n string) string { return fmt.Sprintf("%s%s", u, EndpointLogFile) }, http.StatusBadRequest}, + + // Invalid parameters + // NOTE: Ray Dashboard will only return 500 (Internal Server Error) for the exceptions (including file not found and invalid parameter) + // ref: https://github.com/ray-project/ray/blob/68d01c4c48a59c7768ec9c2359a1859966c446b6/python/ray/dashboard/modules/state/state_head.py#L282-L284 + {"invalid lines (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, + {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, + {"invalid attempt_number (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, + {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusInternalServerError}, + + // Path traversal attacks + {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal ..", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=..", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal /etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=/etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal ../../secret", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../../secret", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal in node_id", func(u, n string) string { 
return fmt.Sprintf("%s%s?node_id=../evil&filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + } - for _, malicious := range maliciousPaths { + for _, tc := range logFileTestCases { + test.T().Run(tc.name, func(t *testing.T) { + g := NewWithT(t) g.Eventually(func(gg Gomega) { - url := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, EndpointLogFile, nodeID, malicious) + url := tc.buildURL(historyServerURL, nodeID) resp, err := client.Get(url) gg.Expect(err).NotTo(HaveOccurred()) defer func() { io.Copy(io.Discard, resp.Body) resp.Body.Close() }() - gg.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) + + gg.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), + "Test case '%s' failed: expected status %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) + + if tc.expectedStatus == http.StatusOK { + body, err := io.ReadAll(resp.Body) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(body)).To(BeNumerically(">", 0), + "Test case '%s' should return non-empty body", tc.name) + } }, TestTimeoutShort).Should(Succeed()) - } - }) + }) + } DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Log file endpoint tests completed") From 64df48dc6fc30c105189a9ff7b7a5ec50b6349e9 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 29 Jan 2026 19:21:01 +0800 Subject: [PATCH 02/37] test: more test case for dead cluster e2e test Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 72 +++++++++++++------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 00ffc59816b..4f8a811ce67 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -257,9 +257,10 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names // 2. Submit a Ray job to the existing cluster // 3. Delete RayCluster to trigger log upload to S3 // 4. 
Apply History Server and get its URL -// 5. Verify that the history server can fetch log content from S3 (raylet.out) -// 6. Verify that the history server rejects path traversal attempts from S3 -// 7. Delete S3 bucket to ensure test isolation +// 5. Verify that the history server can fetch log content from S3 +// 6. Verify parameter validation for dead cluster +// 7. Verify security (path traversal) protection +// 8. Delete S3 bucket to ensure test isolation func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Namespace, s3Client *s3.S3) { rayCluster := PrepareTestEnv(test, g, namespace, s3Client) ApplyRayJobAndWaitForCompletion(test, g, namespace, rayCluster) @@ -288,38 +289,61 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // Hardcode "raylet.out" for deterministic testing. filename := "raylet.out" - test.T().Run("should return log content from S3", func(t *testing.T) { - g := NewWithT(t) - g.Eventually(func(gg Gomega) { - logFileURL := fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) - resp, err := client.Get(logFileURL) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + // Define test cases for dead cluster (S3 storage) + // Note: Dead cluster only supports basic parameters (node_id, filename, lines) + // Advanced features like timeout, attempt_number, download_filename, filter_ansi_code + // are only available for live clusters + logFileTestCases := []struct { + name string + buildURL func(baseURL, nodeID string) string + expectedStatus int + }{ + // Supported parameters + {"lines=100", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"lines=0 (default 1000)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s", u, EndpointLogFile, n, filename) }, 
http.StatusOK}, + {"lines=-1 (all)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=-1", u, EndpointLogFile, n, filename) }, http.StatusOK}, - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(body)).To(BeNumerically(">", 0)) - }, TestTimeoutShort).Should(Succeed()) - }) + // Missing mandatory parameters + {"missing node_id", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + {"missing filename", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"missing both", func(u, n string) string { return fmt.Sprintf("%s%s", u, EndpointLogFile) }, http.StatusBadRequest}, - test.T().Run("should reject path traversal from S3", func(t *testing.T) { - g := NewWithT(t) - maliciousPaths := []string{"../etc/passwd", "..", "/etc/passwd", "../../secret"} + // Invalid parameters + {"invalid lines (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusNotFound}, - for _, malicious := range maliciousPaths { + // Path traversal attacks + {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal ..", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=..", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal /etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=/etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal ../../secret", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../../secret", u, 
EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal in node_id", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=../evil&filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + } + + for _, tc := range logFileTestCases { + test.T().Run(tc.name, func(t *testing.T) { + g := NewWithT(t) g.Eventually(func(gg Gomega) { - url := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, EndpointLogFile, nodeID, malicious) + url := tc.buildURL(historyServerURL, nodeID) resp, err := client.Get(url) gg.Expect(err).NotTo(HaveOccurred()) defer func() { io.Copy(io.Discard, resp.Body) resp.Body.Close() }() - gg.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) + + gg.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), + "Test case '%s' failed: expected status %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) + + if tc.expectedStatus == http.StatusOK { + body, err := io.ReadAll(resp.Body) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(body)).To(BeNumerically(">", 0), + "Test case '%s' should return non-empty body", tc.name) + } }, TestTimeoutShort).Should(Succeed()) - } - }) + }) + } DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") From 4a4f72924422af8b63d3e5e884f0e2396c00a298 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 29 Jan 2026 19:30:07 +0800 Subject: [PATCH 03/37] refactor: clean up comments Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 4f8a811ce67..c9738b264b0 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -171,7 +171,6 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names // Hardcode "raylet.out" for deterministic testing. 
filename := "raylet.out" - // Define test cases covering different parameters logFileTestCases := []struct { name string buildURL func(baseURL, nodeID string) string @@ -289,10 +288,9 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // Hardcode "raylet.out" for deterministic testing. filename := "raylet.out" - // Define test cases for dead cluster (S3 storage) - // Note: Dead cluster only supports basic parameters (node_id, filename, lines) - // Advanced features like timeout, attempt_number, download_filename, filter_ansi_code - // are only available for live clusters + // NOTE: Dead clusters only support basic parameters (node_id, filename, lines). + // Advanced features like timeout, attempt_number, download_filename, and filter_ansi_code + // are currently available only for live clusters; they will be supported for dead clusters in the future. logFileTestCases := []struct { name string buildURL func(baseURL, nodeID string) string From 0d893385116bc66a0e311859a2912fcd061c8ffd Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 30 Jan 2026 18:25:23 +0800 Subject: [PATCH 04/37] feat: move query options to struct&add more options Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 8 ++- historyserver/pkg/historyserver/router.go | 82 +++++++++++++++++------ historyserver/pkg/historyserver/types.go | 20 ++++++ 3 files changed, 87 insertions(+), 23 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 2914eca18af..a167f023210 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -73,8 +73,9 @@ func (s *ServerHandler) _getNodeLogs(rayClusterNameID, sessionId, nodeId, dir st return json.Marshal(ret) } -func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID, nodeID, filename string, maxLines int) ([]byte, error) { - logPath := path.Join(sessionID, "logs", nodeID, filename) +func (s 
*ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, options GetLogFileOptions) ([]byte, error) { + // Build log path with attempt_number if specified + logPath := path.Join(sessionID, "logs", options.NodeID, options.Filename) reader := s.reader.GetContent(rayClusterNameID, logPath) @@ -82,8 +83,9 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID, nodeID, fil return nil, utils.NewHTTPError(fmt.Errorf("log file not found: %s", logPath), http.StatusNotFound) } + maxLines := options.Lines if maxLines < 0 { - // -1 means read all lines, match Ray Dashboard API behavior + // -1 means read all lines return io.ReadAll(reader) } diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index daeb6954e87..cf369ef6680 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -109,6 +109,10 @@ func routerAPI(s *ServerHandler) { Doc("get logfile").Param(ws.QueryParameter("node_id", "node_id")). Param(ws.QueryParameter("filename", "filename")). Param(ws.QueryParameter("lines", "lines")). + Param(ws.QueryParameter("timeout", "timeout")). + Param(ws.QueryParameter("attempt_number", "attempt_number")). + Param(ws.QueryParameter("download_file", "download_file")). + Param(ws.QueryParameter("filter_ansi_code", "filter_ansi_code")). Produces("text/plain"). 
Writes("")) // Placeholder for specific return type @@ -573,24 +577,26 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo clusterNamespace := req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string) sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) - // Parse query parameters - nodeID := req.QueryParameter("node_id") - filename := req.QueryParameter("filename") - lines := req.QueryParameter("lines") + // Parse query parameters into GetLogFileOptions struct + options, err := parseGetLogFileOptions(req) + if err != nil { + resp.WriteErrorString(http.StatusBadRequest, err.Error()) + return + } // Validate required parameters - if nodeID == "" { + if options.NodeID == "" { resp.WriteErrorString(http.StatusBadRequest, "Missing required parameter: node_id") return } - if filename == "" { + if options.Filename == "" { resp.WriteErrorString(http.StatusBadRequest, "Missing required parameter: filename") return } // Prevent path traversal attacks (e.g., ../../etc/passwd) - if !fs.ValidPath(nodeID) || !fs.ValidPath(filename) { - resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (node_id=%s, filename=%s)", nodeID, filename)) + if !fs.ValidPath(options.NodeID) || !fs.ValidPath(options.Filename) { + resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (node_id=%s, filename=%s)", options.NodeID, options.Filename)) return } @@ -599,18 +605,7 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } - // Convert lines parameter to int - maxLines := 0 - if lines != "" { - parsedLines, err := strconv.Atoi(lines) - if err != nil { - resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid lines parameter: %s", lines)) - return - } - maxLines = parsedLines - } - - content, err := s._getNodeLogFile(clusterNameID+"_"+clusterNamespace, sessionName, nodeID, filename, maxLines) + content, err := 
s._getNodeLogFile(clusterNameID+"_"+clusterNamespace, sessionName, options) if err != nil { var httpErr *utils.HTTPError if errors.As(err, &httpErr) { @@ -625,6 +620,53 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo resp.Write(content) } +// parseGetLogFileOptions parses query parameters into GetLogFileOptions struct +func parseGetLogFileOptions(req *restful.Request) (GetLogFileOptions, error) { + options := GetLogFileOptions{ + NodeID: req.QueryParameter("node_id"), + Filename: req.QueryParameter("filename"), + } + + // Parse lines parameter + if linesStr := req.QueryParameter("lines"); linesStr != "" { + lines, err := strconv.Atoi(linesStr) + if err != nil { + return options, fmt.Errorf("invalid lines parameter: %s", linesStr) + } + options.Lines = lines + } + + // Parse timeout parameter + if timeoutStr := req.QueryParameter("timeout"); timeoutStr != "" { + timeout, err := strconv.Atoi(timeoutStr) + if err != nil { + return options, fmt.Errorf("invalid timeout parameter: %s", timeoutStr) + } + options.Timeout = timeout + } + + // Parse attempt_number parameter + if attemptStr := req.QueryParameter("attempt_number"); attemptStr != "" { + attemptNumber, err := strconv.Atoi(attemptStr) + if err != nil { + return options, fmt.Errorf("invalid attempt_number parameter: %s", attemptStr) + } + options.AttemptNumber = attemptNumber + } + + // Parse filter_ansi_code parameter (boolean) + if filterStr := req.QueryParameter("filter_ansi_code"); filterStr != "" { + options.FilterAnsiCode = strings.ToLower(filterStr) == "true" + } + + // Parse download_file parameter (boolean) + if downloadStr := req.QueryParameter("download_file"); downloadStr != "" { + options.DownloadFile = strings.ToLower(downloadStr) == "true" + } + + return options, nil +} + func (s *ServerHandler) getTaskSummarize(req *restful.Request, resp *restful.Response) { clusterName := req.Attribute(COOKIE_CLUSTER_NAME_KEY).(string) clusterNamespace := 
req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string) diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go index 0c9831adfbb..d87998bf793 100644 --- a/historyserver/pkg/historyserver/types.go +++ b/historyserver/pkg/historyserver/types.go @@ -1,5 +1,25 @@ package historyserver +// GetLogFileOptions contains all options for fetching log files +type GetLogFileOptions struct { + // Required parameters + NodeID string // The node id where the log file is located + Filename string // The log file name + + // Optional parameters with defaults + // Number of lines to return, default to DEFAULT_LOG_LIMIT (1000) + // -1 = all lines + Lines int + // Timeout in seconds for the request, default to 0 (no timeout) + Timeout int + // Attempt number for task retries, default to 0 (first attempt) + AttemptNumber int + // Whether to filter ANSI escape codes from logs, default to False + FilterAnsiCode bool + // Whether to set Content-Disposition header for file download, default to false + DownloadFile bool +} + type ReplyTaskInfo struct { Data TaskInfoData `json:"data"` Msg string `json:"msg"` From 5dc5f12fbcda1b31d32840560420481fd444804 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 30 Jan 2026 18:33:46 +0800 Subject: [PATCH 05/37] feat: implement attempt_number, download_file, filter_ansi_code function Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 30 +++++++++++++++++++++-- historyserver/pkg/historyserver/router.go | 6 +++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index a167f023210..2d039e3db9e 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -8,6 +8,7 @@ import ( "io" "net/http" "path" + "regexp" "sort" "strings" @@ -26,6 +27,15 @@ const ( MAX_LOG_LIMIT = 10000 ) +// ANSI escape code pattern for filtering colored output from logs +// 
Matches patterns like: \x1b[31m (red color), \x1b[0m (reset), etc. +var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]+m`) + +// filterAnsiEscapeCodes removes ANSI escape sequences from log content +func filterAnsiEscapeCodes(content []byte) []byte { + return ansiEscapePattern.ReplaceAll(content, []byte("")) +} + func (s *ServerHandler) listClusters(limit int) []utils.ClusterInfo { // Initial continuation marker logrus.Debugf("Prepare to get list clusters info ...") @@ -76,6 +86,10 @@ func (s *ServerHandler) _getNodeLogs(rayClusterNameID, sessionId, nodeId, dir st func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, options GetLogFileOptions) ([]byte, error) { // Build log path with attempt_number if specified logPath := path.Join(sessionID, "logs", options.NodeID, options.Filename) + if options.AttemptNumber > 0 { + // Append attempt number to filename (e.g., worker.log.1) + logPath = fmt.Sprintf("%s.%d", logPath, options.AttemptNumber) + } reader := s.reader.GetContent(rayClusterNameID, logPath) @@ -86,7 +100,14 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti maxLines := options.Lines if maxLines < 0 { // -1 means read all lines - return io.ReadAll(reader) + content, err := io.ReadAll(reader) + if err != nil { + return nil, err + } + if options.FilterAnsiCode { + content = filterAnsiEscapeCodes(content) + } + return content, nil } if maxLines == 0 { @@ -134,7 +155,12 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti lines = append(buffer[start:], buffer[:start]...) 
} - return []byte(strings.Join(lines, "\n")), nil + result := []byte(strings.Join(lines, "\n")) + if options.FilterAnsiCode { + result = filterAnsiEscapeCodes(result) + } + + return result, nil } func (s *ServerHandler) GetNodes(rayClusterNameID, sessionId string) ([]byte, error) { diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index cf369ef6680..a17c1e74672 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -617,6 +617,12 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo } return } + + // Set Content-Disposition header if download_file is requested + if options.DownloadFile { + resp.AddHeader("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", options.Filename)) + } + resp.Write(content) } From 6de7601461c52acd2aa940184c85dd6422bf5079 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 30 Jan 2026 19:41:13 +0800 Subject: [PATCH 06/37] feat: e2e test for attempt_number, download_file, filter_ansi_code timeout only test validation, not testing the behavior Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 125 ++++++++++++++++++- 1 file changed, 121 insertions(+), 4 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index c9738b264b0..9900c64ddf7 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "net/http" + "regexp" "testing" "github.com/aws/aws-sdk-go/service/s3" @@ -23,6 +24,10 @@ const ( EndpointLogFile = "/api/v0/logs/file" ) +// ansiEscapePattern matches ANSI escape sequences (same pattern as in reader.go) +// Pattern: \x1b\[[0-9;]+m +var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]+m`) + func TestHistoryServer(t *testing.T) { // Share a single S3 client among subtests. 
s3Client := EnsureS3Client(t) @@ -288,19 +293,37 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // Hardcode "raylet.out" for deterministic testing. filename := "raylet.out" - // NOTE: Dead clusters only support basic parameters (node_id, filename, lines). - // Advanced features like timeout, attempt_number, download_filename, and filter_ansi_code - // are currently available only for live clusters; they will be supported for dead clusters in the future. logFileTestCases := []struct { name string buildURL func(baseURL, nodeID string) string expectedStatus int }{ - // Supported parameters + // Basic parameters {"lines=100", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"lines=0 (default 1000)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"lines=-1 (all)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=-1", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // timeout parameter + // NOTE: timeout feature is not yet implemented, we just accept and validate the timeout parameter + {"timeout=5", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=5", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"timeout=30", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=30", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // attempt_number parameter + {"attempt_number=0", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"attempt_number=1 (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", u, EndpointLogFile, n, filename) }, http.StatusNotFound}, + + // download_file parameter + {"download_file=true", 
func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"download_file=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // filter_ansi_code parameter + {"filter_ansi_code=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"filter_ansi_code=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // Combined parameters + {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"all parameters", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100&timeout=15&attempt_number=0&download_file=true&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // Missing mandatory parameters {"missing node_id", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, {"missing filename", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -308,6 +331,8 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // Invalid parameters {"invalid lines (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"invalid attempt_number 
(string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusNotFound}, // Path traversal attacks @@ -343,6 +368,98 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names }) } + // Sub-tests for specific parameter behaviors + test.T().Run("download_file header validation", func(t *testing.T) { + g := NewWithT(t) + g.Eventually(func(gg Gomega) { + // Test with download_file=true + urlWithDownload := fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=true", historyServerURL, EndpointLogFile, nodeID, filename) + resp, err := client.Get(urlWithDownload) + gg.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() + + gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + contentDisposition := resp.Header.Get("Content-Disposition") + gg.Expect(contentDisposition).To(ContainSubstring("attachment")) + gg.Expect(contentDisposition).To(ContainSubstring(fmt.Sprintf("filename=\"%s\"", filename))) + LogWithTimestamp(test.T(), "download_file=true sets Content-Disposition header: %s", contentDisposition) + + // Test with download_file=false, should not have Content-Disposition header + urlWithoutDownload := fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=false", historyServerURL, EndpointLogFile, nodeID, filename) + resp2, err := client.Get(urlWithoutDownload) + gg.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() + + gg.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) + contentDisposition2 := resp2.Header.Get("Content-Disposition") + gg.Expect(contentDisposition2).To(BeEmpty(), "Content-Disposition should not be set when download_file=false") + }, TestTimeoutShort).Should(Succeed()) + }) + + test.T().Run("filter_ansi_code behavior", func(t *testing.T) { + g := NewWithT(t) + 
g.Eventually(func(gg Gomega) { + // Fetch with filter_ansi_code=false (original content with ANSI codes) + urlWithoutFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) + resp, err := client.Get(urlWithoutFilter) + gg.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() + + gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + bodyWithoutFilter, err := io.ReadAll(resp.Body) + gg.Expect(err).NotTo(HaveOccurred()) + + // Fetch with filter_ansi_code=true (ANSI codes should be removed) + urlWithFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) + resp2, err := client.Get(urlWithFilter) + gg.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() + + gg.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) + bodyWithFilter, err := io.ReadAll(resp2.Body) + gg.Expect(err).NotTo(HaveOccurred()) + + // Check if original content contains ANSI codes using the same pattern as reader.go + hasAnsiInOriginal := ansiEscapePattern.Match(bodyWithoutFilter) + + if hasAnsiInOriginal { + LogWithTimestamp(test.T(), "Original log contains ANSI codes, verifying they are filtered") + // Filtered content should NOT contain ANSI escape sequences + hasAnsiInFiltered := ansiEscapePattern.Match(bodyWithFilter) + gg.Expect(hasAnsiInFiltered).To(BeFalse(), "Filtered content should not contain ANSI escape sequences") + } else { + LogWithTimestamp(test.T(), "Log doesn't contain ANSI codes, check is skipped...") + } + }, TestTimeoutShort).Should(Succeed()) + }) + + test.T().Run("attempt_number behavior", func(t *testing.T) { + g := NewWithT(t) + g.Eventually(func(gg Gomega) { + // Test with attempt_number=0 + urlAttempt0 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", historyServerURL, EndpointLogFile, nodeID, filename) + resp, err := client.Get(urlAttempt0) + gg.Expect(err).NotTo(HaveOccurred()) + defer 
resp.Body.Close() + + gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + body, err := io.ReadAll(resp.Body) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(body)).To(BeNumerically(">", 0)) + LogWithTimestamp(test.T(), "attempt_number=0 returned %d bytes", len(body)) + + // attempt_number=1 should fail as retry log doesn't exist for normal execution + urlAttempt1 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", historyServerURL, EndpointLogFile, nodeID, filename) + resp2, err := client.Get(urlAttempt1) + gg.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() + + gg.Expect(resp2.StatusCode).To(Equal(http.StatusNotFound), + "attempt_number=1 should return 404 when retry log doesn't exist") + LogWithTimestamp(test.T(), "attempt_number=1 correctly returns 404 (file not found)") + }, TestTimeoutShort).Should(Succeed()) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") } From 8d11aff732a3f27ed6a329e0b1fe1809b84692a8 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 30 Jan 2026 20:04:06 +0800 Subject: [PATCH 07/37] fix: update live cluster invalide param status code Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 9900c64ddf7..e311304a244 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -210,11 +210,11 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names {"missing both", func(u, n string) string { return fmt.Sprintf("%s%s", u, EndpointLogFile) }, http.StatusBadRequest}, // Invalid parameters - // NOTE: Ray Dashboard will only return 500 (Internal Server Error) for the exceptions (including file not found and invalid parameter) + {"invalid lines (string)", func(u, n string) string { 
return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"invalid attempt_number (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + // NOTE: Ray Dashboard will return 500 (Internal Server Error) for the file not found error // ref: https://github.com/ray-project/ray/blob/68d01c4c48a59c7768ec9c2359a1859966c446b6/python/ray/dashboard/modules/state/state_head.py#L282-L284 - {"invalid lines (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, - {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, - {"invalid attempt_number (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusInternalServerError}, // Path traversal attacks From ff75e12cf483b81224bf0598ba6ba50e9a816f9c Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 15:16:19 +0800 Subject: [PATCH 08/37] feat: add id related param&implement task_id+suffix Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 112 +++++++++++++++++++++- historyserver/pkg/historyserver/router.go | 60 +++++++++--- historyserver/pkg/historyserver/types.go | 12 ++- 3 files changed, 165 insertions(+), 19 deletions(-) diff --git 
a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 2d039e3db9e..44f0331d4b5 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -13,6 +13,7 @@ import ( "strings" "github.com/emicklei/go-restful/v3" + eventtypes "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" "github.com/ray-project/kuberay/historyserver/pkg/utils" "github.com/sirupsen/logrus" ) @@ -84,10 +85,18 @@ func (s *ServerHandler) _getNodeLogs(rayClusterNameID, sessionId, nodeId, dir st } func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, options GetLogFileOptions) ([]byte, error) { - // Build log path with attempt_number if specified - logPath := path.Join(sessionID, "logs", options.NodeID, options.Filename) - if options.AttemptNumber > 0 { - // Append attempt number to filename (e.g., worker.log.1) + // Resolve node_id and filename based on options + nodeID, filename, err := s.resolveLogFilename(rayClusterNameID, options) + if err != nil { + return nil, utils.NewHTTPError(err, http.StatusBadRequest) + } + + // Build log path + logPath := path.Join(sessionID, "logs", nodeID, filename) + + // Append attempt_number if specified and not using task_id + // (task_id already includes attempt_number in resolution) + if options.AttemptNumber > 0 && options.TaskID == "" { logPath = fmt.Sprintf("%s.%d", logPath, options.AttemptNumber) } @@ -163,6 +172,101 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti return result, nil } +// resolveLogFilename resolves the log file node_id and filename based on the provided options. +// This mirrors Ray Dashboard's resolve_filename logic. 
+func (s *ServerHandler) resolveLogFilename(clusterNameID string, options GetLogFileOptions) (nodeID, filename string, err error) { + // Validate suffix + if options.Suffix != "out" && options.Suffix != "err" { + return "", "", fmt.Errorf("invalid suffix: %s (must be 'out' or 'err')", options.Suffix) + } + + // If filename is explicitly provided, use it and ignore suffix + if options.Filename != "" { + if options.NodeID == "" { + return "", "", fmt.Errorf("node_id is required when filename is provided") + } + return options.NodeID, options.Filename, nil + } + + // If task_id is provided, resolve from task events + if options.TaskID != "" { + return s.resolveTaskLogFilename(clusterNameID, options.TaskID, options.AttemptNumber, options.Suffix) + } + + // If actor_id is provided, resolve from actor events + // TODO: not implemented + if options.ActorID != "" { + return "", "", fmt.Errorf("actor_id resolution not yet implemented") + } + + // If pid is provided, resolve worker log file + // TODO: not implemented + if options.PID > 0 { + return "", "", fmt.Errorf("pid resolution not yet implemented") + } + + return "", "", fmt.Errorf("must provide one of: filename, task_id, actor_id, or pid") +} + +// resolveTaskLogFilename resolves log file for a task by querying task events. +// This mirrors Ray Dashboard's _resolve_task_filename logic. 
+func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, taskID string, attemptNumber int, suffix string) (nodeID, filename string, err error) { + // Get task attempts by task ID + taskAttempts, found := s.eventHandler.GetTaskByID(clusterNameID, taskID) + if !found { + return "", "", fmt.Errorf("task not found: task_id=%s", taskID) + } + + // Find the specific attempt + var foundTask *eventtypes.Task + for i, task := range taskAttempts { + if task.AttemptNumber == attemptNumber { + foundTask = &taskAttempts[i] + break + } + } + + if foundTask == nil { + return "", "", fmt.Errorf("task attempt not found: task_id=%s, attempt_number=%d", taskID, attemptNumber) + } + + // Check if task has node_id + if foundTask.NodeID == "" { + return "", "", fmt.Errorf("task %s (attempt %d) has no node_id (task not scheduled yet)", taskID, attemptNumber) + } + + // Check if task has TaskLogInfo + if foundTask.TaskLogInfo == nil { + // Check if this is an actor task + if foundTask.ActorID != "" { + return "", "", fmt.Errorf( + "for actor task, please query actor log for actor(%s) by providing actor_id query parameter", + foundTask.ActorID, + ) + } + return "", "", fmt.Errorf( + "task %s (attempt %d) has no task_log_info (worker_id=%s, node_id=%s)", + taskID, attemptNumber, foundTask.WorkerID, foundTask.NodeID, + ) + } + + // Get the log filename from task_log_info based on suffix + filenameKey := "stdout_file" + if suffix == "err" { + filenameKey = "stderr_file" + } + + logFilename, ok := foundTask.TaskLogInfo[filenameKey] + if !ok || logFilename == "" { + return "", "", fmt.Errorf( + "missing log filename info (%s) in task_log_info for task %s (attempt %d)", + filenameKey, taskID, attemptNumber, + ) + } + + return foundTask.NodeID, logFilename, nil +} + func (s *ServerHandler) GetNodes(rayClusterNameID, sessionId string) ([]byte, error) { logPath := path.Join(sessionId, "logs") nodes := s.reader.ListFiles(rayClusterNameID, logPath) diff --git 
a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index a17c1e74672..44f756ab0fb 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -106,13 +106,18 @@ func routerAPI(s *ServerHandler) { Doc("get appliations").Param(ws.QueryParameter("node_id", "node_id")). Writes("")) // Placeholder for specific return type ws.Route(ws.GET("/v0/logs/file").To(s.getNodeLogFile).Filter(s.CookieHandle). - Doc("get logfile").Param(ws.QueryParameter("node_id", "node_id")). - Param(ws.QueryParameter("filename", "filename")). - Param(ws.QueryParameter("lines", "lines")). + Doc("get logfile"). + Param(ws.QueryParameter("node_id", "node_id (optional if task_id is provided)")). + Param(ws.QueryParameter("filename", "filename (explicit log file path)")). + Param(ws.QueryParameter("task_id", "task_id (resolve log file from task)")). + Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor, not yet implemented)")). + Param(ws.QueryParameter("pid", "pid (resolve log file from process id, not yet implemented)")). + Param(ws.QueryParameter("suffix", "suffix (out or err, default: out, used with task_id/actor_id/pid)")). + Param(ws.QueryParameter("lines", "lines (number of lines to return, default: 1000)")). Param(ws.QueryParameter("timeout", "timeout")). - Param(ws.QueryParameter("attempt_number", "attempt_number")). - Param(ws.QueryParameter("download_file", "download_file")). - Param(ws.QueryParameter("filter_ansi_code", "filter_ansi_code")). + Param(ws.QueryParameter("attempt_number", "attempt_number (task retry attempt number, default: 0)")). + Param(ws.QueryParameter("download_file", "download_file (true/false)")). + Param(ws.QueryParameter("filter_ansi_code", "filter_ansi_code (true/false)")). Produces("text/plain"). 
Writes("")) // Placeholder for specific return type @@ -584,19 +589,26 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } - // Validate required parameters - if options.NodeID == "" { - resp.WriteErrorString(http.StatusBadRequest, "Missing required parameter: node_id") + // Validate required parameters following Ray Dashboard logic + // At least one of: actor_id, task_id, pid, filename, submission_id must be provided + if options.ActorID == "" && options.TaskID == "" && options.PID == 0 && options.Filename == "" { + resp.WriteErrorString(http.StatusBadRequest, "At least one of actor_id, task_id, pid, or filename is required") return } - if options.Filename == "" { - resp.WriteErrorString(http.StatusBadRequest, "Missing required parameter: filename") + + // node_id is required when not using actor_id or task_id (they can auto-resolve node_id) + if options.NodeID == "" && options.ActorID == "" && options.TaskID == "" { + resp.WriteErrorString(http.StatusBadRequest, "node_id is required when actor_id or task_id is not provided") return } // Prevent path traversal attacks (e.g., ../../etc/passwd) - if !fs.ValidPath(options.NodeID) || !fs.ValidPath(options.Filename) { - resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (node_id=%s, filename=%s)", options.NodeID, options.Filename)) + if options.NodeID != "" && !fs.ValidPath(options.NodeID) { + resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (node_id=%s)", options.NodeID)) + return + } + if options.Filename != "" && !fs.ValidPath(options.Filename) { + resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (filename=%s)", options.Filename)) return } @@ -631,6 +643,28 @@ func parseGetLogFileOptions(req *restful.Request) (GetLogFileOptions, error) { options := GetLogFileOptions{ NodeID: req.QueryParameter("node_id"), Filename: 
req.QueryParameter("filename"), + TaskID: req.QueryParameter("task_id"), + ActorID: req.QueryParameter("actor_id"), + Suffix: req.QueryParameter("suffix"), + } + + // Parse PID parameter + if pidStr := req.QueryParameter("pid"); pidStr != "" { + pid, err := strconv.Atoi(pidStr) + if err != nil { + return options, fmt.Errorf("invalid pid parameter: %s", pidStr) + } + options.PID = pid + } + + // Default suffix to "out" if not specified + if options.Suffix == "" { + options.Suffix = "out" + } + + // Validate suffix parameter + if options.Suffix != "out" && options.Suffix != "err" { + return options, fmt.Errorf("invalid suffix parameter: %s (must be 'out' or 'err')", options.Suffix) } // Parse lines parameter diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go index d87998bf793..24c3bc057a0 100644 --- a/historyserver/pkg/historyserver/types.go +++ b/historyserver/pkg/historyserver/types.go @@ -2,9 +2,14 @@ package historyserver // GetLogFileOptions contains all options for fetching log files type GetLogFileOptions struct { - // Required parameters + // Node identification (one of these is required if not using task_id/actor_id) NodeID string // The node id where the log file is located - Filename string // The log file name + + // Log file identification (provide one of: Filename, TaskID, ActorID, PID) + Filename string // The log file name (explicit path) + TaskID string // Task ID to resolve log file + ActorID string // Actor ID to resolve log file (not yet implemented) + PID int // Process ID to resolve log file (not yet implemented) // Optional parameters with defaults // Number of lines to return, default to DEFAULT_LOG_LIMIT (1000) @@ -18,6 +23,9 @@ type GetLogFileOptions struct { FilterAnsiCode bool // Whether to set Content-Disposition header for file download, default to false DownloadFile bool + // The suffix of the log file ("out" or "err"), default to "out" + // Used when resolving by TaskID, ActorID, or PID + 
Suffix string } type ReplyTaskInfo struct { From 4f562802cc6f95c332ec2e6a956673778a79c10f Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 15:25:51 +0800 Subject: [PATCH 09/37] test: remove eventual & print body when status code mismatch Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 84 +++++++++++--------- 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index e311304a244..2774e19f7e8 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -228,25 +228,29 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names for _, tc := range logFileTestCases { test.T().Run(tc.name, func(t *testing.T) { g := NewWithT(t) - g.Eventually(func(gg Gomega) { - url := tc.buildURL(historyServerURL, nodeID) - resp, err := client.Get(url) - gg.Expect(err).NotTo(HaveOccurred()) - defer func() { - io.Copy(io.Discard, resp.Body) - resp.Body.Close() - }() - - gg.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), - "Test case '%s' failed: expected status %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) - - if tc.expectedStatus == http.StatusOK { - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(body)).To(BeNumerically(">", 0), - "Test case '%s' should return non-empty body", tc.name) - } - }, TestTimeoutShort).Should(Succeed()) + + url := tc.buildURL(historyServerURL, nodeID) + resp, err := client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + defer func() { + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + }() + + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) + + if resp.StatusCode != tc.expectedStatus { + LogWithTimestamp(t, "Test case '%s' failed: expected %d, got %d, body: %s", + tc.name, tc.expectedStatus, resp.StatusCode, string(body)) + } + + 
g.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), + "Test case '%s' failed: expected %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) + + if tc.expectedStatus == http.StatusOK { + g.Expect(len(body)).To(BeNumerically(">", 0)) + } }) } @@ -346,25 +350,29 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names for _, tc := range logFileTestCases { test.T().Run(tc.name, func(t *testing.T) { g := NewWithT(t) - g.Eventually(func(gg Gomega) { - url := tc.buildURL(historyServerURL, nodeID) - resp, err := client.Get(url) - gg.Expect(err).NotTo(HaveOccurred()) - defer func() { - io.Copy(io.Discard, resp.Body) - resp.Body.Close() - }() - - gg.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), - "Test case '%s' failed: expected status %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) - - if tc.expectedStatus == http.StatusOK { - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(body)).To(BeNumerically(">", 0), - "Test case '%s' should return non-empty body", tc.name) - } - }, TestTimeoutShort).Should(Succeed()) + + url := tc.buildURL(historyServerURL, nodeID) + resp, err := client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + defer func() { + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + }() + + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) + + if resp.StatusCode != tc.expectedStatus { + LogWithTimestamp(t, "Test case '%s' failed: expected %d, got %d, body: %s", + tc.name, tc.expectedStatus, resp.StatusCode, string(body)) + } + + g.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), + "Test case '%s' failed: expected %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) + + if tc.expectedStatus == http.StatusOK { + g.Expect(len(body)).To(BeNumerically(">", 0)) + } }) } From e01c1132b94debfcb2af281ec29206033636a55c Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 18:11:50 +0800 Subject: [PATCH 10/37] feat: logic to find logs 
based on worker ID Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 114 +++++++++++++++++----- 1 file changed, 90 insertions(+), 24 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 44f0331d4b5..8194233e759 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -3,6 +3,7 @@ package historyserver import ( "bufio" "context" + "encoding/base64" "encoding/json" "fmt" "io" @@ -86,7 +87,7 @@ func (s *ServerHandler) _getNodeLogs(rayClusterNameID, sessionId, nodeId, dir st func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, options GetLogFileOptions) ([]byte, error) { // Resolve node_id and filename based on options - nodeID, filename, err := s.resolveLogFilename(rayClusterNameID, options) + nodeID, filename, err := s.resolveLogFilename(rayClusterNameID, sessionID, options) if err != nil { return nil, utils.NewHTTPError(err, http.StatusBadRequest) } @@ -174,7 +175,8 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti // resolveLogFilename resolves the log file node_id and filename based on the provided options. // This mirrors Ray Dashboard's resolve_filename logic. -func (s *ServerHandler) resolveLogFilename(clusterNameID string, options GetLogFileOptions) (nodeID, filename string, err error) { +// The sessionID parameter is required for task_id resolution to search worker log files. 
+func (s *ServerHandler) resolveLogFilename(clusterNameID, sessionID string, options GetLogFileOptions) (nodeID, filename string, err error) { // Validate suffix if options.Suffix != "out" && options.Suffix != "err" { return "", "", fmt.Errorf("invalid suffix: %s (must be 'out' or 'err')", options.Suffix) @@ -190,7 +192,7 @@ func (s *ServerHandler) resolveLogFilename(clusterNameID string, options GetLogF // If task_id is provided, resolve from task events if options.TaskID != "" { - return s.resolveTaskLogFilename(clusterNameID, options.TaskID, options.AttemptNumber, options.Suffix) + return s.resolveTaskLogFilename(clusterNameID, sessionID, options.TaskID, options.AttemptNumber, options.Suffix) } // If actor_id is provided, resolve from actor events @@ -210,7 +212,8 @@ func (s *ServerHandler) resolveLogFilename(clusterNameID string, options GetLogF // resolveTaskLogFilename resolves log file for a task by querying task events. // This mirrors Ray Dashboard's _resolve_task_filename logic. -func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, taskID string, attemptNumber int, suffix string) (nodeID, filename string, err error) { +// The sessionID parameter is required for searching worker log files when task_log_info is not available. 
+func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID string, attemptNumber int, suffix string) (nodeID, filename string, err error) { // Get task attempts by task ID taskAttempts, found := s.eventHandler.GetTaskByID(clusterNameID, taskID) if !found { @@ -235,36 +238,99 @@ func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, taskID string, att return "", "", fmt.Errorf("task %s (attempt %d) has no node_id (task not scheduled yet)", taskID, attemptNumber) } - // Check if task has TaskLogInfo - if foundTask.TaskLogInfo == nil { - // Check if this is an actor task - if foundTask.ActorID != "" { - return "", "", fmt.Errorf( - "for actor task, please query actor log for actor(%s) by providing actor_id query parameter", - foundTask.ActorID, - ) - } + // Check if this is an actor task + if foundTask.ActorID != "" { + return "", "", fmt.Errorf( + "for actor task, please query actor log for actor(%s) by providing actor_id query parameter", + foundTask.ActorID, + ) + } + + // Check if task has worker_id + if foundTask.WorkerID == "" { return "", "", fmt.Errorf( - "task %s (attempt %d) has no task_log_info (worker_id=%s, node_id=%s)", - taskID, attemptNumber, foundTask.WorkerID, foundTask.NodeID, + "task %s (attempt %d) has no worker_id", + taskID, attemptNumber, ) } - // Get the log filename from task_log_info based on suffix - filenameKey := "stdout_file" - if suffix == "err" { - filenameKey = "stderr_file" + // Try to use task_log_info if available + // NOTE: task_log_info is currently not supported in ray export event, so we will always + // fallback to following logic. 
+ if foundTask.TaskLogInfo != nil && len(foundTask.TaskLogInfo) > 0 { + filenameKey := "stdout_file" + if suffix == "err" { + filenameKey = "stderr_file" + } + + if logFilename, ok := foundTask.TaskLogInfo[filenameKey]; ok && logFilename != "" { + return foundTask.NodeID, logFilename, nil + } } - logFilename, ok := foundTask.TaskLogInfo[filenameKey] - if !ok || logFilename == "" { + // Fallback: Find worker log file by worker_id + // Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} + // We need to search for files matching this pattern + if sessionID == "" { return "", "", fmt.Errorf( - "missing log filename info (%s) in task_log_info for task %s (attempt %d)", - filenameKey, taskID, attemptNumber, + "task %s (attempt %d) has no task_log_info and sessionID is required to search for worker log files", + taskID, attemptNumber, ) } - return foundTask.NodeID, logFilename, nil + nodeIDHex, logFilename, err := s.findWorkerLogFile(clusterNameID, sessionID, foundTask.NodeID, foundTask.WorkerID, suffix) + if err != nil { + return "", "", fmt.Errorf( + "failed to find worker log file for task %s (attempt %d, worker_id=%s, node_id=%s): %w", + taskID, attemptNumber, foundTask.WorkerID, foundTask.NodeID, err, + ) + } + + return nodeIDHex, logFilename, nil +} + +// findWorkerLogFile searches for a worker log file by worker_id. +// Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} +// Returns (nodeIDHex, filename, error). 
+func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) { + // Convert Base64 node_id to hex for the file path + // Ray stores IDs in Base64 (URL-safe) in events, but uses hex in log directory structure + nodeIDBytes, err := base64.RawURLEncoding.DecodeString(nodeID) + if err != nil { + // Try standard Base64 if URL-safe fails + nodeIDBytes, err = base64.StdEncoding.DecodeString(nodeID) + if err != nil { + return "", "", fmt.Errorf("failed to decode node_id: %w", err) + } + } + nodeIDHex := fmt.Sprintf("%x", nodeIDBytes) + + // Convert Base64 worker_id to hex + workerIDBytes, err := base64.RawURLEncoding.DecodeString(workerID) + if err != nil { + // Try standard Base64 if URL-safe fails + workerIDBytes, err = base64.StdEncoding.DecodeString(workerID) + if err != nil { + return "", "", fmt.Errorf("failed to decode worker_id: %w", err) + } + } + workerIDHex := fmt.Sprintf("%x", workerIDBytes) + + // List all files in the node's log directory + logPath := path.Join(sessionID, "logs", nodeIDHex) + files := s.reader.ListFiles(clusterNameID, logPath) + + // Search for files matching pattern: worker-{worker_id_hex}-*.{suffix} + workerPrefix := fmt.Sprintf("worker-%s-", workerIDHex) + workerSuffix := fmt.Sprintf(".%s", suffix) + + for _, file := range files { + if strings.HasPrefix(file, workerPrefix) && strings.HasSuffix(file, workerSuffix) { + return nodeIDHex, file, nil + } + } + + return "", "", fmt.Errorf("worker log file not found: worker_id=%s (hex=%s), suffix=%s, searched in %s", workerID, workerIDHex, suffix, logPath) } func (s *ServerHandler) GetNodes(rayClusterNameID, sessionId string) ([]byte, error) { From 41e5970fd89318f1d55d9715953da849d628a7bb Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 18:35:30 +0800 Subject: [PATCH 11/37] test: for suffix and task_id Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 158 +++++++++++++++++++ 1 file changed, 
158 insertions(+) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 2774e19f7e8..dc5f7682210 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -201,6 +201,10 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names {"filter_ansi_code=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"filter_ansi_code=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // suffix parameter + {"suffix=out (default)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=out", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"suffix=err", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=err", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // Combined parameters {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, @@ -213,9 +217,11 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names {"invalid lines (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"invalid attempt_number (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"invalid suffix", func(u, n string) string { return 
fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, // NOTE: Ray Dashboard will return 500 (Internal Server Error) for the file not found error // ref: https://github.com/ray-project/ray/blob/68d01c4c48a59c7768ec9c2359a1859966c446b6/python/ray/dashboard/modules/state/state_head.py#L282-L284 {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusInternalServerError}, + {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusInternalServerError}, // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -254,6 +260,45 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names }) } + // Sub-test for task_id parameter (live cluster) + test.T().Run("task_id parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get all eligible task IDs + taskIDs := getAllEligibleTaskIDs(g, client, historyServerURL) + LogWithTimestamp(t, "Found %d eligible task IDs for testing", len(taskIDs)) + + var successCount int + var lastError string + + // Try each task ID until one succeeds + for _, taskID := range taskIDs { + LogWithTimestamp(t, "Testing task_id: %s", taskID) + + url := fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, taskID) + resp, err := client.Get(url) + if err != nil { + lastError = fmt.Sprintf("HTTP error for task %s: %v", taskID, err) + continue + } + + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + successCount++ + LogWithTimestamp(t, "✓ Task %s succeeded, returned %d bytes", taskID, len(body)) + break + } else { + lastError = fmt.Sprintf("task %s returned %d: %s", taskID, resp.StatusCode, 
string(body)) + LogWithTimestamp(t, "✗ Task %s failed: %s", taskID, lastError) + } + } + + g.Expect(successCount).To(BeNumerically(">", 0), + "At least one task_id should succeed. Last error: %s", lastError) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Log file endpoint tests completed") } @@ -324,6 +369,10 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"filter_ansi_code=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"filter_ansi_code=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // suffix parameter + {"suffix=out (default)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=out", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"suffix=err", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=err", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // Combined parameters {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"all parameters", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100&timeout=15&attempt_number=0&download_file=true&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, @@ -338,6 +387,8 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"invalid attempt_number (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, 
EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusNotFound}, + {"invalid suffix", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusBadRequest}, // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -468,6 +519,113 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names }, TestTimeoutShort).Should(Succeed()) }) + // Sub-test for task_id parameter + test.T().Run("task_id parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get all eligible task IDs + taskIDs := getAllEligibleTaskIDs(g, client, historyServerURL) + LogWithTimestamp(t, "Found %d eligible task IDs for testing", len(taskIDs)) + + var successCount int + var lastError string + + // Try each task ID until one succeeds + for _, taskID := range taskIDs { + LogWithTimestamp(t, "Testing task_id: %s", taskID) + + url := fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, taskID) + resp, err := client.Get(url) + if err != nil { + lastError = fmt.Sprintf("HTTP error for task %s: %v", taskID, err) + continue + } + + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + successCount++ + LogWithTimestamp(t, "✓ Task %s succeeded, returned %d bytes", taskID, len(body)) + break + } else { + lastError = fmt.Sprintf("task %s returned %d: %s", taskID, resp.StatusCode, string(body)) + LogWithTimestamp(t, "✗ Task %s failed: %s", taskID, lastError) + } + } + + 
g.Expect(successCount).To(BeNumerically(">", 0), + "At least one task_id should succeed. Last error: %s", lastError) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") } + +// getAllEligibleTaskIDs retrieves all non-actor task IDs with node_id from the /api/v0/tasks endpoint. +// Returns a list of task IDs that are eligible for log file testing. +// Note: We filter out actor tasks because they don't have task_log_info by default +// (unless RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 is set). +func getAllEligibleTaskIDs(g *WithT, client *http.Client, historyServerURL string) []string { + var taskIDs []string + g.Eventually(func(gg Gomega) { + resp, err := client.Get(historyServerURL + "/api/v0/tasks") + gg.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() + + gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + + body, err := io.ReadAll(resp.Body) + gg.Expect(err).NotTo(HaveOccurred()) + + var result map[string]interface{} + err = json.Unmarshal(body, &result) + gg.Expect(err).NotTo(HaveOccurred()) + + // Extract task_id from response + // Response format: {"result": true, "msg": "...", "data": {"result": {"result": [tasks...], ...}}} + data, ok := result["data"].(map[string]interface{}) + gg.Expect(ok).To(BeTrue(), "response should have 'data' field") + + dataResult, ok := data["result"].(map[string]interface{}) + gg.Expect(ok).To(BeTrue(), "data should have 'result' field") + + tasks, ok := dataResult["result"].([]interface{}) + gg.Expect(ok).To(BeTrue(), "result should have 'result' array") + gg.Expect(len(tasks)).To(BeNumerically(">", 0), "should have at least one task") + + // Find all non-actor tasks with node_id + for _, t := range tasks { + task, ok := t.(map[string]interface{}) + if !ok { + continue + } + + // Check if this is an actor task + actorID, _ := task["actor_id"].(string) + if actorID != "" { + // Skip actor tasks - they don't have task_log_info unless + // 
RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 is set + continue + } + + // Check if it has node_id + nodeID, _ := task["node_id"].(string) + if nodeID == "" { + // If nodeID is empty, it means the task is not scheduled yet. Skip it + // as it will not have logs + continue + } + + // Found a non-actor task with logs + taskID, ok := task["task_id"].(string) + if ok && taskID != "" { + taskIDs = append(taskIDs, taskID) + } + } + + gg.Expect(len(taskIDs)).To(BeNumerically(">", 0), "should have at least one eligible task") + }, TestTimeoutShort).Should(Succeed()) + + return taskIDs +} From 6497225dd7ea8801ed48d1daa36d91705eaf3b12 Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 18:51:08 +0800 Subject: [PATCH 12/37] fix: update rayjob.yaml to ensure produce log.out file Signed-off-by: machichima --- historyserver/config/rayjob.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/historyserver/config/rayjob.yaml b/historyserver/config/rayjob.yaml index 3ce3619364a..61357a3ca0a 100644 --- a/historyserver/config/rayjob.yaml +++ b/historyserver/config/rayjob.yaml @@ -11,6 +11,8 @@ spec: @ray.remote(num_cpus=0.5) def my_task(x): + # NOTE: Add this to ensure will produce log.out + print(f'Processing {x}', flush=True) return x * 2 @ray.remote(num_cpus=0.5) From 6b66a305fb179ea27744d13de4c7e806b3b11c28 Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 20:16:01 +0800 Subject: [PATCH 13/37] feat+test: support actor_id query Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 47 +++- historyserver/pkg/historyserver/router.go | 4 +- historyserver/test/e2e/historyserver_test.go | 214 ++++++++++++++----- 3 files changed, 211 insertions(+), 54 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 8194233e759..a0d92003d90 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -196,9 +196,8 @@ func (s *ServerHandler) 
resolveLogFilename(clusterNameID, sessionID string, opti } // If actor_id is provided, resolve from actor events - // TODO: not implemented if options.ActorID != "" { - return "", "", fmt.Errorf("actor_id resolution not yet implemented") + return s.resolveActorLogFilename(clusterNameID, sessionID, options.ActorID, options.Suffix) } // If pid is provided, resolve worker log file @@ -289,6 +288,50 @@ func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID return nodeIDHex, logFilename, nil } +// resolveActorLogFilename resolves log file for an actor by querying actor events. +// This mirrors Ray Dashboard's _resolve_actor_filename logic. +func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorID, suffix string) (nodeID, filename string, err error) { + // Get actor by actor ID + actor, found := s.eventHandler.GetActorByID(clusterNameID, actorID) + if !found { + return "", "", fmt.Errorf("actor not found: actor_id=%s", actorID) + } + + // Check if actor has node_id (means it was scheduled) + if actor.Address.NodeID == "" { + return "", "", fmt.Errorf( + "actor %s has no node_id (actor not scheduled yet)", + actorID, + ) + } + + // Check if actor has worker_id + if actor.Address.WorkerID == "" { + return "", "", fmt.Errorf( + "actor %s has no worker_id (actor not scheduled yet)", + actorID, + ) + } + + // Find worker log file by worker_id + // Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} + nodeIDHex, logFilename, err := s.findWorkerLogFile( + clusterNameID, + sessionID, + actor.Address.NodeID, + actor.Address.WorkerID, + suffix, + ) + if err != nil { + return "", "", fmt.Errorf( + "failed to find worker log file for actor %s (worker_id=%s, node_id=%s): %w", + actorID, actor.Address.WorkerID, actor.Address.NodeID, err, + ) + } + + return nodeIDHex, logFilename, nil +} + // findWorkerLogFile searches for a worker log file by worker_id. 
// Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} // Returns (nodeIDHex, filename, error). diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index 44f756ab0fb..6da191b20c1 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -107,10 +107,10 @@ func routerAPI(s *ServerHandler) { Writes("")) // Placeholder for specific return type ws.Route(ws.GET("/v0/logs/file").To(s.getNodeLogFile).Filter(s.CookieHandle). Doc("get logfile"). - Param(ws.QueryParameter("node_id", "node_id (optional if task_id is provided)")). + Param(ws.QueryParameter("node_id", "node_id (optional if task_id or actor_id is provided)")). Param(ws.QueryParameter("filename", "filename (explicit log file path)")). Param(ws.QueryParameter("task_id", "task_id (resolve log file from task)")). - Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor, not yet implemented)")). + Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor)")). Param(ws.QueryParameter("pid", "pid (resolve log file from process id, not yet implemented)")). Param(ws.QueryParameter("suffix", "suffix (out or err, default: out, used with task_id/actor_id/pid)")). Param(ws.QueryParameter("lines", "lines (number of lines to return, default: 1000)")). 
diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index dc5f7682210..e889a244d43 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -287,11 +287,11 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names if resp.StatusCode == http.StatusOK { successCount++ - LogWithTimestamp(t, "✓ Task %s succeeded, returned %d bytes", taskID, len(body)) + LogWithTimestamp(t, "Task %s succeeded, returned %d bytes", taskID, len(body)) break } else { lastError = fmt.Sprintf("task %s returned %d: %s", taskID, resp.StatusCode, string(body)) - LogWithTimestamp(t, "✗ Task %s failed: %s", taskID, lastError) + LogWithTimestamp(t, "Task %s failed: %s", taskID, lastError) } } @@ -299,6 +299,45 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names "At least one task_id should succeed. Last error: %s", lastError) }) + // Sub-test for actor_id parameter (live cluster) + test.T().Run("actor_id parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get all eligible actor IDs + actorIDs := getAllEligibleActorIDs(g, client, historyServerURL) + LogWithTimestamp(t, "Found %d eligible actor IDs for testing", len(actorIDs)) + + var successCount int + var lastError string + + // Try each actor ID until one succeeds + for _, actorID := range actorIDs { + LogWithTimestamp(t, "Testing actor_id: %s", actorID) + + url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, actorID) + resp, err := client.Get(url) + if err != nil { + lastError = fmt.Sprintf("HTTP error for actor %s: %v", actorID, err) + continue + } + + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + successCount++ + LogWithTimestamp(t, "Actor %s succeeded, returned %d bytes", actorID, len(body)) + break + } else { + lastError = fmt.Sprintf("actor %s returned %d: %s", actorID, resp.StatusCode, string(body)) 
+ LogWithTimestamp(t, "Actor %s failed: %s", actorID, lastError) + } + } + + g.Expect(successCount).To(BeNumerically(">", 0), + "At least one actor_id should succeed. Last error: %s", lastError) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Log file endpoint tests completed") } @@ -546,11 +585,11 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names if resp.StatusCode == http.StatusOK { successCount++ - LogWithTimestamp(t, "✓ Task %s succeeded, returned %d bytes", taskID, len(body)) + LogWithTimestamp(t, "Task %s succeeded, returned %d bytes", taskID, len(body)) break } else { lastError = fmt.Sprintf("task %s returned %d: %s", taskID, resp.StatusCode, string(body)) - LogWithTimestamp(t, "✗ Task %s failed: %s", taskID, lastError) + LogWithTimestamp(t, "Task %s failed: %s", taskID, lastError) } } @@ -558,6 +597,45 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names "At least one task_id should succeed. Last error: %s", lastError) }) + // Sub-test for actor_id parameter (dead cluster) + test.T().Run("actor_id parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get all eligible actor IDs + actorIDs := getAllEligibleActorIDs(g, client, historyServerURL) + LogWithTimestamp(t, "Found %d eligible actor IDs for testing", len(actorIDs)) + + var successCount int + var lastError string + + // Try each actor ID until one succeeds + for _, actorID := range actorIDs { + LogWithTimestamp(t, "Testing actor_id: %s", actorID) + + url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, actorID) + resp, err := client.Get(url) + if err != nil { + lastError = fmt.Sprintf("HTTP error for actor %s: %v", actorID, err) + continue + } + + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + successCount++ + LogWithTimestamp(t, "Actor %s succeeded, returned %d bytes", actorID, len(body)) + break + } else { + lastError = fmt.Sprintf("actor %s 
returned %d: %s", actorID, resp.StatusCode, string(body)) + LogWithTimestamp(t, "Actor %s failed: %s", actorID, lastError) + } + } + + g.Expect(successCount).To(BeNumerically(">", 0), + "At least one actor_id should succeed. Last error: %s", lastError) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") } @@ -568,64 +646,100 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // (unless RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 is set). func getAllEligibleTaskIDs(g *WithT, client *http.Client, historyServerURL string) []string { var taskIDs []string - g.Eventually(func(gg Gomega) { - resp, err := client.Get(historyServerURL + "/api/v0/tasks") - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() + resp, err := client.Get(historyServerURL + "/api/v0/tasks") + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) - var result map[string]interface{} - err = json.Unmarshal(body, &result) - gg.Expect(err).NotTo(HaveOccurred()) + var result map[string]interface{} + err = json.Unmarshal(body, &result) + g.Expect(err).NotTo(HaveOccurred()) - // Extract task_id from response - // Response format: {"result": true, "msg": "...", "data": {"result": {"result": [tasks...], ...}}} - data, ok := result["data"].(map[string]interface{}) - gg.Expect(ok).To(BeTrue(), "response should have 'data' field") + // Extract task_id from response + // Response format: {"result": true, "msg": "...", "data": {"result": {"result": [tasks...], ...}}} + data, ok := result["data"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "response should have 'data' field") - dataResult, ok := data["result"].(map[string]interface{}) - 
gg.Expect(ok).To(BeTrue(), "data should have 'result' field") + dataResult, ok := data["result"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "data should have 'result' field") - tasks, ok := dataResult["result"].([]interface{}) - gg.Expect(ok).To(BeTrue(), "result should have 'result' array") - gg.Expect(len(tasks)).To(BeNumerically(">", 0), "should have at least one task") + tasks, ok := dataResult["result"].([]interface{}) + g.Expect(ok).To(BeTrue(), "result should have 'result' array") + g.Expect(len(tasks)).To(BeNumerically(">", 0), "should have at least one task") - // Find all non-actor tasks with node_id - for _, t := range tasks { - task, ok := t.(map[string]interface{}) - if !ok { - continue - } + // Find all non-actor tasks with node_id + for _, t := range tasks { + task, ok := t.(map[string]interface{}) + if !ok { + continue + } - // Check if this is an actor task - actorID, _ := task["actor_id"].(string) - if actorID != "" { - // Skip actor tasks - they don't have task_log_info unless - // RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 is set - continue - } + // Check if this is an actor task + actorID, _ := task["actor_id"].(string) + if actorID != "" { + // Skip actor tasks - they don't have task_log_info unless + // RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 is set + continue + } - // Check if it has node_id - nodeID, _ := task["node_id"].(string) - if nodeID == "" { - // If nodeID is empty, it means the task is not scheduled yet. Skip it - // as it will not have logs - continue - } + // Check if it has node_id + nodeID, _ := task["node_id"].(string) + if nodeID == "" { + // If nodeID is empty, it means the task is not scheduled yet. 
Skip it + // as it will not have logs + continue + } - // Found a non-actor task with logs - taskID, ok := task["task_id"].(string) - if ok && taskID != "" { - taskIDs = append(taskIDs, taskID) - } + // Found a non-actor task with logs + taskID, ok := task["task_id"].(string) + if ok && taskID != "" { + taskIDs = append(taskIDs, taskID) } + } - gg.Expect(len(taskIDs)).To(BeNumerically(">", 0), "should have at least one eligible task") - }, TestTimeoutShort).Should(Succeed()) + g.Expect(len(taskIDs)).To(BeNumerically(">", 0), "should have at least one eligible task") return taskIDs } + +// getAllEligibleActorIDs retrieves all actor IDs with node_id and worker_id from the /logical/actors endpoint. +// Returns a list of actor IDs that are eligible for log file testing (actors that have been scheduled and are running). +func getAllEligibleActorIDs(g *WithT, client *http.Client, historyServerURL string) []string { + var actorIDs []string + resp, err := client.Get(historyServerURL + "/logical/actors") + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() + + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) + + var result map[string]interface{} + err = json.Unmarshal(body, &result) + g.Expect(err).NotTo(HaveOccurred()) + + // Extract actor_id from response + // Response format: {"result": true, "msg": "...", "data": {"actors": {actor_id: {...}, ...}}} + data, ok := result["data"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "response should have 'data' field") + + actors, ok := data["actors"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "data should have 'actors' field") + g.Expect(len(actors)).To(BeNumerically(">", 0), "should have at least one actor") + + // Find all actors with node_id and worker_id (scheduled actors) + for actorID := range actors { + actorIDs = append(actorIDs, actorID) + } + + fmt.Printf("Actor IDs: %+v", actorIDs) + + 
g.Expect(len(actorIDs)).To(BeNumerically(">", 0), "should have at least one eligible actor") + + return actorIDs +} From d204ef118100c44e3915c80ac115af15797e412e Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 22:16:25 +0800 Subject: [PATCH 14/37] feat+test: support pid query Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 41 ++++++- historyserver/pkg/historyserver/router.go | 2 +- historyserver/test/e2e/historyserver_test.go | 108 +++++++++++++++++++ 3 files changed, 148 insertions(+), 3 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index a0d92003d90..0235ff26d26 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -5,6 +5,7 @@ import ( "context" "encoding/base64" "encoding/json" + "errors" "fmt" "io" "net/http" @@ -89,6 +90,11 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti // Resolve node_id and filename based on options nodeID, filename, err := s.resolveLogFilename(rayClusterNameID, sessionID, options) if err != nil { + // Preserve HTTPError status code if already set, otherwise use BadRequest + var httpErr *utils.HTTPError + if errors.As(err, &httpErr) { + return nil, err + } return nil, utils.NewHTTPError(err, http.StatusBadRequest) } @@ -201,14 +207,45 @@ func (s *ServerHandler) resolveLogFilename(clusterNameID, sessionID string, opti } // If pid is provided, resolve worker log file - // TODO: not implemented if options.PID > 0 { - return "", "", fmt.Errorf("pid resolution not yet implemented") + return s.resolvePidLogFilename(clusterNameID, sessionID, options.NodeID, options.PID, options.Suffix) } return "", "", fmt.Errorf("must provide one of: filename, task_id, actor_id, or pid") } +// resolvePidLogFilename resolves a log file by PID. +// It requires a nodeID and searches for a log file with a name ending in "-{pid}.{suffix}". 
+func (s *ServerHandler) resolvePidLogFilename(clusterNameID, sessionID, nodeID string, pid int, suffix string) (string, string, error) { + if nodeID == "" { + return "", "", fmt.Errorf("node_id is required for pid resolution") + } + + // The nodeID from actors/tasks is base64, but the path on storage uses hex. + nodeIDBytes, err := base64.RawURLEncoding.DecodeString(nodeID) + if err != nil { + // Try standard Base64 if URL-safe fails + nodeIDBytes, err = base64.StdEncoding.DecodeString(nodeID) + if err != nil { + return "", "", fmt.Errorf("failed to decode node_id: %w", err) + } + } + nodeIDHex := fmt.Sprintf("%x", nodeIDBytes) + + logPath := path.Join(sessionID, "logs", nodeIDHex) + files := s.reader.ListFiles(clusterNameID, logPath) + + pidSuffix := fmt.Sprintf("-%d.%s", pid, suffix) + + for _, file := range files { + if strings.HasSuffix(file, pidSuffix) { + return nodeIDHex, file, nil + } + } + + return "", "", utils.NewHTTPError(fmt.Errorf("log file not found for pid %d in path %s", pid, logPath), http.StatusNotFound) +} + // resolveTaskLogFilename resolves log file for a task by querying task events. // This mirrors Ray Dashboard's _resolve_task_filename logic. // The sessionID parameter is required for searching worker log files when task_log_info is not available. diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index 6da191b20c1..b57a85ecb1b 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -111,7 +111,7 @@ func routerAPI(s *ServerHandler) { Param(ws.QueryParameter("filename", "filename (explicit log file path)")). Param(ws.QueryParameter("task_id", "task_id (resolve log file from task)")). Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor)")). - Param(ws.QueryParameter("pid", "pid (resolve log file from process id, not yet implemented)")). + Param(ws.QueryParameter("pid", "pid (resolve log file from process id)")). 
Param(ws.QueryParameter("suffix", "suffix (out or err, default: out, used with task_id/actor_id/pid)")). Param(ws.QueryParameter("lines", "lines (number of lines to return, default: 1000)")). Param(ws.QueryParameter("timeout", "timeout")). diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index e889a244d43..0db598bed10 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -338,6 +338,47 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names "At least one actor_id should succeed. Last error: %s", lastError) }) + // Sub-test for pid parameter (live cluster) + test.T().Run("pid parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get an eligible worker PID and its node ID + pid, nodeID := getEligibleWorkerPID(g, client, historyServerURL) + LogWithTimestamp(t, "Found eligible worker PID %d on node %s for testing", pid, nodeID) + + // Test successful case + url := fmt.Sprintf("%s%s?pid=%d&node_id=%s", historyServerURL, EndpointLogFile, pid, nodeID) + resp, err := client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + g.Expect(resp.StatusCode).To(Equal(http.StatusOK), "Expected OK for valid pid and node_id, got %d: %s", resp.StatusCode, string(body)) + g.Expect(len(body)).To(BeNumerically(">", 0)) + + // Test missing node_id + url = fmt.Sprintf("%s%s?pid=%d", historyServerURL, EndpointLogFile, pid) + resp, err = client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + resp.Body.Close() + g.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) + + // Test invalid pid + url = fmt.Sprintf("%s%s?pid=abc&node_id=%s", historyServerURL, EndpointLogFile, nodeID) + resp, err = client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + resp.Body.Close() + g.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) + + // Test non-existent pid + url = 
fmt.Sprintf("%s%s?pid=999999&node_id=%s", historyServerURL, EndpointLogFile, nodeID) + resp, err = client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + body, _ = io.ReadAll(resp.Body) + resp.Body.Close() + // For a live cluster, this is proxied to the dashboard. The dashboard returns 500 for file not found. + g.Expect(resp.StatusCode).To(Equal(http.StatusInternalServerError), "Expected 500 for non-existent pid, got %d: %s", resp.StatusCode, string(body)) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Log file endpoint tests completed") } @@ -428,6 +469,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusNotFound}, {"invalid suffix", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusBadRequest}, + {"non-existent pid", func(u, n string) string { return fmt.Sprintf("%s%s?pid=999999&node_id=%s", u, EndpointLogFile, n) }, http.StatusNotFound}, // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -636,6 +678,15 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names "At least one actor_id should succeed. Last error: %s", lastError) }) + // Sub-test for pid parameter (dead cluster) + // NOTE: This test is skipped because Ray export events don't include worker_pid. + // See: https://github.com/ray-project/ray/issues/60129 + // Worker lifecycle events are not yet exported, so we cannot obtain worker PIDs + // from historical data for dead clusters. 
+ test.T().Run("pid parameter", func(t *testing.T) { + t.Skip("Skipping pid parameter test for dead cluster: worker_pid not available in Ray export events (see https://github.com/ray-project/ray/issues/60129)") + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") } @@ -743,3 +794,60 @@ func getAllEligibleActorIDs(g *WithT, client *http.Client, historyServerURL stri return actorIDs } + +// getEligibleWorkerPID retrieves an eligible worker PID and its node ID for log testing. +// It queries the /api/v0/tasks endpoint to find any task with a valid worker_pid and node_id. +func getEligibleWorkerPID(g *WithT, client *http.Client, historyServerURL string) (pid int, nodeID string) { + resp, err := client.Get(historyServerURL + "/api/v0/tasks") + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() + + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) + + var result map[string]interface{} + err = json.Unmarshal(body, &result) + g.Expect(err).NotTo(HaveOccurred()) + + // Response format: {"result": true, "msg": "...", "data": {"result": {"result": [tasks...], ...}}} + data, ok := result["data"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "response should have 'data' field") + + dataResult, ok := data["result"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "data should have 'result' field") + + tasks, ok := dataResult["result"].([]interface{}) + g.Expect(ok).To(BeTrue(), "result should have 'result' array") + g.Expect(len(tasks)).To(BeNumerically(">", 0), "should have at least one task") + + fmt.Printf("Tasks: %+v", tasks) + + // Find a task with valid worker_pid and node_id + for _, t := range tasks { + task, ok := t.(map[string]interface{}) + if !ok { + continue + } + + // Check for worker_pid (must be non-zero) + workerPidFloat, pidOk := task["worker_pid"].(float64) + if !pidOk || workerPidFloat == 0 { + 
continue + } + + // Check for node_id (must be non-empty) + taskNodeID, nodeOk := task["node_id"].(string) + if !nodeOk || taskNodeID == "" { + continue + } + + // Found an eligible task with worker PID + return int(workerPidFloat), taskNodeID + } + + // If we get here, no eligible task was found + g.Fail("should find at least one task with valid worker_pid and node_id") + return 0, "" +} From f739e5ab9038eaa25fff6fa0d7e18c8b3aef19bc Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 09:42:22 +0800 Subject: [PATCH 15/37] docs: todo comment for submission_id Signed-off-by: machichima --- historyserver/pkg/historyserver/router.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index b57a85ecb1b..fb3f703c4dc 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -118,6 +118,16 @@ func routerAPI(s *ServerHandler) { Param(ws.QueryParameter("attempt_number", "attempt_number (task retry attempt number, default: 0)")). Param(ws.QueryParameter("download_file", "download_file (true/false)")). Param(ws.QueryParameter("filter_ansi_code", "filter_ansi_code (true/false)")). + // TODO: submission_id parameter is not currently supported. + // To support it, we need to: + // 1. Implement DRIVER_JOB_DEFINITION_EVENT processing in eventserver to store driver job info + // (including driver_node_id from the export event) + // 2. Add submission_id field to DriverJobDefinitionEvent in Ray (currently missing, tracked in + // https://github.com/ray-project/ray/issues/60129) + // 3. Create resolveSubmissionLogFilename() method to: + // - Look up driver job by submission_id + // - Get driver_node_id from stored event + // - Return filename as "job-driver-{submission_id}.log" Produces("text/plain"). 
Writes("")) // Placeholder for specific return type From e562958539113f622802f61419d5aea053cc9957 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 10:48:11 +0800 Subject: [PATCH 16/37] feat+test: add node_ip support Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 76 ++++++++++++++++++++ historyserver/pkg/historyserver/router.go | 28 ++++++-- historyserver/pkg/historyserver/types.go | 1 + historyserver/test/e2e/historyserver_test.go | 69 +++++++++++++++++- 4 files changed, 164 insertions(+), 10 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 0235ff26d26..5d3ab50590b 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -15,6 +15,7 @@ import ( "strings" "github.com/emicklei/go-restful/v3" + "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" eventtypes "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" "github.com/ray-project/kuberay/historyserver/pkg/utils" "github.com/sirupsen/logrus" @@ -437,6 +438,81 @@ func (s *ServerHandler) GetNodes(rayClusterNameID, sessionId string) ([]byte, er return json.Marshal(templ) } +// ipToNodeId resolves node_id from node_ip by querying node_events from storage. +// This mirrors Ray Dashboard's ip_to_node_id logic. +// Returns node_id in hex format if found, error otherwise. 
+func (s *ServerHandler) ipToNodeId(rayClusterNameID, sessionID, nodeIP string) (string, error) { + if nodeIP == "" { + return "", fmt.Errorf("node_ip is empty") + } + + // List all node_events files + nodeEventsPath := path.Join(sessionID, "node_events") + files := s.reader.ListFiles(rayClusterNameID, nodeEventsPath) + + // Parse each node event file to find matching node_ip + for _, file := range files { + filePath := path.Join(nodeEventsPath, file) + reader := s.reader.GetContent(rayClusterNameID, filePath) + if reader == nil { + continue + } + + data, err := io.ReadAll(reader) + if err != nil { + logrus.Warnf("Failed to read node event file %s: %v", filePath, err) + continue + } + + var events []map[string]interface{} + if err := json.Unmarshal(data, &events); err != nil { + logrus.Warnf("Failed to unmarshal node events from %s: %v", filePath, err) + continue + } + + // Search for NODE_DEFINITION_EVENT with matching node_ip + for _, event := range events { + eventType, ok := event["eventType"].(string) + if !ok || eventType != string(types.NODE_DEFINITION_EVENT) { + continue + } + + nodeDefEvent, ok := event["nodeDefinitionEvent"].(map[string]interface{}) + if !ok { + continue + } + + ipAddr, ok := nodeDefEvent["nodeIpAddress"].(string) + if !ok || ipAddr != nodeIP { + continue + } + + // Found matching node, extract node_id + nodeIDBytes, ok := nodeDefEvent["nodeId"].(string) + if !ok { + continue + } + + // Convert base64 node_id to hex (Ray stores node_id as base64 in events) + decoded, err := base64.StdEncoding.DecodeString(nodeIDBytes) + if err != nil { + // Try URL-safe base64 encoding + decoded, err = base64.RawURLEncoding.DecodeString(nodeIDBytes) + if err != nil { + logrus.Warnf("Failed to decode node_id %s: %v", nodeIDBytes, err) + continue + } + } + + nodeIDHex := fmt.Sprintf("%x", decoded) + logrus.Infof("Resolved node_ip %s to node_id %s", nodeIP, nodeIDHex) + return nodeIDHex, nil + } + } + + return "", fmt.Errorf("node_id not found for 
node_ip=%s", nodeIP) +} + // TODO: implement this func (h *ServerHandler) getGrafanaHealth(req *restful.Request, resp *restful.Response) { resp.WriteErrorString(http.StatusNotImplemented, "Grafana health not yet supported") diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index fb3f703c4dc..ac9cbdd765a 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -107,7 +107,8 @@ func routerAPI(s *ServerHandler) { Writes("")) // Placeholder for specific return type ws.Route(ws.GET("/v0/logs/file").To(s.getNodeLogFile).Filter(s.CookieHandle). Doc("get logfile"). - Param(ws.QueryParameter("node_id", "node_id (optional if task_id or actor_id is provided)")). + Param(ws.QueryParameter("node_id", "node_id (optional if node_ip/task_id/actor_id is provided)")). + Param(ws.QueryParameter("node_ip", "node_ip (optional, resolve to node_id)")). Param(ws.QueryParameter("filename", "filename (explicit log file path)")). Param(ws.QueryParameter("task_id", "task_id (resolve log file from task)")). Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor)")). 
@@ -599,6 +600,12 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } + // node_id or node_ip is required when not using actor_id or task_id (they can auto-resolve node_id) + if options.NodeID == "" && options.NodeIP == "" && options.ActorID == "" && options.TaskID == "" { + resp.WriteErrorString(http.StatusBadRequest, "node_id or node_ip is required when actor_id or task_id is not provided") + return + } + // Validate required parameters following Ray Dashboard logic // At least one of: actor_id, task_id, pid, filename, submission_id must be provided if options.ActorID == "" && options.TaskID == "" && options.PID == 0 && options.Filename == "" { @@ -606,12 +613,6 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } - // node_id is required when not using actor_id or task_id (they can auto-resolve node_id) - if options.NodeID == "" && options.ActorID == "" && options.TaskID == "" { - resp.WriteErrorString(http.StatusBadRequest, "node_id is required when actor_id or task_id is not provided") - return - } - // Prevent path traversal attacks (e.g., ../../etc/passwd) if options.NodeID != "" && !fs.ValidPath(options.NodeID) { resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (node_id=%s)", options.NodeID)) @@ -622,11 +623,23 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } + // For live cluster, proxy the request directly to Ray Dashboard without any processing if sessionName == "live" { s.redirectRequest(req, resp) return } + // Only resolve node_ip to node_id from stored events for dead cluster + if options.NodeID == "" && options.NodeIP != "" { + nodeID, err := s.ipToNodeId(clusterNameID+"_"+clusterNamespace, sessionName, options.NodeIP) + if err != nil { + resp.WriteErrorString(http.StatusNotFound, + fmt.Sprintf("Cannot find matching node_id for a given node ip %s", options.NodeIP)) + return + } 
+ options.NodeID = nodeID + } + content, err := s._getNodeLogFile(clusterNameID+"_"+clusterNamespace, sessionName, options) if err != nil { var httpErr *utils.HTTPError @@ -652,6 +665,7 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo func parseGetLogFileOptions(req *restful.Request) (GetLogFileOptions, error) { options := GetLogFileOptions{ NodeID: req.QueryParameter("node_id"), + NodeIP: req.QueryParameter("node_ip"), Filename: req.QueryParameter("filename"), TaskID: req.QueryParameter("task_id"), ActorID: req.QueryParameter("actor_id"), diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go index 24c3bc057a0..0def7f3dc8d 100644 --- a/historyserver/pkg/historyserver/types.go +++ b/historyserver/pkg/historyserver/types.go @@ -4,6 +4,7 @@ package historyserver type GetLogFileOptions struct { // Node identification (one of these is required if not using task_id/actor_id) NodeID string // The node id where the log file is located + NodeIP string // The node ip address (will be resolved to node_id) // Log file identification (provide one of: Filename, TaskID, ActorID, PID) Filename string // The log file name (explicit path) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 0db598bed10..0e4f9378d6f 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -209,7 +209,7 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, // Missing mandatory parameters - {"missing node_id", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + {"missing node_id and node_ip", func(u, n string) string { 
return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, {"missing filename", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, {"missing both", func(u, n string) string { return fmt.Sprintf("%s%s", u, EndpointLogFile) }, http.StatusBadRequest}, @@ -223,6 +223,9 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusInternalServerError}, {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusInternalServerError}, + // node_ip parameter tests + {"node_ip invalid (non-existent)", func(u, n string) string { return fmt.Sprintf("%s%s?node_ip=192.168.255.255&filename=%s", u, EndpointLogFile, filename) }, http.StatusInternalServerError}, + // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, {"traversal ..", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=..", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -379,6 +382,30 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names g.Expect(resp.StatusCode).To(Equal(http.StatusInternalServerError), "Expected 500 for non-existent pid, got %d: %s", resp.StatusCode, string(body)) }) + // Sub-test for node_ip parameter (live cluster) + test.T().Run("node_ip parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get node IP from head pod (use Pod IP, not Host IP) + // Ray registers nodes with Pod IP (--node-ip-address flag) + headPod, err := GetHeadPod(test, rayCluster) + g.Expect(err).NotTo(HaveOccurred()) + nodeIP := headPod.Status.PodIP + 
g.Expect(nodeIP).NotTo(BeEmpty(), "Head pod should have a pod IP") + LogWithTimestamp(t, "Found head pod with IP: %s", nodeIP) + + // Test successful case: node_ip + filename + url := fmt.Sprintf("%s%s?node_ip=%s&filename=%s", historyServerURL, EndpointLogFile, nodeIP, filename) + resp, err := client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + // For live cluster, the request is proxied to Ray Dashboard + // The dashboard should be able to resolve node_ip to node_id + g.Expect(resp.StatusCode).To(Equal(http.StatusOK), "Expected OK for valid node_ip, got %d: %s", resp.StatusCode, string(body)) + g.Expect(len(body)).To(BeNumerically(">", 0)) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Log file endpoint tests completed") } @@ -398,8 +425,15 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names rayCluster := PrepareTestEnv(test, g, namespace, s3Client) ApplyRayJobAndWaitForCompletion(test, g, namespace, rayCluster) + // Capture node IP and ID before deleting cluster (for node_ip tests later) + headPod, err := GetHeadPod(test, rayCluster) + g.Expect(err).NotTo(HaveOccurred()) + savedNodeIP := headPod.Status.PodIP + savedNodeID := GetNodeIDFromHeadPod(test, g, rayCluster) + LogWithTimestamp(test.T(), "Captured node IP %s and node ID %s before cluster deletion", savedNodeIP, savedNodeID) + // Delete RayCluster to trigger log upload - err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{}) + err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) LogWithTimestamp(test.T(), "Deleted RayCluster %s/%s", namespace.Name, rayCluster.Name) @@ -458,7 +492,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"all parameters", func(u, n string) string { return 
fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100&timeout=15&attempt_number=0&download_file=true&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, // Missing mandatory parameters - {"missing node_id", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + {"missing node_id and node_ip", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, {"missing filename", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, {"missing both", func(u, n string) string { return fmt.Sprintf("%s%s", u, EndpointLogFile) }, http.StatusBadRequest}, @@ -471,6 +505,9 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusBadRequest}, {"non-existent pid", func(u, n string) string { return fmt.Sprintf("%s%s?pid=999999&node_id=%s", u, EndpointLogFile, n) }, http.StatusNotFound}, + // node_ip parameter tests + {"node_ip invalid (non-existent)", func(u, n string) string { return fmt.Sprintf("%s%s?node_ip=192.168.255.255&filename=%s", u, EndpointLogFile, filename) }, http.StatusNotFound}, + // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, {"traversal ..", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=..", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -687,6 +724,32 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names t.Skip("Skipping pid parameter test for dead cluster: worker_pid not available in Ray export events (see https://github.com/ray-project/ray/issues/60129)") }) + // Sub-test for 
node_ip parameter (dead cluster) + test.T().Run("node_ip parameter", func(t *testing.T) { + g := NewWithT(t) + + // Use the captured node IP and ID from before cluster deletion + LogWithTimestamp(t, "Testing node_ip parameter with IP: %s, ID: %s", savedNodeIP, savedNodeID) + + // Test successful case: node_ip + filename + url := fmt.Sprintf("%s%s?node_ip=%s&filename=%s", historyServerURL, EndpointLogFile, savedNodeIP, filename) + resp, err := client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + g.Expect(resp.StatusCode).To(Equal(http.StatusOK), "Expected OK for valid node_ip, got %d: %s", resp.StatusCode, string(body)) + g.Expect(len(body)).To(BeNumerically(">", 0)) + + // Test that node_ip and node_id point to the same node (should return same content) + urlWithNodeID := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, EndpointLogFile, savedNodeID, filename) + resp2, err := client.Get(urlWithNodeID) + g.Expect(err).NotTo(HaveOccurred()) + bodyWithNodeID, _ := io.ReadAll(resp2.Body) + resp2.Body.Close() + g.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) + g.Expect(len(body)).To(Equal(len(bodyWithNodeID)), "node_ip and node_id should return same content") + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") } From f2f016176fd52fdab5749e8a1ddc948bb080d2d8 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:13:32 +0800 Subject: [PATCH 17/37] test: move pid invalid test to logFileTestCases Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 0e4f9378d6f..3226a43762c 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -222,9 +222,9 @@ func 
testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names // ref: https://github.com/ray-project/ray/blob/68d01c4c48a59c7768ec9c2359a1859966c446b6/python/ray/dashboard/modules/state/state_head.py#L282-L284 {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusInternalServerError}, {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusInternalServerError}, - - // node_ip parameter tests {"node_ip invalid (non-existent)", func(u, n string) string { return fmt.Sprintf("%s%s?node_ip=192.168.255.255&filename=%s", u, EndpointLogFile, filename) }, http.StatusInternalServerError}, + {"pid invalid (string)", func(u, n string) string { return fmt.Sprintf("%s%s?pid=abc&node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"pid non-existent", func(u, n string) string { return fmt.Sprintf("%s%s?pid=999999&node_id=%s", u, EndpointLogFile, n) }, http.StatusInternalServerError}, // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -364,22 +364,6 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names g.Expect(err).NotTo(HaveOccurred()) resp.Body.Close() g.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) - - // Test invalid pid - url = fmt.Sprintf("%s%s?pid=abc&node_id=%s", historyServerURL, EndpointLogFile, nodeID) - resp, err = client.Get(url) - g.Expect(err).NotTo(HaveOccurred()) - resp.Body.Close() - g.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) - - // Test non-existent pid - url = fmt.Sprintf("%s%s?pid=999999&node_id=%s", historyServerURL, EndpointLogFile, nodeID) - resp, err = client.Get(url) - g.Expect(err).NotTo(HaveOccurred()) - body, _ = io.ReadAll(resp.Body) - 
resp.Body.Close() - // For a live cluster, this is proxied to the dashboard. The dashboard returns 500 for file not found. - g.Expect(resp.StatusCode).To(Equal(http.StatusInternalServerError), "Expected 500 for non-existent pid, got %d: %s", resp.StatusCode, string(body)) }) // Sub-test for node_ip parameter (live cluster) From e0151db22977fc5a90ee102b39168f416e7b5e94 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:14:51 +0800 Subject: [PATCH 18/37] fix: add download_filename rather than download_file flag Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 3 +++ historyserver/pkg/historyserver/router.go | 17 ++++++++++------- historyserver/pkg/historyserver/types.go | 5 +++-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 5d3ab50590b..8df3fcda48b 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -26,6 +26,9 @@ const ( // This matches Ray Dashboard API default behavior. DEFAULT_LOG_LIMIT = 1000 + // DEFAULT_DOWNLOAD_FILENAME is the default filename when download_filename is not specified + DEFAULT_DOWNLOAD_FILENAME = "file.txt" + // MAX_LOG_LIMIT is the maximum number of lines that can be requested. // Requests exceeding this limit will be capped to this value. MAX_LOG_LIMIT = 10000 diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index ac9cbdd765a..511010c8950 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -117,7 +117,7 @@ func routerAPI(s *ServerHandler) { Param(ws.QueryParameter("lines", "lines (number of lines to return, default: 1000)")). Param(ws.QueryParameter("timeout", "timeout")). Param(ws.QueryParameter("attempt_number", "attempt_number (task retry attempt number, default: 0)")). 
- Param(ws.QueryParameter("download_file", "download_file (true/false)")). + Param(ws.QueryParameter("download_filename", "download_filename (if set, triggers download with this filename)")). Param(ws.QueryParameter("filter_ansi_code", "filter_ansi_code (true/false)")). // TODO: submission_id parameter is not currently supported. // To support it, we need to: @@ -653,10 +653,12 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } - // Set Content-Disposition header if download_file is requested - if options.DownloadFile { - resp.AddHeader("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", options.Filename)) + // Use provided download_filename or default to DEFAULT_DOWNLOAD_FILENAME + downloadFilename := options.DownloadFilename + if downloadFilename == "" { + downloadFilename = DEFAULT_DOWNLOAD_FILENAME } + resp.AddHeader("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", downloadFilename)) resp.Write(content) } @@ -723,9 +725,10 @@ func parseGetLogFileOptions(req *restful.Request) (GetLogFileOptions, error) { options.FilterAnsiCode = strings.ToLower(filterStr) == "true" } - // Parse download_file parameter (boolean) - if downloadStr := req.QueryParameter("download_file"); downloadStr != "" { - options.DownloadFile = strings.ToLower(downloadStr) == "true" + // Parse download_filename parameter + // If provided, download with the specified filename + if downloadFilename := req.QueryParameter("download_filename"); downloadFilename != "" { + options.DownloadFilename = downloadFilename } return options, nil diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go index 0def7f3dc8d..c0751765165 100644 --- a/historyserver/pkg/historyserver/types.go +++ b/historyserver/pkg/historyserver/types.go @@ -22,8 +22,9 @@ type GetLogFileOptions struct { AttemptNumber int // Whether to filter ANSI escape codes from logs, default to False FilterAnsiCode bool - // Whether to 
set Content-Disposition header for file download, default to false - DownloadFile bool + // If set, triggers download with Content-Disposition header using this filename. + // Otherwise, using the default filename DEFAULT_DOWNLOAD_FILENAME + DownloadFilename string // The suffix of the log file ("out" or "err"), default to "out" // Used when resolving by TaskID, ActorID, or PID Suffix string From 892d41a2aedcaa5bb1735ef5ca2d405942bea4a1 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:20:08 +0800 Subject: [PATCH 19/37] refactor: Base64 to hex conversion logic to util function Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 53 ++++++++++------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 8df3fcda48b..453d4781bcf 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -43,6 +43,20 @@ func filterAnsiEscapeCodes(content []byte) []byte { return ansiEscapePattern.ReplaceAll(content, []byte("")) } +// decodeBase64ToHex decodes a Base64-encoded ID and returns its hex representation. +// It tries RawURLEncoding first (Ray's default), falling back to StdEncoding if that fails. +func decodeBase64ToHex(id string) (string, error) { + idBytes, err := base64.RawURLEncoding.DecodeString(id) + if err != nil { + // Try standard Base64 if URL-safe fails + idBytes, err = base64.StdEncoding.DecodeString(id) + if err != nil { + return "", fmt.Errorf("failed to decode Base64 ID: %w", err) + } + } + return fmt.Sprintf("%x", idBytes), nil +} + func (s *ServerHandler) listClusters(limit int) []utils.ClusterInfo { // Initial continuation marker logrus.Debugf("Prepare to get list clusters info ...") @@ -226,15 +240,10 @@ func (s *ServerHandler) resolvePidLogFilename(clusterNameID, sessionID, nodeID s } // The nodeID from actors/tasks is base64, but the path on storage uses hex. 
- nodeIDBytes, err := base64.RawURLEncoding.DecodeString(nodeID) + nodeIDHex, err := decodeBase64ToHex(nodeID) if err != nil { - // Try standard Base64 if URL-safe fails - nodeIDBytes, err = base64.StdEncoding.DecodeString(nodeID) - if err != nil { - return "", "", fmt.Errorf("failed to decode node_id: %w", err) - } + return "", "", fmt.Errorf("failed to decode node_id: %w", err) } - nodeIDHex := fmt.Sprintf("%x", nodeIDBytes) logPath := path.Join(sessionID, "logs", nodeIDHex) files := s.reader.ListFiles(clusterNameID, logPath) @@ -379,26 +388,16 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) { // Convert Base64 node_id to hex for the file path // Ray stores IDs in Base64 (URL-safe) in events, but uses hex in log directory structure - nodeIDBytes, err := base64.RawURLEncoding.DecodeString(nodeID) + nodeIDHex, err := decodeBase64ToHex(nodeID) if err != nil { - // Try standard Base64 if URL-safe fails - nodeIDBytes, err = base64.StdEncoding.DecodeString(nodeID) - if err != nil { - return "", "", fmt.Errorf("failed to decode node_id: %w", err) - } + return "", "", fmt.Errorf("failed to decode node_id: %w", err) } - nodeIDHex := fmt.Sprintf("%x", nodeIDBytes) // Convert Base64 worker_id to hex - workerIDBytes, err := base64.RawURLEncoding.DecodeString(workerID) + workerIDHex, err := decodeBase64ToHex(workerID) if err != nil { - // Try standard Base64 if URL-safe fails - workerIDBytes, err = base64.StdEncoding.DecodeString(workerID) - if err != nil { - return "", "", fmt.Errorf("failed to decode worker_id: %w", err) - } + return "", "", fmt.Errorf("failed to decode worker_id: %w", err) } - workerIDHex := fmt.Sprintf("%x", workerIDBytes) // List all files in the node's log directory logPath := path.Join(sessionID, "logs", nodeIDHex) @@ -497,17 +496,11 @@ func (s *ServerHandler) ipToNodeId(rayClusterNameID, 
sessionID, nodeIP string) ( } // Convert base64 node_id to hex (Ray stores node_id as base64 in events) - decoded, err := base64.StdEncoding.DecodeString(nodeIDBytes) + nodeIDHex, err := decodeBase64ToHex(nodeIDBytes) if err != nil { - // Try URL-safe base64 encoding - decoded, err = base64.RawURLEncoding.DecodeString(nodeIDBytes) - if err != nil { - logrus.Warnf("Failed to decode node_id %s: %v", nodeIDBytes, err) - continue - } + logrus.Warnf("Failed to decode node_id %s: %v", nodeIDBytes, err) + continue } - - nodeIDHex := fmt.Sprintf("%x", decoded) logrus.Infof("Resolved node_ip %s to node_id %s", nodeIP, nodeIDHex) return nodeIDHex, nil } From d25446322a8162d43afed30761dd6ef9a25f50c4 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:27:10 +0800 Subject: [PATCH 20/37] fix: skip convert to hex if already is Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 453d4781bcf..8a8cc238f7b 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -43,9 +43,18 @@ func filterAnsiEscapeCodes(content []byte) []byte { return ansiEscapePattern.ReplaceAll(content, []byte("")) } -// decodeBase64ToHex decodes a Base64-encoded ID and returns its hex representation. +// decodeBase64ToHex converts an ID to hex format. +// Handles both cases: +// 1. Already hex format - returns as-is +// 2. Base64-encoded - decodes to hex // It tries RawURLEncoding first (Ray's default), falling back to StdEncoding if that fails. 
func decodeBase64ToHex(id string) (string, error) { + // Check if already hex (only [0-9a-f]) + if matched, _ := regexp.MatchString("^[0-9a-f]+$", id); matched { + return id, nil + } + + // Try base64 decode idBytes, err := base64.RawURLEncoding.DecodeString(id) if err != nil { // Try standard Base64 if URL-safe fails @@ -239,7 +248,7 @@ func (s *ServerHandler) resolvePidLogFilename(clusterNameID, sessionID, nodeID s return "", "", fmt.Errorf("node_id is required for pid resolution") } - // The nodeID from actors/tasks is base64, but the path on storage uses hex. + // Convert to hex if not already is nodeIDHex, err := decodeBase64ToHex(nodeID) if err != nil { return "", "", fmt.Errorf("failed to decode node_id: %w", err) @@ -386,8 +395,7 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI // Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} // Returns (nodeIDHex, filename, error). func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) { - // Convert Base64 node_id to hex for the file path - // Ray stores IDs in Base64 (URL-safe) in events, but uses hex in log directory structure + // Convert to hex if not already is nodeIDHex, err := decodeBase64ToHex(nodeID) if err != nil { return "", "", fmt.Errorf("failed to decode node_id: %w", err) @@ -495,7 +503,7 @@ func (s *ServerHandler) ipToNodeId(rayClusterNameID, sessionID, nodeIP string) ( continue } - // Convert base64 node_id to hex (Ray stores node_id as base64 in events) + // Convert to hex if not already is nodeIDHex, err := decodeBase64ToHex(nodeIDBytes) if err != nil { logrus.Warnf("Failed to decode node_id %s: %v", nodeIDBytes, err) From ccd8243af37886bb07566024ea2a6990ed104100 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:29:53 +0800 Subject: [PATCH 21/37] fix: close reader to prevent connection leak Signed-off-by: machichima --- 
historyserver/pkg/historyserver/reader.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 8a8cc238f7b..73f027414ed 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -468,6 +468,10 @@ func (s *ServerHandler) ipToNodeId(rayClusterNameID, sessionID, nodeIP string) ( continue } + if closer, ok := reader.(io.Closer); ok { + defer closer.Close() + } + data, err := io.ReadAll(reader) if err != nil { logrus.Warnf("Failed to read node event file %s: %v", filePath, err) From f4dfefb13dfa7c026fa15a32a22c8ff275791467 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:31:39 +0800 Subject: [PATCH 22/37] refactor: remove duplicate import Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 73f027414ed..81b816f90d8 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -15,7 +15,6 @@ import ( "strings" "github.com/emicklei/go-restful/v3" - "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" eventtypes "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" "github.com/ray-project/kuberay/historyserver/pkg/utils" "github.com/sirupsen/logrus" @@ -487,7 +486,7 @@ func (s *ServerHandler) ipToNodeId(rayClusterNameID, sessionID, nodeIP string) ( // Search for NODE_DEFINITION_EVENT with matching node_ip for _, event := range events { eventType, ok := event["eventType"].(string) - if !ok || eventType != string(types.NODE_DEFINITION_EVENT) { + if !ok || eventType != string(eventtypes.NODE_DEFINITION_EVENT) { continue } From a6b76ad63bfdbeb5491bcd32d3851b7a07607889 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:49:43 +0800 Subject: [PATCH 23/37] 
test: fix and refactor test Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 137 ++++++++----------- 1 file changed, 58 insertions(+), 79 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 3226a43762c..698ca40696c 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -459,9 +459,8 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"attempt_number=0", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"attempt_number=1 (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", u, EndpointLogFile, n, filename) }, http.StatusNotFound}, - // download_file parameter - {"download_file=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, - {"download_file=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // download_filename parameter + {"download_filename=custom.log", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_filename=custom.log", u, EndpointLogFile, n, filename) }, http.StatusOK}, // filter_ansi_code parameter {"filter_ansi_code=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, @@ -473,7 +472,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // Combined parameters {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, 
http.StatusOK}, - {"all parameters", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100&timeout=15&attempt_number=0&download_file=true&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"all parameters", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100&timeout=15&attempt_number=0&download_filename=custom.log&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, // Missing mandatory parameters {"missing node_id and node_ip", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, @@ -530,95 +529,79 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names } // Sub-tests for specific parameter behaviors - test.T().Run("download_file header validation", func(t *testing.T) { + test.T().Run("download_filename header validation", func(t *testing.T) { g := NewWithT(t) - g.Eventually(func(gg Gomega) { - // Test with download_file=true - urlWithDownload := fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=true", historyServerURL, EndpointLogFile, nodeID, filename) - resp, err := client.Get(urlWithDownload) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() - - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) - contentDisposition := resp.Header.Get("Content-Disposition") - gg.Expect(contentDisposition).To(ContainSubstring("attachment")) - gg.Expect(contentDisposition).To(ContainSubstring(fmt.Sprintf("filename=\"%s\"", filename))) - LogWithTimestamp(test.T(), "download_file=true sets Content-Disposition header: %s", contentDisposition) - - // Test with download_file=false, should not have Content-Disposition header - urlWithoutDownload := fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=false", historyServerURL, EndpointLogFile, nodeID, filename) - resp2, err := client.Get(urlWithoutDownload) - gg.Expect(err).NotTo(HaveOccurred()) - defer 
resp2.Body.Close() + // Test with download_filename parameter set + customFilename := "custom_download.log" + urlWithDownload := fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_filename=%s", historyServerURL, EndpointLogFile, nodeID, filename, customFilename) + resp, err := client.Get(urlWithDownload) + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() - gg.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) - contentDisposition2 := resp2.Header.Get("Content-Disposition") - gg.Expect(contentDisposition2).To(BeEmpty(), "Content-Disposition should not be set when download_file=false") - }, TestTimeoutShort).Should(Succeed()) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + contentDisposition := resp.Header.Get("Content-Disposition") + g.Expect(contentDisposition).To(ContainSubstring("attachment")) + g.Expect(contentDisposition).To(ContainSubstring(fmt.Sprintf("filename=\"%s\"", customFilename))) }) test.T().Run("filter_ansi_code behavior", func(t *testing.T) { g := NewWithT(t) - g.Eventually(func(gg Gomega) { - // Fetch with filter_ansi_code=false (original content with ANSI codes) - urlWithoutFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) - resp, err := client.Get(urlWithoutFilter) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() + // Fetch with filter_ansi_code=false (original content with ANSI codes) + urlWithoutFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) + resp, err := client.Get(urlWithoutFilter) + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) - bodyWithoutFilter, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + bodyWithoutFilter, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) - // 
Fetch with filter_ansi_code=true (ANSI codes should be removed) - urlWithFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) - resp2, err := client.Get(urlWithFilter) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp2.Body.Close() + // Fetch with filter_ansi_code=true (ANSI codes should be removed) + urlWithFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) + resp2, err := client.Get(urlWithFilter) + g.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() - gg.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) - bodyWithFilter, err := io.ReadAll(resp2.Body) - gg.Expect(err).NotTo(HaveOccurred()) + g.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) + bodyWithFilter, err := io.ReadAll(resp2.Body) + g.Expect(err).NotTo(HaveOccurred()) - // Check if original content contains ANSI codes using the same pattern as reader.go - hasAnsiInOriginal := ansiEscapePattern.Match(bodyWithoutFilter) + // Check if original content contains ANSI codes using the same pattern as reader.go + hasAnsiInOriginal := ansiEscapePattern.Match(bodyWithoutFilter) - if hasAnsiInOriginal { - LogWithTimestamp(test.T(), "Original log contains ANSI codes, verifying they are filtered") - // Filtered content should NOT contain ANSI escape sequences - hasAnsiInFiltered := ansiEscapePattern.Match(bodyWithFilter) - gg.Expect(hasAnsiInFiltered).To(BeFalse(), "Filtered content should not contain ANSI escape sequences") - } else { - LogWithTimestamp(test.T(), "Log doesn't contain ANSI codes, check is skipped...") - } - }, TestTimeoutShort).Should(Succeed()) + if hasAnsiInOriginal { + LogWithTimestamp(test.T(), "Original log contains ANSI codes, verifying they are filtered") + // Filtered content should NOT contain ANSI escape sequences + hasAnsiInFiltered := ansiEscapePattern.Match(bodyWithFilter) + 
g.Expect(hasAnsiInFiltered).To(BeFalse(), "Filtered content should not contain ANSI escape sequences") + } else { + LogWithTimestamp(test.T(), "Log doesn't contain ANSI codes, check is skipped...") + } }) test.T().Run("attempt_number behavior", func(t *testing.T) { g := NewWithT(t) - g.Eventually(func(gg Gomega) { - // Test with attempt_number=0 - urlAttempt0 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", historyServerURL, EndpointLogFile, nodeID, filename) - resp, err := client.Get(urlAttempt0) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() + // Test with attempt_number=0 + urlAttempt0 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", historyServerURL, EndpointLogFile, nodeID, filename) + resp, err := client.Get(urlAttempt0) + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(body)).To(BeNumerically(">", 0)) - LogWithTimestamp(test.T(), "attempt_number=0 returned %d bytes", len(body)) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(body)).To(BeNumerically(">", 0)) + LogWithTimestamp(test.T(), "attempt_number=0 returned %d bytes", len(body)) - // attempt_number=1 should fail as retry log doesn't exist for normal execution - urlAttempt1 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", historyServerURL, EndpointLogFile, nodeID, filename) - resp2, err := client.Get(urlAttempt1) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp2.Body.Close() + // attempt_number=1 should fail as retry log doesn't exist for normal execution + urlAttempt1 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", historyServerURL, EndpointLogFile, nodeID, filename) + resp2, err := client.Get(urlAttempt1) + g.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() - 
gg.Expect(resp2.StatusCode).To(Equal(http.StatusNotFound), - "attempt_number=1 should return 404 when retry log doesn't exist") - LogWithTimestamp(test.T(), "attempt_number=1 correctly returns 404 (file not found)") - }, TestTimeoutShort).Should(Succeed()) + g.Expect(resp2.StatusCode).To(Equal(http.StatusNotFound), + "attempt_number=1 should return 404 when retry log doesn't exist") + LogWithTimestamp(test.T(), "attempt_number=1 correctly returns 404 (file not found)") }) // Sub-test for task_id parameter @@ -835,8 +818,6 @@ func getAllEligibleActorIDs(g *WithT, client *http.Client, historyServerURL stri actorIDs = append(actorIDs, actorID) } - fmt.Printf("Actor IDs: %+v", actorIDs) - g.Expect(len(actorIDs)).To(BeNumerically(">", 0), "should have at least one eligible actor") return actorIDs @@ -869,8 +850,6 @@ func getEligibleWorkerPID(g *WithT, client *http.Client, historyServerURL string g.Expect(ok).To(BeTrue(), "result should have 'result' array") g.Expect(len(tasks)).To(BeNumerically(">", 0), "should have at least one task") - fmt.Printf("Tasks: %+v", tasks) - // Find a task with valid worker_pid and node_id for _, t := range tasks { task, ok := t.(map[string]interface{}) From 4b7a56b185ea25faae86de52aee9a36b12a66312 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:58:24 +0800 Subject: [PATCH 24/37] fix: append suffix when deal with filename Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 81b816f90d8..2ec395ed185 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -127,12 +127,6 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti // Build log path logPath := path.Join(sessionID, "logs", nodeID, filename) - // Append attempt_number if specified and not using task_id 
- // (task_id already includes attempt_number in resolution) - if options.AttemptNumber > 0 && options.TaskID == "" { - logPath = fmt.Sprintf("%s.%d", logPath, options.AttemptNumber) - } - reader := s.reader.GetContent(rayClusterNameID, logPath) if reader == nil { @@ -219,7 +213,12 @@ func (s *ServerHandler) resolveLogFilename(clusterNameID, sessionID string, opti if options.NodeID == "" { return "", "", fmt.Errorf("node_id is required when filename is provided") } - return options.NodeID, options.Filename, nil + filename := options.Filename + // Append attempt_number if specified for explicit filenames + if options.AttemptNumber > 0 { + filename = fmt.Sprintf("%s.%d", filename, options.AttemptNumber) + } + return options.NodeID, filename, nil } // If task_id is provided, resolve from task events From 4bd16dadace58f0929d3716fd091ee792754856d Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 17:01:13 +0800 Subject: [PATCH 25/37] fix: remove duplicate suffix validation Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 2ec395ed185..64ff7bb1b1c 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -203,11 +203,6 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti // This mirrors Ray Dashboard's resolve_filename logic. // The sessionID parameter is required for task_id resolution to search worker log files. 
func (s *ServerHandler) resolveLogFilename(clusterNameID, sessionID string, options GetLogFileOptions) (nodeID, filename string, err error) { - // Validate suffix - if options.Suffix != "out" && options.Suffix != "err" { - return "", "", fmt.Errorf("invalid suffix: %s (must be 'out' or 'err')", options.Suffix) - } - // If filename is explicitly provided, use it and ignore suffix if options.Filename != "" { if options.NodeID == "" { From f8dd9eeee4747b958c3436fb3ccb4dfaa9857fcb Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 17:17:10 +0800 Subject: [PATCH 26/37] refactor: remove not yet implemented comment Signed-off-by: machichima --- historyserver/pkg/historyserver/types.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go index c0751765165..fcb65009599 100644 --- a/historyserver/pkg/historyserver/types.go +++ b/historyserver/pkg/historyserver/types.go @@ -9,8 +9,8 @@ type GetLogFileOptions struct { // Log file identification (provide one of: Filename, TaskID, ActorID, PID) Filename string // The log file name (explicit path) TaskID string // Task ID to resolve log file - ActorID string // Actor ID to resolve log file (not yet implemented) - PID int // Process ID to resolve log file (not yet implemented) + ActorID string // Actor ID to resolve log file + PID int // Process ID to resolve log file // Optional parameters with defaults // Number of lines to return, default to DEFAULT_LOG_LIMIT (1000) From fe49c92fe3b1783178da1e005caa600e3554ce17 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 20:19:30 +0800 Subject: [PATCH 27/37] feat+test: add logs/stream endpoint Signed-off-by: machichima --- historyserver/pkg/historyserver/router.go | 25 ++++++ historyserver/test/e2e/historyserver_test.go | 83 +++++++++++++++++++- 2 files changed, 106 insertions(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/router.go 
b/historyserver/pkg/historyserver/router.go index 511010c8950..8fea834faf6 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -131,6 +131,18 @@ func routerAPI(s *ServerHandler) { // - Return filename as "job-driver-{submission_id}.log" Produces("text/plain"). Writes("")) // Placeholder for specific return type + ws.Route(ws.GET("/v0/logs/stream").To(s.getNodeLogStream).Filter(s.CookieHandle). + Doc("stream logs in real-time"). + Param(ws.QueryParameter("node_id", "node_id (optional if node_ip/task_id/actor_id is provided)")). + Param(ws.QueryParameter("node_ip", "node_ip (optional, resolve to node_id)")). + Param(ws.QueryParameter("filename", "filename (explicit log file path)")). + Param(ws.QueryParameter("task_id", "task_id (resolve log file from task)")). + Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor)")). + Param(ws.QueryParameter("pid", "pid (resolve log file from process id)")). + Param(ws.QueryParameter("suffix", "suffix (out or err, default: out, used with task_id/actor_id/pid)")). + Param(ws.QueryParameter("interval", "interval (polling interval in seconds)")). + Produces("text/event-stream"). + Writes("")) // Placeholder for specific return type ws.Route(ws.GET("/v0/tasks").To(s.getTaskDetail).Filter(s.CookieHandle). Doc("get task detail "). 
@@ -734,6 +746,19 @@ func parseGetLogFileOptions(req *restful.Request) (GetLogFileOptions, error) { return options, nil } +func (s *ServerHandler) getNodeLogStream(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + + // Streaming only available for live clusters + if sessionName != "live" { + resp.WriteErrorString(http.StatusNotImplemented, "Log streaming only available for live clusters") + return + } + + // Forward to Ray Dashboard + s.redirectRequest(req, resp) +} + func (s *ServerHandler) getTaskSummarize(req *restful.Request, resp *restful.Response) { clusterName := req.Attribute(COOKIE_CLUSTER_NAME_KEY).(string) clusterNamespace := req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 698ca40696c..2cc70dbda54 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -20,8 +20,9 @@ import ( ) const ( - LiveSessionName = "live" - EndpointLogFile = "/api/v0/logs/file" + LiveSessionName = "live" + EndpointLogFile = "/api/v0/logs/file" + EndpointLogStream = "/api/v0/logs/stream" ) // ansiEscapePattern matches ANSI escape sequences (same pattern as in reader.go) @@ -48,6 +49,10 @@ func TestHistoryServer(t *testing.T) { name: "/v0/logs/file endpoint (dead cluster)", testFunc: testLogFileEndpointDeadCluster, }, + { + name: "/v0/logs/stream endpoint", + testFunc: testLogStreamEndpoint, + }, } for _, tt := range tests { @@ -877,3 +882,77 @@ func getEligibleWorkerPID(g *WithT, client *http.Client, historyServerURL string g.Fail("should find at least one task with valid worker_pid and node_id") return 0, "" } + +// testLogStreamEndpoint verifies the /v0/logs/stream endpoint behavior for both live and dead clusters. +// +// The test case follows these steps: +// 1. Prepare test environment by applying a Ray cluster +// 2. 
Submit a Ray job to the existing cluster +// 3. Apply History Server and get its URL +// 4. Test live cluster: streaming should work (return 200) +// 5. Delete cluster to test dead cluster behavior +// 6. Test dead cluster: streaming should return 501 Not Implemented +// 7. Delete S3 bucket to ensure test isolation +func testLogStreamEndpoint(test Test, g *WithT, namespace *corev1.Namespace, s3Client *s3.S3) { + rayCluster := PrepareTestEnv(test, g, namespace, s3Client) + ApplyRayJobAndWaitForCompletion(test, g, namespace, rayCluster) + ApplyHistoryServer(test, g, namespace) + historyServerURL := GetHistoryServerURL(test, g, namespace) + + // Test 1: Live cluster - streaming should work + LogWithTimestamp(test.T(), "Testing /v0/logs/stream for live cluster") + clusterInfo := getClusterFromList(test, g, historyServerURL, rayCluster.Name, namespace.Name) + g.Expect(clusterInfo.SessionName).To(Equal(LiveSessionName)) + + client := CreateHTTPClientWithCookieJar(g) + setClusterContext(test, g, client, historyServerURL, namespace.Name, rayCluster.Name, clusterInfo.SessionName) + + nodeID := GetOneOfNodeID(g, client, historyServerURL) + filename := "raylet.out" + streamURL := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, EndpointLogStream, nodeID, filename) + + // Test live cluster streaming endpoint + test.T().Run("live cluster", func(t *testing.T) { + g := NewWithT(t) + + resp, err := client.Get(streamURL) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK), "Live cluster should support log streaming") + + resp.Body.Close() + }) + + // Delete RayCluster to test dead cluster behavior + err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Deleted RayCluster %s/%s", namespace.Name, rayCluster.Name) + + // Wait for cluster to be fully deleted + g.Eventually(func() error { + _, err := 
GetRayCluster(test, namespace.Name, rayCluster.Name) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + + // Get the new cluster info + deadClusterInfo := getClusterFromList(test, g, historyServerURL, rayCluster.Name, namespace.Name) + g.Expect(deadClusterInfo.SessionName).NotTo(Equal(LiveSessionName)) + + // Set context for dead cluster + setClusterContext(test, g, client, historyServerURL, namespace.Name, rayCluster.Name, deadClusterInfo.SessionName) + + // Test dead cluster streaming endpoint - should return 501 + test.T().Run("dead cluster", func(t *testing.T) { + g := NewWithT(t) + resp2, err := client.Get(streamURL) + g.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() + + g.Expect(resp2.StatusCode).To(Equal(http.StatusNotImplemented), "Dead cluster should return 501 for log streaming") + body, _ := io.ReadAll(resp2.Body) + g.Expect(string(body)).To(ContainSubstring("Log streaming only available for live clusters")) + LogWithTimestamp(test.T(), "Dead cluster correctly returns 501 Not Implemented") + }) + + DeleteS3Bucket(test, g, s3Client) + LogWithTimestamp(test.T(), "Log stream endpoint tests completed") +} From 200d56c3b317ee7025cdb1c50f97613251e56fd7 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 2 Feb 2026 19:59:13 +0800 Subject: [PATCH 28/37] fix: remove redundant status code check Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 2cc70dbda54..8e9b3679f54 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -259,9 +259,6 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names tc.name, tc.expectedStatus, resp.StatusCode, string(body)) } - g.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), - "Test case '%s' failed: expected %d, got %d", 
tc.name, tc.expectedStatus, resp.StatusCode) - if tc.expectedStatus == http.StatusOK { g.Expect(len(body)).To(BeNumerically(">", 0)) } From c41d87436688f2020081f9c1f9a047f93810ec4d Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 18:51:08 +0800 Subject: [PATCH 29/37] fix: update format of worker log in comment Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 64ff7bb1b1c..e7fe9ccf32c 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -320,15 +320,12 @@ func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID } // Fallback: Find worker log file by worker_id - // Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} - // We need to search for files matching this pattern if sessionID == "" { return "", "", fmt.Errorf( "task %s (attempt %d) has no task_log_info and sessionID is required to search for worker log files", taskID, attemptNumber, ) } - nodeIDHex, logFilename, err := s.findWorkerLogFile(clusterNameID, sessionID, foundTask.NodeID, foundTask.WorkerID, suffix) if err != nil { return "", "", fmt.Errorf( @@ -366,7 +363,6 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI } // Find worker log file by worker_id - // Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} nodeIDHex, logFilename, err := s.findWorkerLogFile( clusterNameID, sessionID, @@ -385,7 +381,8 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI } // findWorkerLogFile searches for a worker log file by worker_id. 
-// Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} +// Worker log files follow the pattern: worker-{worker_id_hex}-{job_id_hex}-{pid}.{suffix} +// Ref: https://github.com/ray-project/ray/blob/219ee7037bbdc02f66b58a814c9ad2618309c19e/src/ray/core_worker/core_worker_process.cc#L80-L80 // Returns (nodeIDHex, filename, error). func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) { // Convert to hex if not already is From d9b7dbaee56a8139aa378cc1b4a92e8c2c8b7b5b Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:04:49 +0800 Subject: [PATCH 30/37] feat: more robust ConvertBase64ToHex and centralize the logic put in utils and use in reader Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 34 +++-------------------- historyserver/pkg/utils/utils.go | 28 +++++++++++++------ 2 files changed, 24 insertions(+), 38 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index d9c45f39603..eb12c17d739 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -3,7 +3,6 @@ package historyserver import ( "bufio" "context" - "encoding/base64" "encoding/json" "errors" "fmt" @@ -14,9 +13,7 @@ import ( "sort" "strings" - "github.com/emicklei/go-restful/v3" eventtypes "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" - "github.com/ray-project/kuberay/historyserver/pkg/utils" "github.com/sirupsen/logrus" "github.com/ray-project/kuberay/historyserver/pkg/utils" @@ -44,29 +41,6 @@ func filterAnsiEscapeCodes(content []byte) []byte { return ansiEscapePattern.ReplaceAll(content, []byte("")) } -// decodeBase64ToHex converts an ID to hex format. -// Handles both cases: -// 1. Already hex format - returns as-is -// 2. 
Base64-encoded - decodes to hex -// It tries RawURLEncoding first (Ray's default), falling back to StdEncoding if that fails. -func decodeBase64ToHex(id string) (string, error) { - // Check if already hex (only [0-9a-f]) - if matched, _ := regexp.MatchString("^[0-9a-f]+$", id); matched { - return id, nil - } - - // Try base64 decode - idBytes, err := base64.RawURLEncoding.DecodeString(id) - if err != nil { - // Try standard Base64 if URL-safe fails - idBytes, err = base64.StdEncoding.DecodeString(id) - if err != nil { - return "", fmt.Errorf("failed to decode Base64 ID: %w", err) - } - } - return fmt.Sprintf("%x", idBytes), nil -} - func (s *ServerHandler) listClusters(limit int) []utils.ClusterInfo { // Initial continuation marker logrus.Debugf("Prepare to get list clusters info ...") @@ -244,7 +218,7 @@ func (s *ServerHandler) resolvePidLogFilename(clusterNameID, sessionID, nodeID s } // Convert to hex if not already is - nodeIDHex, err := decodeBase64ToHex(nodeID) + nodeIDHex, err := utils.ConvertBase64ToHex(nodeID) if err != nil { return "", "", fmt.Errorf("failed to decode node_id: %w", err) } @@ -388,13 +362,13 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI // Returns (nodeIDHex, filename, error). 
func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) { // Convert to hex if not already is - nodeIDHex, err := decodeBase64ToHex(nodeID) + nodeIDHex, err := utils.ConvertBase64ToHex(nodeID) if err != nil { return "", "", fmt.Errorf("failed to decode node_id: %w", err) } // Convert Base64 worker_id to hex - workerIDHex, err := decodeBase64ToHex(workerID) + workerIDHex, err := utils.ConvertBase64ToHex(workerID) if err != nil { return "", "", fmt.Errorf("failed to decode worker_id: %w", err) } @@ -500,7 +474,7 @@ func (s *ServerHandler) ipToNodeId(rayClusterNameID, sessionID, nodeIP string) ( } // Convert to hex if not already is - nodeIDHex, err := decodeBase64ToHex(nodeIDBytes) + nodeIDHex, err := utils.ConvertBase64ToHex(nodeIDBytes) if err != nil { logrus.Warnf("Failed to decode node_id %s: %v", nodeIDBytes, err) continue diff --git a/historyserver/pkg/utils/utils.go b/historyserver/pkg/utils/utils.go index e769c88141c..8f0c59a828b 100644 --- a/historyserver/pkg/utils/utils.go +++ b/historyserver/pkg/utils/utils.go @@ -3,10 +3,10 @@ package utils import ( "bytes" "encoding/base64" - "encoding/hex" "fmt" "os" "path" + "regexp" "strings" "time" @@ -243,15 +243,27 @@ func GetRayNodeID() (string, error) { return "", fmt.Errorf("timeout --node_id= not found") } -func ConvertBase64ToHex(input string) (string, error) { - bytes, err := base64.StdEncoding.DecodeString(input) - if err != nil { - return input, err +// ConvertBase64ToHex converts an ID to hex format. +// Handles both cases: +// 1. Already hex format - returns as-is +// 2. Base64-encoded - decodes to hex +// It tries RawURLEncoding first (Ray's default), falling back to StdEncoding if that fails. 
+func ConvertBase64ToHex(id string) (string, error) { + // Check if already hex (only [0-9a-f]) + if matched, _ := regexp.MatchString("^[0-9a-f]+$", id); matched { + return id, nil } - hexStr := hex.EncodeToString(bytes) - - return hexStr, nil + // Try base64 decode + idBytes, err := base64.RawURLEncoding.DecodeString(id) + if err != nil { + // Try standard Base64 if URL-safe fails + idBytes, err = base64.StdEncoding.DecodeString(id) + if err != nil { + return "", fmt.Errorf("failed to decode Base64 ID: %w", err) + } + } + return fmt.Sprintf("%x", idBytes), nil } // BuildClusterSessionKey constructs the key used to identify a specific cluster session. From f1799d19ce360f1f925d6c696f82be837fc9b4e9 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:19:38 +0800 Subject: [PATCH 31/37] fix: return original id if cannot decode Signed-off-by: machichima --- historyserver/pkg/utils/utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/historyserver/pkg/utils/utils.go b/historyserver/pkg/utils/utils.go index 8f0c59a828b..9cab5f36c90 100644 --- a/historyserver/pkg/utils/utils.go +++ b/historyserver/pkg/utils/utils.go @@ -260,7 +260,7 @@ func ConvertBase64ToHex(id string) (string, error) { // Try standard Base64 if URL-safe fails idBytes, err = base64.StdEncoding.DecodeString(id) if err != nil { - return "", fmt.Errorf("failed to decode Base64 ID: %w", err) + return id, fmt.Errorf("failed to decode Base64 ID: %w", err) } } return fmt.Sprintf("%x", idBytes), nil From e848198f574cc415cbb38fd967b95ca82f639451 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:38:44 +0800 Subject: [PATCH 32/37] fix: escape filename correctly Signed-off-by: machichima --- historyserver/pkg/historyserver/router.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index 5526e088f64..3f9cc013ca4 100644 --- 
a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "io/fs" + "mime" "net/http" "strconv" "strings" @@ -765,7 +766,19 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo if downloadFilename == "" { downloadFilename = DEFAULT_DOWNLOAD_FILENAME } - resp.AddHeader("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", downloadFilename)) + // Format filename correct to escape " or \ + disposition := mime.FormatMediaType("attachment", map[string]string{ + "filename": downloadFilename, + }) + + if disposition == "" { + logrus.Errorf("Failed to format Content-Disposition header for filename %q: %v", downloadFilename, err) + + // Fallback to the default filename + disposition = fmt.Sprintf("attachment; filename=\"%s\"", DEFAULT_DOWNLOAD_FILENAME) + } + + resp.AddHeader("Content-Disposition", disposition) resp.Write(content) } From 4e30f185e3194a56a36672f427325eb10ad3f3a0 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:39:19 +0800 Subject: [PATCH 33/37] fix: add sessionID == "" check in resolveActorLogFilename Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index eb12c17d739..b0d955c4ea3 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -338,6 +338,13 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI ) } + if sessionID == "" { + return "", "", fmt.Errorf( + "sessionID is required to search for worker log files for actor %s", + actorID, + ) + } + // Find worker log file by worker_id nodeIDHex, logFilename, err := s.findWorkerLogFile( clusterNameID, From 1432037b67e911fa2d5a03b47423ffc864ea819e Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:53:44 +0800 Subject: [PATCH 
34/37] fix: use correct cluster name to query task and actor id Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index b0d955c4ea3..25ecc437240 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -241,8 +241,13 @@ func (s *ServerHandler) resolvePidLogFilename(clusterNameID, sessionID, nodeID s // This mirrors Ray Dashboard's _resolve_task_filename logic. // The sessionID parameter is required for searching worker log files when task_log_info is not available. func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID string, attemptNumber int, suffix string) (nodeID, filename string, err error) { + // Construct full cluster session key for event lookup + // We append the sessionID to the clusterNameID (which is "name_namespace") + // to match the key format used by utils.BuildClusterSessionKey. + fullKey := fmt.Sprintf("%s_%s", clusterNameID, sessionID) + // Get task attempts by task ID - taskAttempts, found := s.eventHandler.GetTaskByID(clusterNameID, taskID) + taskAttempts, found := s.eventHandler.GetTaskByID(fullKey, taskID) if !found { return "", "", fmt.Errorf("task not found: task_id=%s", taskID) } @@ -316,8 +321,13 @@ func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID // resolveActorLogFilename resolves log file for an actor by querying actor events. // This mirrors Ray Dashboard's _resolve_actor_filename logic. func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorID, suffix string) (nodeID, filename string, err error) { + // Construct full cluster session key for event lookup + // We append the sessionID to the clusterNameID (which is "name_namespace") + // to match the key format used by utils.BuildClusterSessionKey. 
+ fullKey := fmt.Sprintf("%s_%s", clusterNameID, sessionID) + // Get actor by actor ID - actor, found := s.eventHandler.GetActorByID(clusterNameID, actorID) + actor, found := s.eventHandler.GetActorByID(fullKey, actorID) if !found { return "", "", fmt.Errorf("actor not found: actor_id=%s", actorID) } From aba1b9ed180b84ad8c02b5b02b82e45e4f649806 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:54:28 +0800 Subject: [PATCH 35/37] test: filename header use no "" Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 6a627bf1284..2c637d726bc 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -587,7 +587,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) contentDisposition := resp.Header.Get("Content-Disposition") g.Expect(contentDisposition).To(ContainSubstring("attachment")) - g.Expect(contentDisposition).To(ContainSubstring(fmt.Sprintf("filename=\"%s\"", customFilename))) + g.Expect(contentDisposition).To(ContainSubstring(fmt.Sprintf("filename=%s", customFilename))) }) test.T().Run("filter_ansi_code behavior", func(t *testing.T) { From 145ab04fe16aeb7708cde989626e6ca634fc6b4c Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 20:22:31 +0800 Subject: [PATCH 36/37] fix: redundant err and correct regex for base64 Signed-off-by: machichima --- historyserver/pkg/historyserver/router.go | 2 +- historyserver/pkg/utils/utils.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index 3f9cc013ca4..15d1277df73 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ 
-772,7 +772,7 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo }) if disposition == "" { - logrus.Errorf("Failed to format Content-Disposition header for filename %q: %v", downloadFilename, err) + logrus.Errorf("Failed to format Content-Disposition header for filename %q", downloadFilename) // Fallback to the default filename disposition = fmt.Sprintf("attachment; filename=\"%s\"", DEFAULT_DOWNLOAD_FILENAME) diff --git a/historyserver/pkg/utils/utils.go b/historyserver/pkg/utils/utils.go index 9cab5f36c90..ba5c8820f26 100644 --- a/historyserver/pkg/utils/utils.go +++ b/historyserver/pkg/utils/utils.go @@ -250,7 +250,7 @@ func GetRayNodeID() (string, error) { // It tries RawURLEncoding first (Ray's default), falling back to StdEncoding if that fails. func ConvertBase64ToHex(id string) (string, error) { // Check if already hex (only [0-9a-f]) - if matched, _ := regexp.MatchString("^[0-9a-f]+$", id); matched { + if matched, _ := regexp.MatchString("^[0-9a-fA-F]+$", id); matched { return id, nil } From 9980d22f8b848f312cf9bb67cafb5204e59af419 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 4 Feb 2026 20:23:12 +0800 Subject: [PATCH 37/37] fix: properly encode url parameter for task and actor id Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 2c637d726bc..c97e70cb06f 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "net/http" + "net/url" "regexp" "testing" @@ -324,7 +325,7 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names for _, taskID := range taskIDs { LogWithTimestamp(t, "Testing task_id: %s", taskID) - url := fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, taskID) + url := 
fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, url.QueryEscape(taskID)) resp, err := client.Get(url) if err != nil { lastError = fmt.Sprintf("HTTP error for task %s: %v", taskID, err) @@ -363,7 +364,7 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names for _, actorID := range actorIDs { LogWithTimestamp(t, "Testing actor_id: %s", actorID) - url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, actorID) + url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, url.QueryEscape(actorID)) resp, err := client.Get(url) if err != nil { lastError = fmt.Sprintf("HTTP error for actor %s: %v", actorID, err) @@ -665,7 +666,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names for _, taskID := range taskIDs { LogWithTimestamp(t, "Testing task_id: %s", taskID) - url := fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, taskID) + url := fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, url.QueryEscape(taskID)) resp, err := client.Get(url) if err != nil { lastError = fmt.Sprintf("HTTP error for task %s: %v", taskID, err) @@ -704,7 +705,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names for _, actorID := range actorIDs { LogWithTimestamp(t, "Testing actor_id: %s", actorID) - url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, actorID) + url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, url.QueryEscape(actorID)) resp, err := client.Get(url) if err != nil { lastError = fmt.Sprintf("HTTP error for actor %s: %v", actorID, err)