From 1eb0afb49288fc4a5f878a4215607026101773c1 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 29 Jan 2026 19:14:39 +0800 Subject: [PATCH 01/37] test: add more test case for live cluster e2e test Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 88 +++++++++++++++----- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index e2cb61afd3c..00ffc59816b 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -171,38 +171,80 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names // Hardcode "raylet.out" for deterministic testing. filename := "raylet.out" - test.T().Run("should return log content", func(t *testing.T) { - g := NewWithT(t) - g.Eventually(func(gg Gomega) { - logFileURL := fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) - resp, err := client.Get(logFileURL) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) - - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(body)).To(BeNumerically(">", 0)) - }, TestTimeoutShort).Should(Succeed()) - }) - - test.T().Run("should reject path traversal", func(t *testing.T) { - g := NewWithT(t) - maliciousPaths := []string{"../etc/passwd", "..", "/etc/passwd", "../../secret"} + // Define test cases covering different parameters + logFileTestCases := []struct { + name string + buildURL func(baseURL, nodeID string) string + expectedStatus int + }{ + // lines parameter + {"lines=100", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"lines=0 (default 1000)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s", u, EndpointLogFile, n, 
filename) }, http.StatusOK}, + {"lines=-1 (all)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=-1", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // timeout parameter + {"timeout=5", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=5", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"timeout=30", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=30", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // attempt_number parameter + {"attempt_number=0", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"attempt_number=1", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // download_filename parameter + {"download_filename=custom.log", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_filename=custom.log", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // filter_ansi_code parameter + {"filter_ansi_code=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"filter_ansi_code=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // Combined parameters + {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // Missing mandatory parameters + {"missing node_id", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + {"missing filename", func(u, n string) string { return 
fmt.Sprintf("%s%s?node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"missing both", func(u, n string) string { return fmt.Sprintf("%s%s", u, EndpointLogFile) }, http.StatusBadRequest}, + + // Invalid parameters + // NOTE: Ray Dashboard will only return 500 (Internal Server Error) for the exceptions (including file not found and invalid parameter) + // ref: https://github.com/ray-project/ray/blob/68d01c4c48a59c7768ec9c2359a1859966c446b6/python/ray/dashboard/modules/state/state_head.py#L282-L284 + {"invalid lines (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, + {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, + {"invalid attempt_number (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, + {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusInternalServerError}, + + // Path traversal attacks + {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal ..", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=..", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal /etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=/etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal ../../secret", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../../secret", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal in node_id", func(u, n string) string { 
return fmt.Sprintf("%s%s?node_id=../evil&filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + } - for _, malicious := range maliciousPaths { + for _, tc := range logFileTestCases { + test.T().Run(tc.name, func(t *testing.T) { + g := NewWithT(t) g.Eventually(func(gg Gomega) { - url := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, EndpointLogFile, nodeID, malicious) + url := tc.buildURL(historyServerURL, nodeID) resp, err := client.Get(url) gg.Expect(err).NotTo(HaveOccurred()) defer func() { io.Copy(io.Discard, resp.Body) resp.Body.Close() }() - gg.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) + + gg.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), + "Test case '%s' failed: expected status %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) + + if tc.expectedStatus == http.StatusOK { + body, err := io.ReadAll(resp.Body) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(body)).To(BeNumerically(">", 0), + "Test case '%s' should return non-empty body", tc.name) + } }, TestTimeoutShort).Should(Succeed()) - } - }) + }) + } DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Log file endpoint tests completed") From 64df48dc6fc30c105189a9ff7b7a5ec50b6349e9 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 29 Jan 2026 19:21:01 +0800 Subject: [PATCH 02/37] test: more test case for dead cluster e2e test Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 72 +++++++++++++------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 00ffc59816b..4f8a811ce67 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -257,9 +257,10 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names // 2. Submit a Ray job to the existing cluster // 3. Delete RayCluster to trigger log upload to S3 // 4. 
Apply History Server and get its URL -// 5. Verify that the history server can fetch log content from S3 (raylet.out) -// 6. Verify that the history server rejects path traversal attempts from S3 -// 7. Delete S3 bucket to ensure test isolation +// 5. Verify that the history server can fetch log content from S3 +// 6. Verify parameter validation for dead cluster +// 7. Verify security (path traversal) protection +// 8. Delete S3 bucket to ensure test isolation func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Namespace, s3Client *s3.S3) { rayCluster := PrepareTestEnv(test, g, namespace, s3Client) ApplyRayJobAndWaitForCompletion(test, g, namespace, rayCluster) @@ -288,38 +289,61 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // Hardcode "raylet.out" for deterministic testing. filename := "raylet.out" - test.T().Run("should return log content from S3", func(t *testing.T) { - g := NewWithT(t) - g.Eventually(func(gg Gomega) { - logFileURL := fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) - resp, err := client.Get(logFileURL) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + // Define test cases for dead cluster (S3 storage) + // Note: Dead cluster only supports basic parameters (node_id, filename, lines) + // Advanced features like timeout, attempt_number, download_filename, filter_ansi_code + // are only available for live clusters + logFileTestCases := []struct { + name string + buildURL func(baseURL, nodeID string) string + expectedStatus int + }{ + // Supported parameters + {"lines=100", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"lines=0 (default 1000)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s", u, EndpointLogFile, n, filename) }, 
http.StatusOK}, + {"lines=-1 (all)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=-1", u, EndpointLogFile, n, filename) }, http.StatusOK}, - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(body)).To(BeNumerically(">", 0)) - }, TestTimeoutShort).Should(Succeed()) - }) + // Missing mandatory parameters + {"missing node_id", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + {"missing filename", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"missing both", func(u, n string) string { return fmt.Sprintf("%s%s", u, EndpointLogFile) }, http.StatusBadRequest}, - test.T().Run("should reject path traversal from S3", func(t *testing.T) { - g := NewWithT(t) - maliciousPaths := []string{"../etc/passwd", "..", "/etc/passwd", "../../secret"} + // Invalid parameters + {"invalid lines (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusNotFound}, - for _, malicious := range maliciousPaths { + // Path traversal attacks + {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal ..", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=..", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal /etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=/etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal ../../secret", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../../secret", u, 
EndpointLogFile, n) }, http.StatusBadRequest}, + {"traversal in node_id", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=../evil&filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + } + + for _, tc := range logFileTestCases { + test.T().Run(tc.name, func(t *testing.T) { + g := NewWithT(t) g.Eventually(func(gg Gomega) { - url := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, EndpointLogFile, nodeID, malicious) + url := tc.buildURL(historyServerURL, nodeID) resp, err := client.Get(url) gg.Expect(err).NotTo(HaveOccurred()) defer func() { io.Copy(io.Discard, resp.Body) resp.Body.Close() }() - gg.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) + + gg.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), + "Test case '%s' failed: expected status %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) + + if tc.expectedStatus == http.StatusOK { + body, err := io.ReadAll(resp.Body) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(body)).To(BeNumerically(">", 0), + "Test case '%s' should return non-empty body", tc.name) + } }, TestTimeoutShort).Should(Succeed()) - } - }) + }) + } DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") From 4a4f72924422af8b63d3e5e884f0e2396c00a298 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 29 Jan 2026 19:30:07 +0800 Subject: [PATCH 03/37] refactor: clean up comments Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 4f8a811ce67..c9738b264b0 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -171,7 +171,6 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names // Hardcode "raylet.out" for deterministic testing. 
filename := "raylet.out" - // Define test cases covering different parameters logFileTestCases := []struct { name string buildURL func(baseURL, nodeID string) string @@ -289,10 +288,9 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // Hardcode "raylet.out" for deterministic testing. filename := "raylet.out" - // Define test cases for dead cluster (S3 storage) - // Note: Dead cluster only supports basic parameters (node_id, filename, lines) - // Advanced features like timeout, attempt_number, download_filename, filter_ansi_code - // are only available for live clusters + // NOTE: Dead clusters only support basic parameters (node_id, filename, lines). + // Advanced features like timeout, attempt_number, download_filename, and filter_ansi_code + // are currently available only for live clusters; they will be supported for dead clusters in the future. logFileTestCases := []struct { name string buildURL func(baseURL, nodeID string) string From 0d893385116bc66a0e311859a2912fcd061c8ffd Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 30 Jan 2026 18:25:23 +0800 Subject: [PATCH 04/37] feat: move query options to struct&add more options Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 8 ++- historyserver/pkg/historyserver/router.go | 82 +++++++++++++++++------ historyserver/pkg/historyserver/types.go | 20 ++++++ 3 files changed, 87 insertions(+), 23 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 2914eca18af..a167f023210 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -73,8 +73,9 @@ func (s *ServerHandler) _getNodeLogs(rayClusterNameID, sessionId, nodeId, dir st return json.Marshal(ret) } -func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID, nodeID, filename string, maxLines int) ([]byte, error) { - logPath := path.Join(sessionID, "logs", nodeID, filename) +func (s 
*ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, options GetLogFileOptions) ([]byte, error) { + // Build log path with attempt_number if specified + logPath := path.Join(sessionID, "logs", options.NodeID, options.Filename) reader := s.reader.GetContent(rayClusterNameID, logPath) @@ -82,8 +83,9 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID, nodeID, fil return nil, utils.NewHTTPError(fmt.Errorf("log file not found: %s", logPath), http.StatusNotFound) } + maxLines := options.Lines if maxLines < 0 { - // -1 means read all lines, match Ray Dashboard API behavior + // -1 means read all lines return io.ReadAll(reader) } diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index daeb6954e87..cf369ef6680 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -109,6 +109,10 @@ func routerAPI(s *ServerHandler) { Doc("get logfile").Param(ws.QueryParameter("node_id", "node_id")). Param(ws.QueryParameter("filename", "filename")). Param(ws.QueryParameter("lines", "lines")). + Param(ws.QueryParameter("timeout", "timeout")). + Param(ws.QueryParameter("attempt_number", "attempt_number")). + Param(ws.QueryParameter("download_file", "download_file")). + Param(ws.QueryParameter("filter_ansi_code", "filter_ansi_code")). Produces("text/plain"). 
Writes("")) // Placeholder for specific return type @@ -573,24 +577,26 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo clusterNamespace := req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string) sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) - // Parse query parameters - nodeID := req.QueryParameter("node_id") - filename := req.QueryParameter("filename") - lines := req.QueryParameter("lines") + // Parse query parameters into GetLogFileOptions struct + options, err := parseGetLogFileOptions(req) + if err != nil { + resp.WriteErrorString(http.StatusBadRequest, err.Error()) + return + } // Validate required parameters - if nodeID == "" { + if options.NodeID == "" { resp.WriteErrorString(http.StatusBadRequest, "Missing required parameter: node_id") return } - if filename == "" { + if options.Filename == "" { resp.WriteErrorString(http.StatusBadRequest, "Missing required parameter: filename") return } // Prevent path traversal attacks (e.g., ../../etc/passwd) - if !fs.ValidPath(nodeID) || !fs.ValidPath(filename) { - resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (node_id=%s, filename=%s)", nodeID, filename)) + if !fs.ValidPath(options.NodeID) || !fs.ValidPath(options.Filename) { + resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (node_id=%s, filename=%s)", options.NodeID, options.Filename)) return } @@ -599,18 +605,7 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } - // Convert lines parameter to int - maxLines := 0 - if lines != "" { - parsedLines, err := strconv.Atoi(lines) - if err != nil { - resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid lines parameter: %s", lines)) - return - } - maxLines = parsedLines - } - - content, err := s._getNodeLogFile(clusterNameID+"_"+clusterNamespace, sessionName, nodeID, filename, maxLines) + content, err := 
s._getNodeLogFile(clusterNameID+"_"+clusterNamespace, sessionName, options) if err != nil { var httpErr *utils.HTTPError if errors.As(err, &httpErr) { @@ -625,6 +620,53 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo resp.Write(content) } +// parseGetLogFileOptions parses query parameters into GetLogFileOptions struct +func parseGetLogFileOptions(req *restful.Request) (GetLogFileOptions, error) { + options := GetLogFileOptions{ + NodeID: req.QueryParameter("node_id"), + Filename: req.QueryParameter("filename"), + } + + // Parse lines parameter + if linesStr := req.QueryParameter("lines"); linesStr != "" { + lines, err := strconv.Atoi(linesStr) + if err != nil { + return options, fmt.Errorf("invalid lines parameter: %s", linesStr) + } + options.Lines = lines + } + + // Parse timeout parameter + if timeoutStr := req.QueryParameter("timeout"); timeoutStr != "" { + timeout, err := strconv.Atoi(timeoutStr) + if err != nil { + return options, fmt.Errorf("invalid timeout parameter: %s", timeoutStr) + } + options.Timeout = timeout + } + + // Parse attempt_number parameter + if attemptStr := req.QueryParameter("attempt_number"); attemptStr != "" { + attemptNumber, err := strconv.Atoi(attemptStr) + if err != nil { + return options, fmt.Errorf("invalid attempt_number parameter: %s", attemptStr) + } + options.AttemptNumber = attemptNumber + } + + // Parse filter_ansi_code parameter (boolean) + if filterStr := req.QueryParameter("filter_ansi_code"); filterStr != "" { + options.FilterAnsiCode = strings.ToLower(filterStr) == "true" + } + + // Parse download_file parameter (boolean) + if downloadStr := req.QueryParameter("download_file"); downloadStr != "" { + options.DownloadFile = strings.ToLower(downloadStr) == "true" + } + + return options, nil +} + func (s *ServerHandler) getTaskSummarize(req *restful.Request, resp *restful.Response) { clusterName := req.Attribute(COOKIE_CLUSTER_NAME_KEY).(string) clusterNamespace := 
req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string) diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go index 0c9831adfbb..d87998bf793 100644 --- a/historyserver/pkg/historyserver/types.go +++ b/historyserver/pkg/historyserver/types.go @@ -1,5 +1,25 @@ package historyserver +// GetLogFileOptions contains all options for fetching log files +type GetLogFileOptions struct { + // Required parameters + NodeID string // The node id where the log file is located + Filename string // The log file name + + // Optional parameters with defaults + // Number of lines to return, default to DEFAULT_LOG_LIMIT (1000) + // -1 = all lines + Lines int + // Timeout in seconds for the request, default to 0 (no timeout) + Timeout int + // Attempt number for task retries, default to 0 (first attempt) + AttemptNumber int + // Whether to filter ANSI escape codes from logs, default to False + FilterAnsiCode bool + // Whether to set Content-Disposition header for file download, default to false + DownloadFile bool +} + type ReplyTaskInfo struct { Data TaskInfoData `json:"data"` Msg string `json:"msg"` From 5dc5f12fbcda1b31d32840560420481fd444804 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 30 Jan 2026 18:33:46 +0800 Subject: [PATCH 05/37] feat: implement attempt_number, download_file, filter_ansi_code function Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 30 +++++++++++++++++++++-- historyserver/pkg/historyserver/router.go | 6 +++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index a167f023210..2d039e3db9e 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -8,6 +8,7 @@ import ( "io" "net/http" "path" + "regexp" "sort" "strings" @@ -26,6 +27,15 @@ const ( MAX_LOG_LIMIT = 10000 ) +// ANSI escape code pattern for filtering colored output from logs +// 
Matches patterns like: \x1b[31m (red color), \x1b[0m (reset), etc. +var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]+m`) + +// filterAnsiEscapeCodes removes ANSI escape sequences from log content +func filterAnsiEscapeCodes(content []byte) []byte { + return ansiEscapePattern.ReplaceAll(content, []byte("")) +} + func (s *ServerHandler) listClusters(limit int) []utils.ClusterInfo { // Initial continuation marker logrus.Debugf("Prepare to get list clusters info ...") @@ -76,6 +86,10 @@ func (s *ServerHandler) _getNodeLogs(rayClusterNameID, sessionId, nodeId, dir st func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, options GetLogFileOptions) ([]byte, error) { // Build log path with attempt_number if specified logPath := path.Join(sessionID, "logs", options.NodeID, options.Filename) + if options.AttemptNumber > 0 { + // Append attempt number to filename (e.g., worker.log.1) + logPath = fmt.Sprintf("%s.%d", logPath, options.AttemptNumber) + } reader := s.reader.GetContent(rayClusterNameID, logPath) @@ -86,7 +100,14 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti maxLines := options.Lines if maxLines < 0 { // -1 means read all lines - return io.ReadAll(reader) + content, err := io.ReadAll(reader) + if err != nil { + return nil, err + } + if options.FilterAnsiCode { + content = filterAnsiEscapeCodes(content) + } + return content, nil } if maxLines == 0 { @@ -134,7 +155,12 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti lines = append(buffer[start:], buffer[:start]...) 
} - return []byte(strings.Join(lines, "\n")), nil + result := []byte(strings.Join(lines, "\n")) + if options.FilterAnsiCode { + result = filterAnsiEscapeCodes(result) + } + + return result, nil } func (s *ServerHandler) GetNodes(rayClusterNameID, sessionId string) ([]byte, error) { diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index cf369ef6680..a17c1e74672 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -617,6 +617,12 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo } return } + + // Set Content-Disposition header if download_file is requested + if options.DownloadFile { + resp.AddHeader("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", options.Filename)) + } + resp.Write(content) } From 6de7601461c52acd2aa940184c85dd6422bf5079 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 30 Jan 2026 19:41:13 +0800 Subject: [PATCH 06/37] feat: e2e test for attempt_number, download_file, filter_ansi_code timeout only test validation, not testing the behavior Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 125 ++++++++++++++++++- 1 file changed, 121 insertions(+), 4 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index c9738b264b0..9900c64ddf7 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "net/http" + "regexp" "testing" "github.com/aws/aws-sdk-go/service/s3" @@ -23,6 +24,10 @@ const ( EndpointLogFile = "/api/v0/logs/file" ) +// ansiEscapePattern matches ANSI escape sequences (same pattern as in reader.go) +// Pattern: \x1b\[[0-9;]+m +var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]+m`) + func TestHistoryServer(t *testing.T) { // Share a single S3 client among subtests. 
s3Client := EnsureS3Client(t) @@ -288,19 +293,37 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // Hardcode "raylet.out" for deterministic testing. filename := "raylet.out" - // NOTE: Dead clusters only support basic parameters (node_id, filename, lines). - // Advanced features like timeout, attempt_number, download_filename, and filter_ansi_code - // are currently available only for live clusters; they will be supported for dead clusters in the future. logFileTestCases := []struct { name string buildURL func(baseURL, nodeID string) string expectedStatus int }{ - // Supported parameters + // Basic parameters {"lines=100", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"lines=0 (default 1000)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"lines=-1 (all)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=-1", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // timeout parameter + // NOTE: timeout feature is not yet implemented, we just accept and validate the timeout parameter + {"timeout=5", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=5", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"timeout=30", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=30", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // attempt_number parameter + {"attempt_number=0", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"attempt_number=1 (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", u, EndpointLogFile, n, filename) }, http.StatusNotFound}, + + // download_file parameter + {"download_file=true", 
func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"download_file=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // filter_ansi_code parameter + {"filter_ansi_code=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"filter_ansi_code=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + + // Combined parameters + {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"all parameters", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100&timeout=15&attempt_number=0&download_file=true&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // Missing mandatory parameters {"missing node_id", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, {"missing filename", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -308,6 +331,8 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // Invalid parameters {"invalid lines (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"invalid attempt_number 
(string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusNotFound}, // Path traversal attacks @@ -343,6 +368,98 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names }) } + // Sub-tests for specific parameter behaviors + test.T().Run("download_file header validation", func(t *testing.T) { + g := NewWithT(t) + g.Eventually(func(gg Gomega) { + // Test with download_file=true + urlWithDownload := fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=true", historyServerURL, EndpointLogFile, nodeID, filename) + resp, err := client.Get(urlWithDownload) + gg.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() + + gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + contentDisposition := resp.Header.Get("Content-Disposition") + gg.Expect(contentDisposition).To(ContainSubstring("attachment")) + gg.Expect(contentDisposition).To(ContainSubstring(fmt.Sprintf("filename=\"%s\"", filename))) + LogWithTimestamp(test.T(), "download_file=true sets Content-Disposition header: %s", contentDisposition) + + // Test with download_file=false, should not have Content-Disposition header + urlWithoutDownload := fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=false", historyServerURL, EndpointLogFile, nodeID, filename) + resp2, err := client.Get(urlWithoutDownload) + gg.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() + + gg.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) + contentDisposition2 := resp2.Header.Get("Content-Disposition") + gg.Expect(contentDisposition2).To(BeEmpty(), "Content-Disposition should not be set when download_file=false") + }, TestTimeoutShort).Should(Succeed()) + }) + + test.T().Run("filter_ansi_code behavior", func(t *testing.T) { + g := NewWithT(t) + 
g.Eventually(func(gg Gomega) { + // Fetch with filter_ansi_code=false (original content with ANSI codes) + urlWithoutFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) + resp, err := client.Get(urlWithoutFilter) + gg.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() + + gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + bodyWithoutFilter, err := io.ReadAll(resp.Body) + gg.Expect(err).NotTo(HaveOccurred()) + + // Fetch with filter_ansi_code=true (ANSI codes should be removed) + urlWithFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) + resp2, err := client.Get(urlWithFilter) + gg.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() + + gg.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) + bodyWithFilter, err := io.ReadAll(resp2.Body) + gg.Expect(err).NotTo(HaveOccurred()) + + // Check if original content contains ANSI codes using the same pattern as reader.go + hasAnsiInOriginal := ansiEscapePattern.Match(bodyWithoutFilter) + + if hasAnsiInOriginal { + LogWithTimestamp(test.T(), "Original log contains ANSI codes, verifying they are filtered") + // Filtered content should NOT contain ANSI escape sequences + hasAnsiInFiltered := ansiEscapePattern.Match(bodyWithFilter) + gg.Expect(hasAnsiInFiltered).To(BeFalse(), "Filtered content should not contain ANSI escape sequences") + } else { + LogWithTimestamp(test.T(), "Log doesn't contain ANSI codes, check is skipped...") + } + }, TestTimeoutShort).Should(Succeed()) + }) + + test.T().Run("attempt_number behavior", func(t *testing.T) { + g := NewWithT(t) + g.Eventually(func(gg Gomega) { + // Test with attempt_number=0 + urlAttempt0 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", historyServerURL, EndpointLogFile, nodeID, filename) + resp, err := client.Get(urlAttempt0) + gg.Expect(err).NotTo(HaveOccurred()) + defer 
resp.Body.Close() + + gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + body, err := io.ReadAll(resp.Body) + gg.Expect(err).NotTo(HaveOccurred()) + gg.Expect(len(body)).To(BeNumerically(">", 0)) + LogWithTimestamp(test.T(), "attempt_number=0 returned %d bytes", len(body)) + + // attempt_number=1 should fail as retry log doesn't exist for normal execution + urlAttempt1 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", historyServerURL, EndpointLogFile, nodeID, filename) + resp2, err := client.Get(urlAttempt1) + gg.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() + + gg.Expect(resp2.StatusCode).To(Equal(http.StatusNotFound), + "attempt_number=1 should return 404 when retry log doesn't exist") + LogWithTimestamp(test.T(), "attempt_number=1 correctly returns 404 (file not found)") + }, TestTimeoutShort).Should(Succeed()) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") } From 8d11aff732a3f27ed6a329e0b1fe1809b84692a8 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 30 Jan 2026 20:04:06 +0800 Subject: [PATCH 07/37] fix: update live cluster invalide param status code Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 9900c64ddf7..e311304a244 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -210,11 +210,11 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names {"missing both", func(u, n string) string { return fmt.Sprintf("%s%s", u, EndpointLogFile) }, http.StatusBadRequest}, // Invalid parameters - // NOTE: Ray Dashboard will only return 500 (Internal Server Error) for the exceptions (including file not found and invalid parameter) + {"invalid lines (string)", func(u, n string) string { 
return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"invalid attempt_number (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + // NOTE: Ray Dashboard will return 500 (Internal Server Error) for the file not found error // ref: https://github.com/ray-project/ray/blob/68d01c4c48a59c7768ec9c2359a1859966c446b6/python/ray/dashboard/modules/state/state_head.py#L282-L284 - {"invalid lines (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, - {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, - {"invalid attempt_number (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, EndpointLogFile, n, filename) }, http.StatusInternalServerError}, {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusInternalServerError}, // Path traversal attacks From ff75e12cf483b81224bf0598ba6ba50e9a816f9c Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 15:16:19 +0800 Subject: [PATCH 08/37] feat: add id related param&implement task_id+suffix Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 112 +++++++++++++++++++++- historyserver/pkg/historyserver/router.go | 60 +++++++++--- historyserver/pkg/historyserver/types.go | 12 ++- 3 files changed, 165 insertions(+), 19 deletions(-) diff --git 
a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 2d039e3db9e..44f0331d4b5 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -13,6 +13,7 @@ import ( "strings" "github.com/emicklei/go-restful/v3" + eventtypes "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" "github.com/ray-project/kuberay/historyserver/pkg/utils" "github.com/sirupsen/logrus" ) @@ -84,10 +85,18 @@ func (s *ServerHandler) _getNodeLogs(rayClusterNameID, sessionId, nodeId, dir st } func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, options GetLogFileOptions) ([]byte, error) { - // Build log path with attempt_number if specified - logPath := path.Join(sessionID, "logs", options.NodeID, options.Filename) - if options.AttemptNumber > 0 { - // Append attempt number to filename (e.g., worker.log.1) + // Resolve node_id and filename based on options + nodeID, filename, err := s.resolveLogFilename(rayClusterNameID, options) + if err != nil { + return nil, utils.NewHTTPError(err, http.StatusBadRequest) + } + + // Build log path + logPath := path.Join(sessionID, "logs", nodeID, filename) + + // Append attempt_number if specified and not using task_id + // (task_id already includes attempt_number in resolution) + if options.AttemptNumber > 0 && options.TaskID == "" { logPath = fmt.Sprintf("%s.%d", logPath, options.AttemptNumber) } @@ -163,6 +172,101 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti return result, nil } +// resolveLogFilename resolves the log file node_id and filename based on the provided options. +// This mirrors Ray Dashboard's resolve_filename logic. 
+func (s *ServerHandler) resolveLogFilename(clusterNameID string, options GetLogFileOptions) (nodeID, filename string, err error) { + // Validate suffix + if options.Suffix != "out" && options.Suffix != "err" { + return "", "", fmt.Errorf("invalid suffix: %s (must be 'out' or 'err')", options.Suffix) + } + + // If filename is explicitly provided, use it and ignore suffix + if options.Filename != "" { + if options.NodeID == "" { + return "", "", fmt.Errorf("node_id is required when filename is provided") + } + return options.NodeID, options.Filename, nil + } + + // If task_id is provided, resolve from task events + if options.TaskID != "" { + return s.resolveTaskLogFilename(clusterNameID, options.TaskID, options.AttemptNumber, options.Suffix) + } + + // If actor_id is provided, resolve from actor events + // TODO: not implemented + if options.ActorID != "" { + return "", "", fmt.Errorf("actor_id resolution not yet implemented") + } + + // If pid is provided, resolve worker log file + // TODO: not implemented + if options.PID > 0 { + return "", "", fmt.Errorf("pid resolution not yet implemented") + } + + return "", "", fmt.Errorf("must provide one of: filename, task_id, actor_id, or pid") +} + +// resolveTaskLogFilename resolves log file for a task by querying task events. +// This mirrors Ray Dashboard's _resolve_task_filename logic. 
+func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, taskID string, attemptNumber int, suffix string) (nodeID, filename string, err error) { + // Get task attempts by task ID + taskAttempts, found := s.eventHandler.GetTaskByID(clusterNameID, taskID) + if !found { + return "", "", fmt.Errorf("task not found: task_id=%s", taskID) + } + + // Find the specific attempt + var foundTask *eventtypes.Task + for i, task := range taskAttempts { + if task.AttemptNumber == attemptNumber { + foundTask = &taskAttempts[i] + break + } + } + + if foundTask == nil { + return "", "", fmt.Errorf("task attempt not found: task_id=%s, attempt_number=%d", taskID, attemptNumber) + } + + // Check if task has node_id + if foundTask.NodeID == "" { + return "", "", fmt.Errorf("task %s (attempt %d) has no node_id (task not scheduled yet)", taskID, attemptNumber) + } + + // Check if task has TaskLogInfo + if foundTask.TaskLogInfo == nil { + // Check if this is an actor task + if foundTask.ActorID != "" { + return "", "", fmt.Errorf( + "for actor task, please query actor log for actor(%s) by providing actor_id query parameter", + foundTask.ActorID, + ) + } + return "", "", fmt.Errorf( + "task %s (attempt %d) has no task_log_info (worker_id=%s, node_id=%s)", + taskID, attemptNumber, foundTask.WorkerID, foundTask.NodeID, + ) + } + + // Get the log filename from task_log_info based on suffix + filenameKey := "stdout_file" + if suffix == "err" { + filenameKey = "stderr_file" + } + + logFilename, ok := foundTask.TaskLogInfo[filenameKey] + if !ok || logFilename == "" { + return "", "", fmt.Errorf( + "missing log filename info (%s) in task_log_info for task %s (attempt %d)", + filenameKey, taskID, attemptNumber, + ) + } + + return foundTask.NodeID, logFilename, nil +} + func (s *ServerHandler) GetNodes(rayClusterNameID, sessionId string) ([]byte, error) { logPath := path.Join(sessionId, "logs") nodes := s.reader.ListFiles(rayClusterNameID, logPath) diff --git 
a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index a17c1e74672..44f756ab0fb 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -106,13 +106,18 @@ func routerAPI(s *ServerHandler) { Doc("get appliations").Param(ws.QueryParameter("node_id", "node_id")). Writes("")) // Placeholder for specific return type ws.Route(ws.GET("/v0/logs/file").To(s.getNodeLogFile).Filter(s.CookieHandle). - Doc("get logfile").Param(ws.QueryParameter("node_id", "node_id")). - Param(ws.QueryParameter("filename", "filename")). - Param(ws.QueryParameter("lines", "lines")). + Doc("get logfile"). + Param(ws.QueryParameter("node_id", "node_id (optional if task_id is provided)")). + Param(ws.QueryParameter("filename", "filename (explicit log file path)")). + Param(ws.QueryParameter("task_id", "task_id (resolve log file from task)")). + Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor, not yet implemented)")). + Param(ws.QueryParameter("pid", "pid (resolve log file from process id, not yet implemented)")). + Param(ws.QueryParameter("suffix", "suffix (out or err, default: out, used with task_id/actor_id/pid)")). + Param(ws.QueryParameter("lines", "lines (number of lines to return, default: 1000)")). Param(ws.QueryParameter("timeout", "timeout")). - Param(ws.QueryParameter("attempt_number", "attempt_number")). - Param(ws.QueryParameter("download_file", "download_file")). - Param(ws.QueryParameter("filter_ansi_code", "filter_ansi_code")). + Param(ws.QueryParameter("attempt_number", "attempt_number (task retry attempt number, default: 0)")). + Param(ws.QueryParameter("download_file", "download_file (true/false)")). + Param(ws.QueryParameter("filter_ansi_code", "filter_ansi_code (true/false)")). Produces("text/plain"). 
Writes("")) // Placeholder for specific return type @@ -584,19 +589,26 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } - // Validate required parameters - if options.NodeID == "" { - resp.WriteErrorString(http.StatusBadRequest, "Missing required parameter: node_id") + // Validate required parameters following Ray Dashboard logic + // At least one of: actor_id, task_id, pid, filename, submission_id must be provided + if options.ActorID == "" && options.TaskID == "" && options.PID == 0 && options.Filename == "" { + resp.WriteErrorString(http.StatusBadRequest, "At least one of actor_id, task_id, pid, or filename is required") return } - if options.Filename == "" { - resp.WriteErrorString(http.StatusBadRequest, "Missing required parameter: filename") + + // node_id is required when not using actor_id or task_id (they can auto-resolve node_id) + if options.NodeID == "" && options.ActorID == "" && options.TaskID == "" { + resp.WriteErrorString(http.StatusBadRequest, "node_id is required when actor_id or task_id is not provided") return } // Prevent path traversal attacks (e.g., ../../etc/passwd) - if !fs.ValidPath(options.NodeID) || !fs.ValidPath(options.Filename) { - resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (node_id=%s, filename=%s)", options.NodeID, options.Filename)) + if options.NodeID != "" && !fs.ValidPath(options.NodeID) { + resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (node_id=%s)", options.NodeID)) + return + } + if options.Filename != "" && !fs.ValidPath(options.Filename) { + resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (filename=%s)", options.Filename)) return } @@ -631,6 +643,28 @@ func parseGetLogFileOptions(req *restful.Request) (GetLogFileOptions, error) { options := GetLogFileOptions{ NodeID: req.QueryParameter("node_id"), Filename: 
req.QueryParameter("filename"), + TaskID: req.QueryParameter("task_id"), + ActorID: req.QueryParameter("actor_id"), + Suffix: req.QueryParameter("suffix"), + } + + // Parse PID parameter + if pidStr := req.QueryParameter("pid"); pidStr != "" { + pid, err := strconv.Atoi(pidStr) + if err != nil { + return options, fmt.Errorf("invalid pid parameter: %s", pidStr) + } + options.PID = pid + } + + // Default suffix to "out" if not specified + if options.Suffix == "" { + options.Suffix = "out" + } + + // Validate suffix parameter + if options.Suffix != "out" && options.Suffix != "err" { + return options, fmt.Errorf("invalid suffix parameter: %s (must be 'out' or 'err')", options.Suffix) } // Parse lines parameter diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go index d87998bf793..24c3bc057a0 100644 --- a/historyserver/pkg/historyserver/types.go +++ b/historyserver/pkg/historyserver/types.go @@ -2,9 +2,14 @@ package historyserver // GetLogFileOptions contains all options for fetching log files type GetLogFileOptions struct { - // Required parameters + // Node identification (one of these is required if not using task_id/actor_id) NodeID string // The node id where the log file is located - Filename string // The log file name + + // Log file identification (provide one of: Filename, TaskID, ActorID, PID) + Filename string // The log file name (explicit path) + TaskID string // Task ID to resolve log file + ActorID string // Actor ID to resolve log file (not yet implemented) + PID int // Process ID to resolve log file (not yet implemented) // Optional parameters with defaults // Number of lines to return, default to DEFAULT_LOG_LIMIT (1000) @@ -18,6 +23,9 @@ type GetLogFileOptions struct { FilterAnsiCode bool // Whether to set Content-Disposition header for file download, default to false DownloadFile bool + // The suffix of the log file ("out" or "err"), default to "out" + // Used when resolving by TaskID, ActorID, or PID + 
Suffix string } type ReplyTaskInfo struct { From 4f562802cc6f95c332ec2e6a956673778a79c10f Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 15:25:51 +0800 Subject: [PATCH 09/37] test: remove eventual & print body when status code mismatch Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 84 +++++++++++--------- 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index e311304a244..2774e19f7e8 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -228,25 +228,29 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names for _, tc := range logFileTestCases { test.T().Run(tc.name, func(t *testing.T) { g := NewWithT(t) - g.Eventually(func(gg Gomega) { - url := tc.buildURL(historyServerURL, nodeID) - resp, err := client.Get(url) - gg.Expect(err).NotTo(HaveOccurred()) - defer func() { - io.Copy(io.Discard, resp.Body) - resp.Body.Close() - }() - - gg.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), - "Test case '%s' failed: expected status %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) - - if tc.expectedStatus == http.StatusOK { - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(body)).To(BeNumerically(">", 0), - "Test case '%s' should return non-empty body", tc.name) - } - }, TestTimeoutShort).Should(Succeed()) + + url := tc.buildURL(historyServerURL, nodeID) + resp, err := client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + defer func() { + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + }() + + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) + + if resp.StatusCode != tc.expectedStatus { + LogWithTimestamp(t, "Test case '%s' failed: expected %d, got %d, body: %s", + tc.name, tc.expectedStatus, resp.StatusCode, string(body)) + } + + 
g.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), + "Test case '%s' failed: expected %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) + + if tc.expectedStatus == http.StatusOK { + g.Expect(len(body)).To(BeNumerically(">", 0)) + } }) } @@ -346,25 +350,29 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names for _, tc := range logFileTestCases { test.T().Run(tc.name, func(t *testing.T) { g := NewWithT(t) - g.Eventually(func(gg Gomega) { - url := tc.buildURL(historyServerURL, nodeID) - resp, err := client.Get(url) - gg.Expect(err).NotTo(HaveOccurred()) - defer func() { - io.Copy(io.Discard, resp.Body) - resp.Body.Close() - }() - - gg.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), - "Test case '%s' failed: expected status %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) - - if tc.expectedStatus == http.StatusOK { - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(body)).To(BeNumerically(">", 0), - "Test case '%s' should return non-empty body", tc.name) - } - }, TestTimeoutShort).Should(Succeed()) + + url := tc.buildURL(historyServerURL, nodeID) + resp, err := client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + defer func() { + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + }() + + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) + + if resp.StatusCode != tc.expectedStatus { + LogWithTimestamp(t, "Test case '%s' failed: expected %d, got %d, body: %s", + tc.name, tc.expectedStatus, resp.StatusCode, string(body)) + } + + g.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), + "Test case '%s' failed: expected %d, got %d", tc.name, tc.expectedStatus, resp.StatusCode) + + if tc.expectedStatus == http.StatusOK { + g.Expect(len(body)).To(BeNumerically(">", 0)) + } }) } From e01c1132b94debfcb2af281ec29206033636a55c Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 18:11:50 +0800 Subject: [PATCH 10/37] feat: logic to find logs 
based on worker ID Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 114 +++++++++++++++++----- 1 file changed, 90 insertions(+), 24 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 44f0331d4b5..8194233e759 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -3,6 +3,7 @@ package historyserver import ( "bufio" "context" + "encoding/base64" "encoding/json" "fmt" "io" @@ -86,7 +87,7 @@ func (s *ServerHandler) _getNodeLogs(rayClusterNameID, sessionId, nodeId, dir st func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, options GetLogFileOptions) ([]byte, error) { // Resolve node_id and filename based on options - nodeID, filename, err := s.resolveLogFilename(rayClusterNameID, options) + nodeID, filename, err := s.resolveLogFilename(rayClusterNameID, sessionID, options) if err != nil { return nil, utils.NewHTTPError(err, http.StatusBadRequest) } @@ -174,7 +175,8 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti // resolveLogFilename resolves the log file node_id and filename based on the provided options. // This mirrors Ray Dashboard's resolve_filename logic. -func (s *ServerHandler) resolveLogFilename(clusterNameID string, options GetLogFileOptions) (nodeID, filename string, err error) { +// The sessionID parameter is required for task_id resolution to search worker log files. 
+func (s *ServerHandler) resolveLogFilename(clusterNameID, sessionID string, options GetLogFileOptions) (nodeID, filename string, err error) { // Validate suffix if options.Suffix != "out" && options.Suffix != "err" { return "", "", fmt.Errorf("invalid suffix: %s (must be 'out' or 'err')", options.Suffix) @@ -190,7 +192,7 @@ func (s *ServerHandler) resolveLogFilename(clusterNameID string, options GetLogF // If task_id is provided, resolve from task events if options.TaskID != "" { - return s.resolveTaskLogFilename(clusterNameID, options.TaskID, options.AttemptNumber, options.Suffix) + return s.resolveTaskLogFilename(clusterNameID, sessionID, options.TaskID, options.AttemptNumber, options.Suffix) } // If actor_id is provided, resolve from actor events @@ -210,7 +212,8 @@ func (s *ServerHandler) resolveLogFilename(clusterNameID string, options GetLogF // resolveTaskLogFilename resolves log file for a task by querying task events. // This mirrors Ray Dashboard's _resolve_task_filename logic. -func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, taskID string, attemptNumber int, suffix string) (nodeID, filename string, err error) { +// The sessionID parameter is required for searching worker log files when task_log_info is not available. 
+func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID string, attemptNumber int, suffix string) (nodeID, filename string, err error) { // Get task attempts by task ID taskAttempts, found := s.eventHandler.GetTaskByID(clusterNameID, taskID) if !found { @@ -235,36 +238,99 @@ func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, taskID string, att return "", "", fmt.Errorf("task %s (attempt %d) has no node_id (task not scheduled yet)", taskID, attemptNumber) } - // Check if task has TaskLogInfo - if foundTask.TaskLogInfo == nil { - // Check if this is an actor task - if foundTask.ActorID != "" { - return "", "", fmt.Errorf( - "for actor task, please query actor log for actor(%s) by providing actor_id query parameter", - foundTask.ActorID, - ) - } + // Check if this is an actor task + if foundTask.ActorID != "" { + return "", "", fmt.Errorf( + "for actor task, please query actor log for actor(%s) by providing actor_id query parameter", + foundTask.ActorID, + ) + } + + // Check if task has worker_id + if foundTask.WorkerID == "" { return "", "", fmt.Errorf( - "task %s (attempt %d) has no task_log_info (worker_id=%s, node_id=%s)", - taskID, attemptNumber, foundTask.WorkerID, foundTask.NodeID, + "task %s (attempt %d) has no worker_id", + taskID, attemptNumber, ) } - // Get the log filename from task_log_info based on suffix - filenameKey := "stdout_file" - if suffix == "err" { - filenameKey = "stderr_file" + // Try to use task_log_info if available + // NOTE: task_log_info is currently not supported in ray export event, so we will always + // fallback to following logic. 
+ if foundTask.TaskLogInfo != nil && len(foundTask.TaskLogInfo) > 0 { + filenameKey := "stdout_file" + if suffix == "err" { + filenameKey = "stderr_file" + } + + if logFilename, ok := foundTask.TaskLogInfo[filenameKey]; ok && logFilename != "" { + return foundTask.NodeID, logFilename, nil + } } - logFilename, ok := foundTask.TaskLogInfo[filenameKey] - if !ok || logFilename == "" { + // Fallback: Find worker log file by worker_id + // Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} + // We need to search for files matching this pattern + if sessionID == "" { return "", "", fmt.Errorf( - "missing log filename info (%s) in task_log_info for task %s (attempt %d)", - filenameKey, taskID, attemptNumber, + "task %s (attempt %d) has no task_log_info and sessionID is required to search for worker log files", + taskID, attemptNumber, ) } - return foundTask.NodeID, logFilename, nil + nodeIDHex, logFilename, err := s.findWorkerLogFile(clusterNameID, sessionID, foundTask.NodeID, foundTask.WorkerID, suffix) + if err != nil { + return "", "", fmt.Errorf( + "failed to find worker log file for task %s (attempt %d, worker_id=%s, node_id=%s): %w", + taskID, attemptNumber, foundTask.WorkerID, foundTask.NodeID, err, + ) + } + + return nodeIDHex, logFilename, nil +} + +// findWorkerLogFile searches for a worker log file by worker_id. +// Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} +// Returns (nodeIDHex, filename, error). 
+func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) { + // Convert Base64 node_id to hex for the file path + // Ray stores IDs in Base64 (URL-safe) in events, but uses hex in log directory structure + nodeIDBytes, err := base64.RawURLEncoding.DecodeString(nodeID) + if err != nil { + // Try standard Base64 if URL-safe fails + nodeIDBytes, err = base64.StdEncoding.DecodeString(nodeID) + if err != nil { + return "", "", fmt.Errorf("failed to decode node_id: %w", err) + } + } + nodeIDHex := fmt.Sprintf("%x", nodeIDBytes) + + // Convert Base64 worker_id to hex + workerIDBytes, err := base64.RawURLEncoding.DecodeString(workerID) + if err != nil { + // Try standard Base64 if URL-safe fails + workerIDBytes, err = base64.StdEncoding.DecodeString(workerID) + if err != nil { + return "", "", fmt.Errorf("failed to decode worker_id: %w", err) + } + } + workerIDHex := fmt.Sprintf("%x", workerIDBytes) + + // List all files in the node's log directory + logPath := path.Join(sessionID, "logs", nodeIDHex) + files := s.reader.ListFiles(clusterNameID, logPath) + + // Search for files matching pattern: worker-{worker_id_hex}-*.{suffix} + workerPrefix := fmt.Sprintf("worker-%s-", workerIDHex) + workerSuffix := fmt.Sprintf(".%s", suffix) + + for _, file := range files { + if strings.HasPrefix(file, workerPrefix) && strings.HasSuffix(file, workerSuffix) { + return nodeIDHex, file, nil + } + } + + return "", "", fmt.Errorf("worker log file not found: worker_id=%s (hex=%s), suffix=%s, searched in %s", workerID, workerIDHex, suffix, logPath) } func (s *ServerHandler) GetNodes(rayClusterNameID, sessionId string) ([]byte, error) { From 41e5970fd89318f1d55d9715953da849d628a7bb Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 18:35:30 +0800 Subject: [PATCH 11/37] test: for suffix and task_id Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 158 +++++++++++++++++++ 1 file changed, 
158 insertions(+) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 2774e19f7e8..dc5f7682210 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -201,6 +201,10 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names {"filter_ansi_code=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"filter_ansi_code=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // suffix parameter + {"suffix=out (default)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=out", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"suffix=err", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=err", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // Combined parameters {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, @@ -213,9 +217,11 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names {"invalid lines (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=abc", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"invalid attempt_number (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"invalid suffix", func(u, n string) string { return 
fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, // NOTE: Ray Dashboard will return 500 (Internal Server Error) for the file not found error // ref: https://github.com/ray-project/ray/blob/68d01c4c48a59c7768ec9c2359a1859966c446b6/python/ray/dashboard/modules/state/state_head.py#L282-L284 {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusInternalServerError}, + {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusInternalServerError}, // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -254,6 +260,45 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names }) } + // Sub-test for task_id parameter (live cluster) + test.T().Run("task_id parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get all eligible task IDs + taskIDs := getAllEligibleTaskIDs(g, client, historyServerURL) + LogWithTimestamp(t, "Found %d eligible task IDs for testing", len(taskIDs)) + + var successCount int + var lastError string + + // Try each task ID until one succeeds + for _, taskID := range taskIDs { + LogWithTimestamp(t, "Testing task_id: %s", taskID) + + url := fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, taskID) + resp, err := client.Get(url) + if err != nil { + lastError = fmt.Sprintf("HTTP error for task %s: %v", taskID, err) + continue + } + + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + successCount++ + LogWithTimestamp(t, "✓ Task %s succeeded, returned %d bytes", taskID, len(body)) + break + } else { + lastError = fmt.Sprintf("task %s returned %d: %s", taskID, resp.StatusCode, 
string(body)) + LogWithTimestamp(t, "✗ Task %s failed: %s", taskID, lastError) + } + } + + g.Expect(successCount).To(BeNumerically(">", 0), + "At least one task_id should succeed. Last error: %s", lastError) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Log file endpoint tests completed") } @@ -324,6 +369,10 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"filter_ansi_code=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"filter_ansi_code=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // suffix parameter + {"suffix=out (default)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=out", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"suffix=err", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=err", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // Combined parameters {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"all parameters", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100&timeout=15&attempt_number=0&download_file=true&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, @@ -338,6 +387,8 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"invalid timeout (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&timeout=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"invalid attempt_number (string)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=xyz", u, 
EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusNotFound}, + {"invalid suffix", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, + {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusBadRequest}, // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -468,6 +519,113 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names }, TestTimeoutShort).Should(Succeed()) }) + // Sub-test for task_id parameter + test.T().Run("task_id parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get all eligible task IDs + taskIDs := getAllEligibleTaskIDs(g, client, historyServerURL) + LogWithTimestamp(t, "Found %d eligible task IDs for testing", len(taskIDs)) + + var successCount int + var lastError string + + // Try each task ID until one succeeds + for _, taskID := range taskIDs { + LogWithTimestamp(t, "Testing task_id: %s", taskID) + + url := fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, taskID) + resp, err := client.Get(url) + if err != nil { + lastError = fmt.Sprintf("HTTP error for task %s: %v", taskID, err) + continue + } + + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + successCount++ + LogWithTimestamp(t, "✓ Task %s succeeded, returned %d bytes", taskID, len(body)) + break + } else { + lastError = fmt.Sprintf("task %s returned %d: %s", taskID, resp.StatusCode, string(body)) + LogWithTimestamp(t, "✗ Task %s failed: %s", taskID, lastError) + } + } + + 
g.Expect(successCount).To(BeNumerically(">", 0), + "At least one task_id should succeed. Last error: %s", lastError) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") } + +// getAllEligibleTaskIDs retrieves all non-actor task IDs with node_id from the /api/v0/tasks endpoint. +// Returns a list of task IDs that are eligible for log file testing. +// Note: We filter out actor tasks because they don't have task_log_info by default +// (unless RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 is set). +func getAllEligibleTaskIDs(g *WithT, client *http.Client, historyServerURL string) []string { + var taskIDs []string + g.Eventually(func(gg Gomega) { + resp, err := client.Get(historyServerURL + "/api/v0/tasks") + gg.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() + + gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + + body, err := io.ReadAll(resp.Body) + gg.Expect(err).NotTo(HaveOccurred()) + + var result map[string]interface{} + err = json.Unmarshal(body, &result) + gg.Expect(err).NotTo(HaveOccurred()) + + // Extract task_id from response + // Response format: {"result": true, "msg": "...", "data": {"result": {"result": [tasks...], ...}}} + data, ok := result["data"].(map[string]interface{}) + gg.Expect(ok).To(BeTrue(), "response should have 'data' field") + + dataResult, ok := data["result"].(map[string]interface{}) + gg.Expect(ok).To(BeTrue(), "data should have 'result' field") + + tasks, ok := dataResult["result"].([]interface{}) + gg.Expect(ok).To(BeTrue(), "result should have 'result' array") + gg.Expect(len(tasks)).To(BeNumerically(">", 0), "should have at least one task") + + // Find all non-actor tasks with node_id + for _, t := range tasks { + task, ok := t.(map[string]interface{}) + if !ok { + continue + } + + // Check if this is an actor task + actorID, _ := task["actor_id"].(string) + if actorID != "" { + // Skip actor tasks - they don't have task_log_info unless + // 
RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 is set + continue + } + + // Check if it has node_id + nodeID, _ := task["node_id"].(string) + if nodeID == "" { + // If nodeID is empty, it means the task is not scheduled yet. Skip it + // as it will not have logs + continue + } + + // Found a non-actor task with logs + taskID, ok := task["task_id"].(string) + if ok && taskID != "" { + taskIDs = append(taskIDs, taskID) + } + } + + gg.Expect(len(taskIDs)).To(BeNumerically(">", 0), "should have at least one eligible task") + }, TestTimeoutShort).Should(Succeed()) + + return taskIDs +} From 6497225dd7ea8801ed48d1daa36d91705eaf3b12 Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 18:51:08 +0800 Subject: [PATCH 12/37] fix: update rayjob.yaml to ensure produce log.out file Signed-off-by: machichima --- historyserver/config/rayjob.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/historyserver/config/rayjob.yaml b/historyserver/config/rayjob.yaml index 3ce3619364a..61357a3ca0a 100644 --- a/historyserver/config/rayjob.yaml +++ b/historyserver/config/rayjob.yaml @@ -11,6 +11,8 @@ spec: @ray.remote(num_cpus=0.5) def my_task(x): + # NOTE: Add this to ensure will produce log.out + print(f'Processing {x}', flush=True) return x * 2 @ray.remote(num_cpus=0.5) From 6b66a305fb179ea27744d13de4c7e806b3b11c28 Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 20:16:01 +0800 Subject: [PATCH 13/37] feat+test: support actor_id query Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 47 +++- historyserver/pkg/historyserver/router.go | 4 +- historyserver/test/e2e/historyserver_test.go | 214 ++++++++++++++----- 3 files changed, 211 insertions(+), 54 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 8194233e759..a0d92003d90 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -196,9 +196,8 @@ func (s *ServerHandler) 
resolveLogFilename(clusterNameID, sessionID string, opti } // If actor_id is provided, resolve from actor events - // TODO: not implemented if options.ActorID != "" { - return "", "", fmt.Errorf("actor_id resolution not yet implemented") + return s.resolveActorLogFilename(clusterNameID, sessionID, options.ActorID, options.Suffix) } // If pid is provided, resolve worker log file @@ -289,6 +288,50 @@ func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID return nodeIDHex, logFilename, nil } +// resolveActorLogFilename resolves log file for an actor by querying actor events. +// This mirrors Ray Dashboard's _resolve_actor_filename logic. +func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorID, suffix string) (nodeID, filename string, err error) { + // Get actor by actor ID + actor, found := s.eventHandler.GetActorByID(clusterNameID, actorID) + if !found { + return "", "", fmt.Errorf("actor not found: actor_id=%s", actorID) + } + + // Check if actor has node_id (means it was scheduled) + if actor.Address.NodeID == "" { + return "", "", fmt.Errorf( + "actor %s has no node_id (actor not scheduled yet)", + actorID, + ) + } + + // Check if actor has worker_id + if actor.Address.WorkerID == "" { + return "", "", fmt.Errorf( + "actor %s has no worker_id (actor not scheduled yet)", + actorID, + ) + } + + // Find worker log file by worker_id + // Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} + nodeIDHex, logFilename, err := s.findWorkerLogFile( + clusterNameID, + sessionID, + actor.Address.NodeID, + actor.Address.WorkerID, + suffix, + ) + if err != nil { + return "", "", fmt.Errorf( + "failed to find worker log file for actor %s (worker_id=%s, node_id=%s): %w", + actorID, actor.Address.WorkerID, actor.Address.NodeID, err, + ) + } + + return nodeIDHex, logFilename, nil +} + // findWorkerLogFile searches for a worker log file by worker_id. 
// Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} // Returns (nodeIDHex, filename, error). diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index 44f756ab0fb..6da191b20c1 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -107,10 +107,10 @@ func routerAPI(s *ServerHandler) { Writes("")) // Placeholder for specific return type ws.Route(ws.GET("/v0/logs/file").To(s.getNodeLogFile).Filter(s.CookieHandle). Doc("get logfile"). - Param(ws.QueryParameter("node_id", "node_id (optional if task_id is provided)")). + Param(ws.QueryParameter("node_id", "node_id (optional if task_id or actor_id is provided)")). Param(ws.QueryParameter("filename", "filename (explicit log file path)")). Param(ws.QueryParameter("task_id", "task_id (resolve log file from task)")). - Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor, not yet implemented)")). + Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor)")). Param(ws.QueryParameter("pid", "pid (resolve log file from process id, not yet implemented)")). Param(ws.QueryParameter("suffix", "suffix (out or err, default: out, used with task_id/actor_id/pid)")). Param(ws.QueryParameter("lines", "lines (number of lines to return, default: 1000)")). 
diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index dc5f7682210..e889a244d43 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -287,11 +287,11 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names if resp.StatusCode == http.StatusOK { successCount++ - LogWithTimestamp(t, "✓ Task %s succeeded, returned %d bytes", taskID, len(body)) + LogWithTimestamp(t, "Task %s succeeded, returned %d bytes", taskID, len(body)) break } else { lastError = fmt.Sprintf("task %s returned %d: %s", taskID, resp.StatusCode, string(body)) - LogWithTimestamp(t, "✗ Task %s failed: %s", taskID, lastError) + LogWithTimestamp(t, "Task %s failed: %s", taskID, lastError) } } @@ -299,6 +299,45 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names "At least one task_id should succeed. Last error: %s", lastError) }) + // Sub-test for actor_id parameter (live cluster) + test.T().Run("actor_id parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get all eligible actor IDs + actorIDs := getAllEligibleActorIDs(g, client, historyServerURL) + LogWithTimestamp(t, "Found %d eligible actor IDs for testing", len(actorIDs)) + + var successCount int + var lastError string + + // Try each actor ID until one succeeds + for _, actorID := range actorIDs { + LogWithTimestamp(t, "Testing actor_id: %s", actorID) + + url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, actorID) + resp, err := client.Get(url) + if err != nil { + lastError = fmt.Sprintf("HTTP error for actor %s: %v", actorID, err) + continue + } + + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + successCount++ + LogWithTimestamp(t, "Actor %s succeeded, returned %d bytes", actorID, len(body)) + break + } else { + lastError = fmt.Sprintf("actor %s returned %d: %s", actorID, resp.StatusCode, string(body)) 
+ LogWithTimestamp(t, "Actor %s failed: %s", actorID, lastError) + } + } + + g.Expect(successCount).To(BeNumerically(">", 0), + "At least one actor_id should succeed. Last error: %s", lastError) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Log file endpoint tests completed") } @@ -546,11 +585,11 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names if resp.StatusCode == http.StatusOK { successCount++ - LogWithTimestamp(t, "✓ Task %s succeeded, returned %d bytes", taskID, len(body)) + LogWithTimestamp(t, "Task %s succeeded, returned %d bytes", taskID, len(body)) break } else { lastError = fmt.Sprintf("task %s returned %d: %s", taskID, resp.StatusCode, string(body)) - LogWithTimestamp(t, "✗ Task %s failed: %s", taskID, lastError) + LogWithTimestamp(t, "Task %s failed: %s", taskID, lastError) } } @@ -558,6 +597,45 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names "At least one task_id should succeed. Last error: %s", lastError) }) + // Sub-test for actor_id parameter (dead cluster) + test.T().Run("actor_id parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get all eligible actor IDs + actorIDs := getAllEligibleActorIDs(g, client, historyServerURL) + LogWithTimestamp(t, "Found %d eligible actor IDs for testing", len(actorIDs)) + + var successCount int + var lastError string + + // Try each actor ID until one succeeds + for _, actorID := range actorIDs { + LogWithTimestamp(t, "Testing actor_id: %s", actorID) + + url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, actorID) + resp, err := client.Get(url) + if err != nil { + lastError = fmt.Sprintf("HTTP error for actor %s: %v", actorID, err) + continue + } + + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + successCount++ + LogWithTimestamp(t, "Actor %s succeeded, returned %d bytes", actorID, len(body)) + break + } else { + lastError = fmt.Sprintf("actor %s 
returned %d: %s", actorID, resp.StatusCode, string(body)) + LogWithTimestamp(t, "Actor %s failed: %s", actorID, lastError) + } + } + + g.Expect(successCount).To(BeNumerically(">", 0), + "At least one actor_id should succeed. Last error: %s", lastError) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") } @@ -568,64 +646,100 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // (unless RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 is set). func getAllEligibleTaskIDs(g *WithT, client *http.Client, historyServerURL string) []string { var taskIDs []string - g.Eventually(func(gg Gomega) { - resp, err := client.Get(historyServerURL + "/api/v0/tasks") - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() + resp, err := client.Get(historyServerURL + "/api/v0/tasks") + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) - var result map[string]interface{} - err = json.Unmarshal(body, &result) - gg.Expect(err).NotTo(HaveOccurred()) + var result map[string]interface{} + err = json.Unmarshal(body, &result) + g.Expect(err).NotTo(HaveOccurred()) - // Extract task_id from response - // Response format: {"result": true, "msg": "...", "data": {"result": {"result": [tasks...], ...}}} - data, ok := result["data"].(map[string]interface{}) - gg.Expect(ok).To(BeTrue(), "response should have 'data' field") + // Extract task_id from response + // Response format: {"result": true, "msg": "...", "data": {"result": {"result": [tasks...], ...}}} + data, ok := result["data"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "response should have 'data' field") - dataResult, ok := data["result"].(map[string]interface{}) - 
gg.Expect(ok).To(BeTrue(), "data should have 'result' field") + dataResult, ok := data["result"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "data should have 'result' field") - tasks, ok := dataResult["result"].([]interface{}) - gg.Expect(ok).To(BeTrue(), "result should have 'result' array") - gg.Expect(len(tasks)).To(BeNumerically(">", 0), "should have at least one task") + tasks, ok := dataResult["result"].([]interface{}) + g.Expect(ok).To(BeTrue(), "result should have 'result' array") + g.Expect(len(tasks)).To(BeNumerically(">", 0), "should have at least one task") - // Find all non-actor tasks with node_id - for _, t := range tasks { - task, ok := t.(map[string]interface{}) - if !ok { - continue - } + // Find all non-actor tasks with node_id + for _, t := range tasks { + task, ok := t.(map[string]interface{}) + if !ok { + continue + } - // Check if this is an actor task - actorID, _ := task["actor_id"].(string) - if actorID != "" { - // Skip actor tasks - they don't have task_log_info unless - // RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 is set - continue - } + // Check if this is an actor task + actorID, _ := task["actor_id"].(string) + if actorID != "" { + // Skip actor tasks - they don't have task_log_info unless + // RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 is set + continue + } - // Check if it has node_id - nodeID, _ := task["node_id"].(string) - if nodeID == "" { - // If nodeID is empty, it means the task is not scheduled yet. Skip it - // as it will not have logs - continue - } + // Check if it has node_id + nodeID, _ := task["node_id"].(string) + if nodeID == "" { + // If nodeID is empty, it means the task is not scheduled yet. 
Skip it + // as it will not have logs + continue + } - // Found a non-actor task with logs - taskID, ok := task["task_id"].(string) - if ok && taskID != "" { - taskIDs = append(taskIDs, taskID) - } + // Found a non-actor task with logs + taskID, ok := task["task_id"].(string) + if ok && taskID != "" { + taskIDs = append(taskIDs, taskID) } + } - gg.Expect(len(taskIDs)).To(BeNumerically(">", 0), "should have at least one eligible task") - }, TestTimeoutShort).Should(Succeed()) + g.Expect(len(taskIDs)).To(BeNumerically(">", 0), "should have at least one eligible task") return taskIDs } + +// getAllEligibleActorIDs retrieves all actor IDs with node_id and worker_id from the /logical/actors endpoint. +// Returns a list of actor IDs that are eligible for log file testing (actors that have been scheduled and are running). +func getAllEligibleActorIDs(g *WithT, client *http.Client, historyServerURL string) []string { + var actorIDs []string + resp, err := client.Get(historyServerURL + "/logical/actors") + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() + + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) + + var result map[string]interface{} + err = json.Unmarshal(body, &result) + g.Expect(err).NotTo(HaveOccurred()) + + // Extract actor_id from response + // Response format: {"result": true, "msg": "...", "data": {"actors": {actor_id: {...}, ...}}} + data, ok := result["data"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "response should have 'data' field") + + actors, ok := data["actors"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "data should have 'actors' field") + g.Expect(len(actors)).To(BeNumerically(">", 0), "should have at least one actor") + + // Find all actors with node_id and worker_id (scheduled actors) + for actorID := range actors { + actorIDs = append(actorIDs, actorID) + } + + fmt.Printf("Actor IDs: %+v", actorIDs) + + 
g.Expect(len(actorIDs)).To(BeNumerically(">", 0), "should have at least one eligible actor") + + return actorIDs +} From d204ef118100c44e3915c80ac115af15797e412e Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 31 Jan 2026 22:16:25 +0800 Subject: [PATCH 14/37] feat+test: support pid query Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 41 ++++++- historyserver/pkg/historyserver/router.go | 2 +- historyserver/test/e2e/historyserver_test.go | 108 +++++++++++++++++++ 3 files changed, 148 insertions(+), 3 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index a0d92003d90..0235ff26d26 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -5,6 +5,7 @@ import ( "context" "encoding/base64" "encoding/json" + "errors" "fmt" "io" "net/http" @@ -89,6 +90,11 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti // Resolve node_id and filename based on options nodeID, filename, err := s.resolveLogFilename(rayClusterNameID, sessionID, options) if err != nil { + // Preserve HTTPError status code if already set, otherwise use BadRequest + var httpErr *utils.HTTPError + if errors.As(err, &httpErr) { + return nil, err + } return nil, utils.NewHTTPError(err, http.StatusBadRequest) } @@ -201,14 +207,45 @@ func (s *ServerHandler) resolveLogFilename(clusterNameID, sessionID string, opti } // If pid is provided, resolve worker log file - // TODO: not implemented if options.PID > 0 { - return "", "", fmt.Errorf("pid resolution not yet implemented") + return s.resolvePidLogFilename(clusterNameID, sessionID, options.NodeID, options.PID, options.Suffix) } return "", "", fmt.Errorf("must provide one of: filename, task_id, actor_id, or pid") } +// resolvePidLogFilename resolves a log file by PID. +// It requires a nodeID and searches for a log file with a name ending in "-{pid}.{suffix}". 
+func (s *ServerHandler) resolvePidLogFilename(clusterNameID, sessionID, nodeID string, pid int, suffix string) (string, string, error) { + if nodeID == "" { + return "", "", fmt.Errorf("node_id is required for pid resolution") + } + + // The nodeID from actors/tasks is base64, but the path on storage uses hex. + nodeIDBytes, err := base64.RawURLEncoding.DecodeString(nodeID) + if err != nil { + // Try standard Base64 if URL-safe fails + nodeIDBytes, err = base64.StdEncoding.DecodeString(nodeID) + if err != nil { + return "", "", fmt.Errorf("failed to decode node_id: %w", err) + } + } + nodeIDHex := fmt.Sprintf("%x", nodeIDBytes) + + logPath := path.Join(sessionID, "logs", nodeIDHex) + files := s.reader.ListFiles(clusterNameID, logPath) + + pidSuffix := fmt.Sprintf("-%d.%s", pid, suffix) + + for _, file := range files { + if strings.HasSuffix(file, pidSuffix) { + return nodeIDHex, file, nil + } + } + + return "", "", utils.NewHTTPError(fmt.Errorf("log file not found for pid %d in path %s", pid, logPath), http.StatusNotFound) +} + // resolveTaskLogFilename resolves log file for a task by querying task events. // This mirrors Ray Dashboard's _resolve_task_filename logic. // The sessionID parameter is required for searching worker log files when task_log_info is not available. diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index 6da191b20c1..b57a85ecb1b 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -111,7 +111,7 @@ func routerAPI(s *ServerHandler) { Param(ws.QueryParameter("filename", "filename (explicit log file path)")). Param(ws.QueryParameter("task_id", "task_id (resolve log file from task)")). Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor)")). - Param(ws.QueryParameter("pid", "pid (resolve log file from process id, not yet implemented)")). + Param(ws.QueryParameter("pid", "pid (resolve log file from process id)")). 
Param(ws.QueryParameter("suffix", "suffix (out or err, default: out, used with task_id/actor_id/pid)")). Param(ws.QueryParameter("lines", "lines (number of lines to return, default: 1000)")). Param(ws.QueryParameter("timeout", "timeout")). diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index e889a244d43..0db598bed10 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -338,6 +338,47 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names "At least one actor_id should succeed. Last error: %s", lastError) }) + // Sub-test for pid parameter (live cluster) + test.T().Run("pid parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get an eligible worker PID and its node ID + pid, nodeID := getEligibleWorkerPID(g, client, historyServerURL) + LogWithTimestamp(t, "Found eligible worker PID %d on node %s for testing", pid, nodeID) + + // Test successful case + url := fmt.Sprintf("%s%s?pid=%d&node_id=%s", historyServerURL, EndpointLogFile, pid, nodeID) + resp, err := client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + g.Expect(resp.StatusCode).To(Equal(http.StatusOK), "Expected OK for valid pid and node_id, got %d: %s", resp.StatusCode, string(body)) + g.Expect(len(body)).To(BeNumerically(">", 0)) + + // Test missing node_id + url = fmt.Sprintf("%s%s?pid=%d", historyServerURL, EndpointLogFile, pid) + resp, err = client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + resp.Body.Close() + g.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) + + // Test invalid pid + url = fmt.Sprintf("%s%s?pid=abc&node_id=%s", historyServerURL, EndpointLogFile, nodeID) + resp, err = client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + resp.Body.Close() + g.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) + + // Test non-existent pid + url = 
fmt.Sprintf("%s%s?pid=999999&node_id=%s", historyServerURL, EndpointLogFile, nodeID) + resp, err = client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + body, _ = io.ReadAll(resp.Body) + resp.Body.Close() + // For a live cluster, this is proxied to the dashboard. The dashboard returns 500 for file not found. + g.Expect(resp.StatusCode).To(Equal(http.StatusInternalServerError), "Expected 500 for non-existent pid, got %d: %s", resp.StatusCode, string(body)) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Log file endpoint tests completed") } @@ -428,6 +469,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusNotFound}, {"invalid suffix", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&suffix=invalid", u, EndpointLogFile, n, filename) }, http.StatusBadRequest}, {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusBadRequest}, + {"non-existent pid", func(u, n string) string { return fmt.Sprintf("%s%s?pid=999999&node_id=%s", u, EndpointLogFile, n) }, http.StatusNotFound}, // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -636,6 +678,15 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names "At least one actor_id should succeed. Last error: %s", lastError) }) + // Sub-test for pid parameter (dead cluster) + // NOTE: This test is skipped because Ray export events don't include worker_pid. + // See: https://github.com/ray-project/ray/issues/60129 + // Worker lifecycle events are not yet exported, so we cannot obtain worker PIDs + // from historical data for dead clusters. 
+ test.T().Run("pid parameter", func(t *testing.T) { + t.Skip("Skipping pid parameter test for dead cluster: worker_pid not available in Ray export events (see https://github.com/ray-project/ray/issues/60129)") + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") } @@ -743,3 +794,60 @@ func getAllEligibleActorIDs(g *WithT, client *http.Client, historyServerURL stri return actorIDs } + +// getEligibleWorkerPID retrieves an eligible worker PID and its node ID for log testing. +// It queries the /api/v0/tasks endpoint to find any task with a valid worker_pid and node_id. +func getEligibleWorkerPID(g *WithT, client *http.Client, historyServerURL string) (pid int, nodeID string) { + resp, err := client.Get(historyServerURL + "/api/v0/tasks") + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() + + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) + + var result map[string]interface{} + err = json.Unmarshal(body, &result) + g.Expect(err).NotTo(HaveOccurred()) + + // Response format: {"result": true, "msg": "...", "data": {"result": {"result": [tasks...], ...}}} + data, ok := result["data"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "response should have 'data' field") + + dataResult, ok := data["result"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "data should have 'result' field") + + tasks, ok := dataResult["result"].([]interface{}) + g.Expect(ok).To(BeTrue(), "result should have 'result' array") + g.Expect(len(tasks)).To(BeNumerically(">", 0), "should have at least one task") + + fmt.Printf("Tasks: %+v", tasks) + + // Find a task with valid worker_pid and node_id + for _, t := range tasks { + task, ok := t.(map[string]interface{}) + if !ok { + continue + } + + // Check for worker_pid (must be non-zero) + workerPidFloat, pidOk := task["worker_pid"].(float64) + if !pidOk || workerPidFloat == 0 { + 
continue + } + + // Check for node_id (must be non-empty) + taskNodeID, nodeOk := task["node_id"].(string) + if !nodeOk || taskNodeID == "" { + continue + } + + // Found an eligible task with worker PID + return int(workerPidFloat), taskNodeID + } + + // If we get here, no eligible task was found + g.Fail("should find at least one task with valid worker_pid and node_id") + return 0, "" +} From f739e5ab9038eaa25fff6fa0d7e18c8b3aef19bc Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 09:42:22 +0800 Subject: [PATCH 15/37] docs: todo comment for submission_id Signed-off-by: machichima --- historyserver/pkg/historyserver/router.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index b57a85ecb1b..fb3f703c4dc 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -118,6 +118,16 @@ func routerAPI(s *ServerHandler) { Param(ws.QueryParameter("attempt_number", "attempt_number (task retry attempt number, default: 0)")). Param(ws.QueryParameter("download_file", "download_file (true/false)")). Param(ws.QueryParameter("filter_ansi_code", "filter_ansi_code (true/false)")). + // TODO: submission_id parameter is not currently supported. + // To support it, we need to: + // 1. Implement DRIVER_JOB_DEFINITION_EVENT processing in eventserver to store driver job info + // (including driver_node_id from the export event) + // 2. Add submission_id field to DriverJobDefinitionEvent in Ray (currently missing, tracked in + // https://github.com/ray-project/ray/issues/60129) + // 3. Create resolveSubmissionLogFilename() method to: + // - Look up driver job by submission_id + // - Get driver_node_id from stored event + // - Return filename as "job-driver-{submission_id}.log" Produces("text/plain"). 
Writes("")) // Placeholder for specific return type From e562958539113f622802f61419d5aea053cc9957 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 10:48:11 +0800 Subject: [PATCH 16/37] feat+test: add node_ip support Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 76 ++++++++++++++++++++ historyserver/pkg/historyserver/router.go | 28 ++++++-- historyserver/pkg/historyserver/types.go | 1 + historyserver/test/e2e/historyserver_test.go | 69 +++++++++++++++++- 4 files changed, 164 insertions(+), 10 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 0235ff26d26..5d3ab50590b 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -15,6 +15,7 @@ import ( "strings" "github.com/emicklei/go-restful/v3" + "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" eventtypes "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" "github.com/ray-project/kuberay/historyserver/pkg/utils" "github.com/sirupsen/logrus" @@ -437,6 +438,81 @@ func (s *ServerHandler) GetNodes(rayClusterNameID, sessionId string) ([]byte, er return json.Marshal(templ) } +// ipToNodeId resolves node_id from node_ip by querying node_events from storage. +// This mirrors Ray Dashboard's ip_to_node_id logic. +// Returns node_id in hex format if found, error otherwise. 
+func (s *ServerHandler) ipToNodeId(rayClusterNameID, sessionID, nodeIP string) (string, error) { + if nodeIP == "" { + return "", fmt.Errorf("node_ip is empty") + } + + // List all node_events files + nodeEventsPath := path.Join(sessionID, "node_events") + files := s.reader.ListFiles(rayClusterNameID, nodeEventsPath) + + // Parse each node event file to find matching node_ip + for _, file := range files { + filePath := path.Join(nodeEventsPath, file) + reader := s.reader.GetContent(rayClusterNameID, filePath) + if reader == nil { + continue + } + + data, err := io.ReadAll(reader) + if err != nil { + logrus.Warnf("Failed to read node event file %s: %v", filePath, err) + continue + } + + var events []map[string]interface{} + if err := json.Unmarshal(data, &events); err != nil { + logrus.Warnf("Failed to unmarshal node events from %s: %v", filePath, err) + continue + } + + // Search for NODE_DEFINITION_EVENT with matching node_ip + for _, event := range events { + eventType, ok := event["eventType"].(string) + if !ok || eventType != string(types.NODE_DEFINITION_EVENT) { + continue + } + + nodeDefEvent, ok := event["nodeDefinitionEvent"].(map[string]interface{}) + if !ok { + continue + } + + ipAddr, ok := nodeDefEvent["nodeIpAddress"].(string) + if !ok || ipAddr != nodeIP { + continue + } + + // Found matching node, extract node_id + nodeIDBytes, ok := nodeDefEvent["nodeId"].(string) + if !ok { + continue + } + + // Convert base64 node_id to hex (Ray stores node_id as base64 in events) + decoded, err := base64.StdEncoding.DecodeString(nodeIDBytes) + if err != nil { + // Try URL-safe base64 encoding + decoded, err = base64.RawURLEncoding.DecodeString(nodeIDBytes) + if err != nil { + logrus.Warnf("Failed to decode node_id %s: %v", nodeIDBytes, err) + continue + } + } + + nodeIDHex := fmt.Sprintf("%x", decoded) + logrus.Infof("Resolved node_ip %s to node_id %s", nodeIP, nodeIDHex) + return nodeIDHex, nil + } + } + + return "", fmt.Errorf("node_id not found for 
node_ip=%s", nodeIP) +} + // TODO: implement this func (h *ServerHandler) getGrafanaHealth(req *restful.Request, resp *restful.Response) { resp.WriteErrorString(http.StatusNotImplemented, "Grafana health not yet supported") diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index fb3f703c4dc..ac9cbdd765a 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -107,7 +107,8 @@ func routerAPI(s *ServerHandler) { Writes("")) // Placeholder for specific return type ws.Route(ws.GET("/v0/logs/file").To(s.getNodeLogFile).Filter(s.CookieHandle). Doc("get logfile"). - Param(ws.QueryParameter("node_id", "node_id (optional if task_id or actor_id is provided)")). + Param(ws.QueryParameter("node_id", "node_id (optional if node_ip/task_id/actor_id is provided)")). + Param(ws.QueryParameter("node_ip", "node_ip (optional, resolve to node_id)")). Param(ws.QueryParameter("filename", "filename (explicit log file path)")). Param(ws.QueryParameter("task_id", "task_id (resolve log file from task)")). Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor)")). 
@@ -599,6 +600,12 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } + // node_id or node_ip is required when not using actor_id or task_id (they can auto-resolve node_id) + if options.NodeID == "" && options.NodeIP == "" && options.ActorID == "" && options.TaskID == "" { + resp.WriteErrorString(http.StatusBadRequest, "node_id or node_ip is required when actor_id or task_id is not provided") + return + } + // Validate required parameters following Ray Dashboard logic // At least one of: actor_id, task_id, pid, filename, submission_id must be provided if options.ActorID == "" && options.TaskID == "" && options.PID == 0 && options.Filename == "" { @@ -606,12 +613,6 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } - // node_id is required when not using actor_id or task_id (they can auto-resolve node_id) - if options.NodeID == "" && options.ActorID == "" && options.TaskID == "" { - resp.WriteErrorString(http.StatusBadRequest, "node_id is required when actor_id or task_id is not provided") - return - } - // Prevent path traversal attacks (e.g., ../../etc/passwd) if options.NodeID != "" && !fs.ValidPath(options.NodeID) { resp.WriteErrorString(http.StatusBadRequest, fmt.Sprintf("invalid path: path traversal not allowed (node_id=%s)", options.NodeID)) @@ -622,11 +623,23 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } + // For live cluster, proxy the request directly to Ray Dashboard without any processing if sessionName == "live" { s.redirectRequest(req, resp) return } + // Only resolve node_ip to node_id from stored events for dead cluster + if options.NodeID == "" && options.NodeIP != "" { + nodeID, err := s.ipToNodeId(clusterNameID+"_"+clusterNamespace, sessionName, options.NodeIP) + if err != nil { + resp.WriteErrorString(http.StatusNotFound, + fmt.Sprintf("Cannot find matching node_id for a given node ip %s", options.NodeIP)) + return + } 
+ options.NodeID = nodeID + } + content, err := s._getNodeLogFile(clusterNameID+"_"+clusterNamespace, sessionName, options) if err != nil { var httpErr *utils.HTTPError @@ -652,6 +665,7 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo func parseGetLogFileOptions(req *restful.Request) (GetLogFileOptions, error) { options := GetLogFileOptions{ NodeID: req.QueryParameter("node_id"), + NodeIP: req.QueryParameter("node_ip"), Filename: req.QueryParameter("filename"), TaskID: req.QueryParameter("task_id"), ActorID: req.QueryParameter("actor_id"), diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go index 24c3bc057a0..0def7f3dc8d 100644 --- a/historyserver/pkg/historyserver/types.go +++ b/historyserver/pkg/historyserver/types.go @@ -4,6 +4,7 @@ package historyserver type GetLogFileOptions struct { // Node identification (one of these is required if not using task_id/actor_id) NodeID string // The node id where the log file is located + NodeIP string // The node ip address (will be resolved to node_id) // Log file identification (provide one of: Filename, TaskID, ActorID, PID) Filename string // The log file name (explicit path) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 0db598bed10..0e4f9378d6f 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -209,7 +209,7 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, // Missing mandatory parameters - {"missing node_id", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + {"missing node_id and node_ip", func(u, n string) string { 
return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, {"missing filename", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, {"missing both", func(u, n string) string { return fmt.Sprintf("%s%s", u, EndpointLogFile) }, http.StatusBadRequest}, @@ -223,6 +223,9 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusInternalServerError}, {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusInternalServerError}, + // node_ip parameter tests + {"node_ip invalid (non-existent)", func(u, n string) string { return fmt.Sprintf("%s%s?node_ip=192.168.255.255&filename=%s", u, EndpointLogFile, filename) }, http.StatusInternalServerError}, + // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, {"traversal ..", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=..", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -379,6 +382,30 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names g.Expect(resp.StatusCode).To(Equal(http.StatusInternalServerError), "Expected 500 for non-existent pid, got %d: %s", resp.StatusCode, string(body)) }) + // Sub-test for node_ip parameter (live cluster) + test.T().Run("node_ip parameter", func(t *testing.T) { + g := NewWithT(t) + + // Get node IP from head pod (use Pod IP, not Host IP) + // Ray registers nodes with Pod IP (--node-ip-address flag) + headPod, err := GetHeadPod(test, rayCluster) + g.Expect(err).NotTo(HaveOccurred()) + nodeIP := headPod.Status.PodIP + 
g.Expect(nodeIP).NotTo(BeEmpty(), "Head pod should have a pod IP") + LogWithTimestamp(t, "Found head pod with IP: %s", nodeIP) + + // Test successful case: node_ip + filename + url := fmt.Sprintf("%s%s?node_ip=%s&filename=%s", historyServerURL, EndpointLogFile, nodeIP, filename) + resp, err := client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + // For live cluster, the request is proxied to Ray Dashboard + // The dashboard should be able to resolve node_ip to node_id + g.Expect(resp.StatusCode).To(Equal(http.StatusOK), "Expected OK for valid node_ip, got %d: %s", resp.StatusCode, string(body)) + g.Expect(len(body)).To(BeNumerically(">", 0)) + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Log file endpoint tests completed") } @@ -398,8 +425,15 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names rayCluster := PrepareTestEnv(test, g, namespace, s3Client) ApplyRayJobAndWaitForCompletion(test, g, namespace, rayCluster) + // Capture node IP and ID before deleting cluster (for node_ip tests later) + headPod, err := GetHeadPod(test, rayCluster) + g.Expect(err).NotTo(HaveOccurred()) + savedNodeIP := headPod.Status.PodIP + savedNodeID := GetNodeIDFromHeadPod(test, g, rayCluster) + LogWithTimestamp(test.T(), "Captured node IP %s and node ID %s before cluster deletion", savedNodeIP, savedNodeID) + // Delete RayCluster to trigger log upload - err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{}) + err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) LogWithTimestamp(test.T(), "Deleted RayCluster %s/%s", namespace.Name, rayCluster.Name) @@ -458,7 +492,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"all parameters", func(u, n string) string { return 
fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100&timeout=15&attempt_number=0&download_file=true&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, // Missing mandatory parameters - {"missing node_id", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, + {"missing node_id and node_ip", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, {"missing filename", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, {"missing both", func(u, n string) string { return fmt.Sprintf("%s%s", u, EndpointLogFile) }, http.StatusBadRequest}, @@ -471,6 +505,9 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusBadRequest}, {"non-existent pid", func(u, n string) string { return fmt.Sprintf("%s%s?pid=999999&node_id=%s", u, EndpointLogFile, n) }, http.StatusNotFound}, + // node_ip parameter tests + {"node_ip invalid (non-existent)", func(u, n string) string { return fmt.Sprintf("%s%s?node_ip=192.168.255.255&filename=%s", u, EndpointLogFile, filename) }, http.StatusNotFound}, + // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, {"traversal ..", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=..", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -687,6 +724,32 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names t.Skip("Skipping pid parameter test for dead cluster: worker_pid not available in Ray export events (see https://github.com/ray-project/ray/issues/60129)") }) + // Sub-test for 
node_ip parameter (dead cluster) + test.T().Run("node_ip parameter", func(t *testing.T) { + g := NewWithT(t) + + // Use the captured node IP and ID from before cluster deletion + LogWithTimestamp(t, "Testing node_ip parameter with IP: %s, ID: %s", savedNodeIP, savedNodeID) + + // Test successful case: node_ip + filename + url := fmt.Sprintf("%s%s?node_ip=%s&filename=%s", historyServerURL, EndpointLogFile, savedNodeIP, filename) + resp, err := client.Get(url) + g.Expect(err).NotTo(HaveOccurred()) + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + g.Expect(resp.StatusCode).To(Equal(http.StatusOK), "Expected OK for valid node_ip, got %d: %s", resp.StatusCode, string(body)) + g.Expect(len(body)).To(BeNumerically(">", 0)) + + // Test that node_ip and node_id point to the same node (should return same content) + urlWithNodeID := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, EndpointLogFile, savedNodeID, filename) + resp2, err := client.Get(urlWithNodeID) + g.Expect(err).NotTo(HaveOccurred()) + bodyWithNodeID, _ := io.ReadAll(resp2.Body) + resp2.Body.Close() + g.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) + g.Expect(len(body)).To(Equal(len(bodyWithNodeID)), "node_ip and node_id should return same content") + }) + DeleteS3Bucket(test, g, s3Client) LogWithTimestamp(test.T(), "Dead cluster log file endpoint tests completed") } From f2f016176fd52fdab5749e8a1ddc948bb080d2d8 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:13:32 +0800 Subject: [PATCH 17/37] test: move pid invalid test to logFileTestCases Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 0e4f9378d6f..3226a43762c 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -222,9 +222,9 @@ func 
testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names // ref: https://github.com/ray-project/ray/blob/68d01c4c48a59c7768ec9c2359a1859966c446b6/python/ray/dashboard/modules/state/state_head.py#L282-L284 {"file not found", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=nonexistent.log", u, EndpointLogFile, n) }, http.StatusInternalServerError}, {"task_id invalid (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?task_id=nonexistent-task-id", u, EndpointLogFile) }, http.StatusInternalServerError}, - - // node_ip parameter tests {"node_ip invalid (non-existent)", func(u, n string) string { return fmt.Sprintf("%s%s?node_ip=192.168.255.255&filename=%s", u, EndpointLogFile, filename) }, http.StatusInternalServerError}, + {"pid invalid (string)", func(u, n string) string { return fmt.Sprintf("%s%s?pid=abc&node_id=%s", u, EndpointLogFile, n) }, http.StatusBadRequest}, + {"pid non-existent", func(u, n string) string { return fmt.Sprintf("%s%s?pid=999999&node_id=%s", u, EndpointLogFile, n) }, http.StatusInternalServerError}, // Path traversal attacks {"traversal ../etc/passwd", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=../etc/passwd", u, EndpointLogFile, n) }, http.StatusBadRequest}, @@ -364,22 +364,6 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names g.Expect(err).NotTo(HaveOccurred()) resp.Body.Close() g.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) - - // Test invalid pid - url = fmt.Sprintf("%s%s?pid=abc&node_id=%s", historyServerURL, EndpointLogFile, nodeID) - resp, err = client.Get(url) - g.Expect(err).NotTo(HaveOccurred()) - resp.Body.Close() - g.Expect(resp.StatusCode).To(Equal(http.StatusBadRequest)) - - // Test non-existent pid - url = fmt.Sprintf("%s%s?pid=999999&node_id=%s", historyServerURL, EndpointLogFile, nodeID) - resp, err = client.Get(url) - g.Expect(err).NotTo(HaveOccurred()) - body, _ = io.ReadAll(resp.Body) - 
resp.Body.Close() - // For a live cluster, this is proxied to the dashboard. The dashboard returns 500 for file not found. - g.Expect(resp.StatusCode).To(Equal(http.StatusInternalServerError), "Expected 500 for non-existent pid, got %d: %s", resp.StatusCode, string(body)) }) // Sub-test for node_ip parameter (live cluster) From e0151db22977fc5a90ee102b39168f416e7b5e94 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:14:51 +0800 Subject: [PATCH 18/37] fix: add download_filename rather than download_file flag Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 3 +++ historyserver/pkg/historyserver/router.go | 17 ++++++++++------- historyserver/pkg/historyserver/types.go | 5 +++-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 5d3ab50590b..8df3fcda48b 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -26,6 +26,9 @@ const ( // This matches Ray Dashboard API default behavior. DEFAULT_LOG_LIMIT = 1000 + // DEFAULT_DOWNLOAD_FILENAME is the default filename when download_filename is not specified + DEFAULT_DOWNLOAD_FILENAME = "file.txt" + // MAX_LOG_LIMIT is the maximum number of lines that can be requested. // Requests exceeding this limit will be capped to this value. MAX_LOG_LIMIT = 10000 diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index ac9cbdd765a..511010c8950 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -117,7 +117,7 @@ func routerAPI(s *ServerHandler) { Param(ws.QueryParameter("lines", "lines (number of lines to return, default: 1000)")). Param(ws.QueryParameter("timeout", "timeout")). Param(ws.QueryParameter("attempt_number", "attempt_number (task retry attempt number, default: 0)")). 
- Param(ws.QueryParameter("download_file", "download_file (true/false)")). + Param(ws.QueryParameter("download_filename", "download_filename (if set, triggers download with this filename)")). Param(ws.QueryParameter("filter_ansi_code", "filter_ansi_code (true/false)")). // TODO: submission_id parameter is not currently supported. // To support it, we need to: @@ -653,10 +653,12 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo return } - // Set Content-Disposition header if download_file is requested - if options.DownloadFile { - resp.AddHeader("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", options.Filename)) + // Use provided download_filename or default to DEFAULT_DOWNLOAD_FILENAME + downloadFilename := options.DownloadFilename + if downloadFilename == "" { + downloadFilename = DEFAULT_DOWNLOAD_FILENAME } + resp.AddHeader("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", downloadFilename)) resp.Write(content) } @@ -723,9 +725,10 @@ func parseGetLogFileOptions(req *restful.Request) (GetLogFileOptions, error) { options.FilterAnsiCode = strings.ToLower(filterStr) == "true" } - // Parse download_file parameter (boolean) - if downloadStr := req.QueryParameter("download_file"); downloadStr != "" { - options.DownloadFile = strings.ToLower(downloadStr) == "true" + // Parse download_filename parameter + // If provided, download with the specified filename + if downloadFilename := req.QueryParameter("download_filename"); downloadFilename != "" { + options.DownloadFilename = downloadFilename } return options, nil diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go index 0def7f3dc8d..c0751765165 100644 --- a/historyserver/pkg/historyserver/types.go +++ b/historyserver/pkg/historyserver/types.go @@ -22,8 +22,9 @@ type GetLogFileOptions struct { AttemptNumber int // Whether to filter ANSI escape codes from logs, default to False FilterAnsiCode bool - // Whether to 
set Content-Disposition header for file download, default to false - DownloadFile bool + // If set, triggers download with Content-Disposition header using this filename. + // Otherwise, using the default filename DEFAULT_DOWNLOAD_FILENAME + DownloadFilename string // The suffix of the log file ("out" or "err"), default to "out" // Used when resolving by TaskID, ActorID, or PID Suffix string From 892d41a2aedcaa5bb1735ef5ca2d405942bea4a1 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:20:08 +0800 Subject: [PATCH 19/37] refactor: Base64 to hex conversion logic to util function Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 53 ++++++++++------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 8df3fcda48b..453d4781bcf 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -43,6 +43,20 @@ func filterAnsiEscapeCodes(content []byte) []byte { return ansiEscapePattern.ReplaceAll(content, []byte("")) } +// decodeBase64ToHex decodes a Base64-encoded ID and returns its hex representation. +// It tries RawURLEncoding first (Ray's default), falling back to StdEncoding if that fails. +func decodeBase64ToHex(id string) (string, error) { + idBytes, err := base64.RawURLEncoding.DecodeString(id) + if err != nil { + // Try standard Base64 if URL-safe fails + idBytes, err = base64.StdEncoding.DecodeString(id) + if err != nil { + return "", fmt.Errorf("failed to decode Base64 ID: %w", err) + } + } + return fmt.Sprintf("%x", idBytes), nil +} + func (s *ServerHandler) listClusters(limit int) []utils.ClusterInfo { // Initial continuation marker logrus.Debugf("Prepare to get list clusters info ...") @@ -226,15 +240,10 @@ func (s *ServerHandler) resolvePidLogFilename(clusterNameID, sessionID, nodeID s } // The nodeID from actors/tasks is base64, but the path on storage uses hex. 
- nodeIDBytes, err := base64.RawURLEncoding.DecodeString(nodeID) + nodeIDHex, err := decodeBase64ToHex(nodeID) if err != nil { - // Try standard Base64 if URL-safe fails - nodeIDBytes, err = base64.StdEncoding.DecodeString(nodeID) - if err != nil { - return "", "", fmt.Errorf("failed to decode node_id: %w", err) - } + return "", "", fmt.Errorf("failed to decode node_id: %w", err) } - nodeIDHex := fmt.Sprintf("%x", nodeIDBytes) logPath := path.Join(sessionID, "logs", nodeIDHex) files := s.reader.ListFiles(clusterNameID, logPath) @@ -379,26 +388,16 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) { // Convert Base64 node_id to hex for the file path // Ray stores IDs in Base64 (URL-safe) in events, but uses hex in log directory structure - nodeIDBytes, err := base64.RawURLEncoding.DecodeString(nodeID) + nodeIDHex, err := decodeBase64ToHex(nodeID) if err != nil { - // Try standard Base64 if URL-safe fails - nodeIDBytes, err = base64.StdEncoding.DecodeString(nodeID) - if err != nil { - return "", "", fmt.Errorf("failed to decode node_id: %w", err) - } + return "", "", fmt.Errorf("failed to decode node_id: %w", err) } - nodeIDHex := fmt.Sprintf("%x", nodeIDBytes) // Convert Base64 worker_id to hex - workerIDBytes, err := base64.RawURLEncoding.DecodeString(workerID) + workerIDHex, err := decodeBase64ToHex(workerID) if err != nil { - // Try standard Base64 if URL-safe fails - workerIDBytes, err = base64.StdEncoding.DecodeString(workerID) - if err != nil { - return "", "", fmt.Errorf("failed to decode worker_id: %w", err) - } + return "", "", fmt.Errorf("failed to decode worker_id: %w", err) } - workerIDHex := fmt.Sprintf("%x", workerIDBytes) // List all files in the node's log directory logPath := path.Join(sessionID, "logs", nodeIDHex) @@ -497,17 +496,11 @@ func (s *ServerHandler) ipToNodeId(rayClusterNameID, 
sessionID, nodeIP string) ( } // Convert base64 node_id to hex (Ray stores node_id as base64 in events) - decoded, err := base64.StdEncoding.DecodeString(nodeIDBytes) + nodeIDHex, err := decodeBase64ToHex(nodeIDBytes) if err != nil { - // Try URL-safe base64 encoding - decoded, err = base64.RawURLEncoding.DecodeString(nodeIDBytes) - if err != nil { - logrus.Warnf("Failed to decode node_id %s: %v", nodeIDBytes, err) - continue - } + logrus.Warnf("Failed to decode node_id %s: %v", nodeIDBytes, err) + continue } - - nodeIDHex := fmt.Sprintf("%x", decoded) logrus.Infof("Resolved node_ip %s to node_id %s", nodeIP, nodeIDHex) return nodeIDHex, nil } From d25446322a8162d43afed30761dd6ef9a25f50c4 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:27:10 +0800 Subject: [PATCH 20/37] fix: skip convert to hex if already is Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 453d4781bcf..8a8cc238f7b 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -43,9 +43,18 @@ func filterAnsiEscapeCodes(content []byte) []byte { return ansiEscapePattern.ReplaceAll(content, []byte("")) } -// decodeBase64ToHex decodes a Base64-encoded ID and returns its hex representation. +// decodeBase64ToHex converts an ID to hex format. +// Handles both cases: +// 1. Already hex format - returns as-is +// 2. Base64-encoded - decodes to hex // It tries RawURLEncoding first (Ray's default), falling back to StdEncoding if that fails. 
func decodeBase64ToHex(id string) (string, error) { + // Check if already hex (only [0-9a-f]) + if matched, _ := regexp.MatchString("^[0-9a-f]+$", id); matched { + return id, nil + } + + // Try base64 decode idBytes, err := base64.RawURLEncoding.DecodeString(id) if err != nil { // Try standard Base64 if URL-safe fails @@ -239,7 +248,7 @@ func (s *ServerHandler) resolvePidLogFilename(clusterNameID, sessionID, nodeID s return "", "", fmt.Errorf("node_id is required for pid resolution") } - // The nodeID from actors/tasks is base64, but the path on storage uses hex. + // Convert to hex if not already is nodeIDHex, err := decodeBase64ToHex(nodeID) if err != nil { return "", "", fmt.Errorf("failed to decode node_id: %w", err) @@ -386,8 +395,7 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI // Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} // Returns (nodeIDHex, filename, error). func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) { - // Convert Base64 node_id to hex for the file path - // Ray stores IDs in Base64 (URL-safe) in events, but uses hex in log directory structure + // Convert to hex if not already is nodeIDHex, err := decodeBase64ToHex(nodeID) if err != nil { return "", "", fmt.Errorf("failed to decode node_id: %w", err) @@ -495,7 +503,7 @@ func (s *ServerHandler) ipToNodeId(rayClusterNameID, sessionID, nodeIP string) ( continue } - // Convert base64 node_id to hex (Ray stores node_id as base64 in events) + // Convert to hex if not already is nodeIDHex, err := decodeBase64ToHex(nodeIDBytes) if err != nil { logrus.Warnf("Failed to decode node_id %s: %v", nodeIDBytes, err) From ccd8243af37886bb07566024ea2a6990ed104100 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:29:53 +0800 Subject: [PATCH 21/37] fix: close reader to prevent connection leak Signed-off-by: machichima --- 
historyserver/pkg/historyserver/reader.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 8a8cc238f7b..73f027414ed 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -468,6 +468,10 @@ func (s *ServerHandler) ipToNodeId(rayClusterNameID, sessionID, nodeIP string) ( continue } + if closer, ok := reader.(io.Closer); ok { + defer closer.Close() + } + data, err := io.ReadAll(reader) if err != nil { logrus.Warnf("Failed to read node event file %s: %v", filePath, err) From f4dfefb13dfa7c026fa15a32a22c8ff275791467 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:31:39 +0800 Subject: [PATCH 22/37] refactor: remove duplicate import Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 73f027414ed..81b816f90d8 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -15,7 +15,6 @@ import ( "strings" "github.com/emicklei/go-restful/v3" - "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" eventtypes "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" "github.com/ray-project/kuberay/historyserver/pkg/utils" "github.com/sirupsen/logrus" @@ -487,7 +486,7 @@ func (s *ServerHandler) ipToNodeId(rayClusterNameID, sessionID, nodeIP string) ( // Search for NODE_DEFINITION_EVENT with matching node_ip for _, event := range events { eventType, ok := event["eventType"].(string) - if !ok || eventType != string(types.NODE_DEFINITION_EVENT) { + if !ok || eventType != string(eventtypes.NODE_DEFINITION_EVENT) { continue } From a6b76ad63bfdbeb5491bcd32d3851b7a07607889 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:49:43 +0800 Subject: [PATCH 23/37] 
test: fix and refactor test Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 137 ++++++++----------- 1 file changed, 58 insertions(+), 79 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 3226a43762c..698ca40696c 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -459,9 +459,8 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names {"attempt_number=0", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", u, EndpointLogFile, n, filename) }, http.StatusOK}, {"attempt_number=1 (not found)", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", u, EndpointLogFile, n, filename) }, http.StatusNotFound}, - // download_file parameter - {"download_file=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, - {"download_file=false", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=false", u, EndpointLogFile, n, filename) }, http.StatusOK}, + // download_filename parameter + {"download_filename=custom.log", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_filename=custom.log", u, EndpointLogFile, n, filename) }, http.StatusOK}, // filter_ansi_code parameter {"filter_ansi_code=true", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, @@ -473,7 +472,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names // Combined parameters {"lines+timeout+filter", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=50&timeout=10&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, 
http.StatusOK}, - {"all parameters", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100&timeout=15&attempt_number=0&download_file=true&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, + {"all parameters", func(u, n string) string { return fmt.Sprintf("%s%s?node_id=%s&filename=%s&lines=100&timeout=15&attempt_number=0&download_filename=custom.log&filter_ansi_code=true", u, EndpointLogFile, n, filename) }, http.StatusOK}, // Missing mandatory parameters {"missing node_id and node_ip", func(u, n string) string { return fmt.Sprintf("%s%s?filename=%s", u, EndpointLogFile, filename) }, http.StatusBadRequest}, @@ -530,95 +529,79 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names } // Sub-tests for specific parameter behaviors - test.T().Run("download_file header validation", func(t *testing.T) { + test.T().Run("download_filename header validation", func(t *testing.T) { g := NewWithT(t) - g.Eventually(func(gg Gomega) { - // Test with download_file=true - urlWithDownload := fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=true", historyServerURL, EndpointLogFile, nodeID, filename) - resp, err := client.Get(urlWithDownload) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() - - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) - contentDisposition := resp.Header.Get("Content-Disposition") - gg.Expect(contentDisposition).To(ContainSubstring("attachment")) - gg.Expect(contentDisposition).To(ContainSubstring(fmt.Sprintf("filename=\"%s\"", filename))) - LogWithTimestamp(test.T(), "download_file=true sets Content-Disposition header: %s", contentDisposition) - - // Test with download_file=false, should not have Content-Disposition header - urlWithoutDownload := fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_file=false", historyServerURL, EndpointLogFile, nodeID, filename) - resp2, err := client.Get(urlWithoutDownload) - gg.Expect(err).NotTo(HaveOccurred()) - defer 
resp2.Body.Close() + // Test with download_filename parameter set + customFilename := "custom_download.log" + urlWithDownload := fmt.Sprintf("%s%s?node_id=%s&filename=%s&download_filename=%s", historyServerURL, EndpointLogFile, nodeID, filename, customFilename) + resp, err := client.Get(urlWithDownload) + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() - gg.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) - contentDisposition2 := resp2.Header.Get("Content-Disposition") - gg.Expect(contentDisposition2).To(BeEmpty(), "Content-Disposition should not be set when download_file=false") - }, TestTimeoutShort).Should(Succeed()) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + contentDisposition := resp.Header.Get("Content-Disposition") + g.Expect(contentDisposition).To(ContainSubstring("attachment")) + g.Expect(contentDisposition).To(ContainSubstring(fmt.Sprintf("filename=\"%s\"", customFilename))) }) test.T().Run("filter_ansi_code behavior", func(t *testing.T) { g := NewWithT(t) - g.Eventually(func(gg Gomega) { - // Fetch with filter_ansi_code=false (original content with ANSI codes) - urlWithoutFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) - resp, err := client.Get(urlWithoutFilter) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() + // Fetch with filter_ansi_code=false (original content with ANSI codes) + urlWithoutFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=false&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) + resp, err := client.Get(urlWithoutFilter) + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) - bodyWithoutFilter, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + bodyWithoutFilter, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) - // 
Fetch with filter_ansi_code=true (ANSI codes should be removed) - urlWithFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) - resp2, err := client.Get(urlWithFilter) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp2.Body.Close() + // Fetch with filter_ansi_code=true (ANSI codes should be removed) + urlWithFilter := fmt.Sprintf("%s%s?node_id=%s&filename=%s&filter_ansi_code=true&lines=100", historyServerURL, EndpointLogFile, nodeID, filename) + resp2, err := client.Get(urlWithFilter) + g.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() - gg.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) - bodyWithFilter, err := io.ReadAll(resp2.Body) - gg.Expect(err).NotTo(HaveOccurred()) + g.Expect(resp2.StatusCode).To(Equal(http.StatusOK)) + bodyWithFilter, err := io.ReadAll(resp2.Body) + g.Expect(err).NotTo(HaveOccurred()) - // Check if original content contains ANSI codes using the same pattern as reader.go - hasAnsiInOriginal := ansiEscapePattern.Match(bodyWithoutFilter) + // Check if original content contains ANSI codes using the same pattern as reader.go + hasAnsiInOriginal := ansiEscapePattern.Match(bodyWithoutFilter) - if hasAnsiInOriginal { - LogWithTimestamp(test.T(), "Original log contains ANSI codes, verifying they are filtered") - // Filtered content should NOT contain ANSI escape sequences - hasAnsiInFiltered := ansiEscapePattern.Match(bodyWithFilter) - gg.Expect(hasAnsiInFiltered).To(BeFalse(), "Filtered content should not contain ANSI escape sequences") - } else { - LogWithTimestamp(test.T(), "Log doesn't contain ANSI codes, check is skipped...") - } - }, TestTimeoutShort).Should(Succeed()) + if hasAnsiInOriginal { + LogWithTimestamp(test.T(), "Original log contains ANSI codes, verifying they are filtered") + // Filtered content should NOT contain ANSI escape sequences + hasAnsiInFiltered := ansiEscapePattern.Match(bodyWithFilter) + 
g.Expect(hasAnsiInFiltered).To(BeFalse(), "Filtered content should not contain ANSI escape sequences") + } else { + LogWithTimestamp(test.T(), "Log doesn't contain ANSI codes, check is skipped...") + } }) test.T().Run("attempt_number behavior", func(t *testing.T) { g := NewWithT(t) - g.Eventually(func(gg Gomega) { - // Test with attempt_number=0 - urlAttempt0 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", historyServerURL, EndpointLogFile, nodeID, filename) - resp, err := client.Get(urlAttempt0) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp.Body.Close() + // Test with attempt_number=0 + urlAttempt0 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=0", historyServerURL, EndpointLogFile, nodeID, filename) + resp, err := client.Get(urlAttempt0) + g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() - gg.Expect(resp.StatusCode).To(Equal(http.StatusOK)) - body, err := io.ReadAll(resp.Body) - gg.Expect(err).NotTo(HaveOccurred()) - gg.Expect(len(body)).To(BeNumerically(">", 0)) - LogWithTimestamp(test.T(), "attempt_number=0 returned %d bytes", len(body)) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) + body, err := io.ReadAll(resp.Body) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(len(body)).To(BeNumerically(">", 0)) + LogWithTimestamp(test.T(), "attempt_number=0 returned %d bytes", len(body)) - // attempt_number=1 should fail as retry log doesn't exist for normal execution - urlAttempt1 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", historyServerURL, EndpointLogFile, nodeID, filename) - resp2, err := client.Get(urlAttempt1) - gg.Expect(err).NotTo(HaveOccurred()) - defer resp2.Body.Close() + // attempt_number=1 should fail as retry log doesn't exist for normal execution + urlAttempt1 := fmt.Sprintf("%s%s?node_id=%s&filename=%s&attempt_number=1", historyServerURL, EndpointLogFile, nodeID, filename) + resp2, err := client.Get(urlAttempt1) + g.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() - 
gg.Expect(resp2.StatusCode).To(Equal(http.StatusNotFound), - "attempt_number=1 should return 404 when retry log doesn't exist") - LogWithTimestamp(test.T(), "attempt_number=1 correctly returns 404 (file not found)") - }, TestTimeoutShort).Should(Succeed()) + g.Expect(resp2.StatusCode).To(Equal(http.StatusNotFound), + "attempt_number=1 should return 404 when retry log doesn't exist") + LogWithTimestamp(test.T(), "attempt_number=1 correctly returns 404 (file not found)") }) // Sub-test for task_id parameter @@ -835,8 +818,6 @@ func getAllEligibleActorIDs(g *WithT, client *http.Client, historyServerURL stri actorIDs = append(actorIDs, actorID) } - fmt.Printf("Actor IDs: %+v", actorIDs) - g.Expect(len(actorIDs)).To(BeNumerically(">", 0), "should have at least one eligible actor") return actorIDs @@ -869,8 +850,6 @@ func getEligibleWorkerPID(g *WithT, client *http.Client, historyServerURL string g.Expect(ok).To(BeTrue(), "result should have 'result' array") g.Expect(len(tasks)).To(BeNumerically(">", 0), "should have at least one task") - fmt.Printf("Tasks: %+v", tasks) - // Find a task with valid worker_pid and node_id for _, t := range tasks { task, ok := t.(map[string]interface{}) From 4b7a56b185ea25faae86de52aee9a36b12a66312 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 16:58:24 +0800 Subject: [PATCH 24/37] fix: append suffix when deal with filename Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 81b816f90d8..2ec395ed185 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -127,12 +127,6 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti // Build log path logPath := path.Join(sessionID, "logs", nodeID, filename) - // Append attempt_number if specified and not using task_id 
- // (task_id already includes attempt_number in resolution) - if options.AttemptNumber > 0 && options.TaskID == "" { - logPath = fmt.Sprintf("%s.%d", logPath, options.AttemptNumber) - } - reader := s.reader.GetContent(rayClusterNameID, logPath) if reader == nil { @@ -219,7 +213,12 @@ func (s *ServerHandler) resolveLogFilename(clusterNameID, sessionID string, opti if options.NodeID == "" { return "", "", fmt.Errorf("node_id is required when filename is provided") } - return options.NodeID, options.Filename, nil + filename := options.Filename + // Append attempt_number if specified for explicit filenames + if options.AttemptNumber > 0 { + filename = fmt.Sprintf("%s.%d", filename, options.AttemptNumber) + } + return options.NodeID, filename, nil } // If task_id is provided, resolve from task events From 4bd16dadace58f0929d3716fd091ee792754856d Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 17:01:13 +0800 Subject: [PATCH 25/37] fix: remove duplicate suffix validation Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 2ec395ed185..64ff7bb1b1c 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -203,11 +203,6 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, opti // This mirrors Ray Dashboard's resolve_filename logic. // The sessionID parameter is required for task_id resolution to search worker log files. 
func (s *ServerHandler) resolveLogFilename(clusterNameID, sessionID string, options GetLogFileOptions) (nodeID, filename string, err error) { - // Validate suffix - if options.Suffix != "out" && options.Suffix != "err" { - return "", "", fmt.Errorf("invalid suffix: %s (must be 'out' or 'err')", options.Suffix) - } - // If filename is explicitly provided, use it and ignore suffix if options.Filename != "" { if options.NodeID == "" { From f8dd9eeee4747b958c3436fb3ccb4dfaa9857fcb Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 17:17:10 +0800 Subject: [PATCH 26/37] refactor: remove not yet implemented comment Signed-off-by: machichima --- historyserver/pkg/historyserver/types.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/types.go b/historyserver/pkg/historyserver/types.go index c0751765165..fcb65009599 100644 --- a/historyserver/pkg/historyserver/types.go +++ b/historyserver/pkg/historyserver/types.go @@ -9,8 +9,8 @@ type GetLogFileOptions struct { // Log file identification (provide one of: Filename, TaskID, ActorID, PID) Filename string // The log file name (explicit path) TaskID string // Task ID to resolve log file - ActorID string // Actor ID to resolve log file (not yet implemented) - PID int // Process ID to resolve log file (not yet implemented) + ActorID string // Actor ID to resolve log file + PID int // Process ID to resolve log file // Optional parameters with defaults // Number of lines to return, default to DEFAULT_LOG_LIMIT (1000) From fe49c92fe3b1783178da1e005caa600e3554ce17 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 1 Feb 2026 20:19:30 +0800 Subject: [PATCH 27/37] feat+test: add logs/stream endpoint Signed-off-by: machichima --- historyserver/pkg/historyserver/router.go | 25 ++++++ historyserver/test/e2e/historyserver_test.go | 83 +++++++++++++++++++- 2 files changed, 106 insertions(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/router.go 
b/historyserver/pkg/historyserver/router.go index 511010c8950..8fea834faf6 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -131,6 +131,18 @@ func routerAPI(s *ServerHandler) { // - Return filename as "job-driver-{submission_id}.log" Produces("text/plain"). Writes("")) // Placeholder for specific return type + ws.Route(ws.GET("/v0/logs/stream").To(s.getNodeLogStream).Filter(s.CookieHandle). + Doc("stream logs in real-time"). + Param(ws.QueryParameter("node_id", "node_id (optional if node_ip/task_id/actor_id is provided)")). + Param(ws.QueryParameter("node_ip", "node_ip (optional, resolve to node_id)")). + Param(ws.QueryParameter("filename", "filename (explicit log file path)")). + Param(ws.QueryParameter("task_id", "task_id (resolve log file from task)")). + Param(ws.QueryParameter("actor_id", "actor_id (resolve log file from actor)")). + Param(ws.QueryParameter("pid", "pid (resolve log file from process id)")). + Param(ws.QueryParameter("suffix", "suffix (out or err, default: out, used with task_id/actor_id/pid)")). + Param(ws.QueryParameter("interval", "interval (polling interval in seconds)")). + Produces("text/event-stream"). + Writes("")) // Placeholder for specific return type ws.Route(ws.GET("/v0/tasks").To(s.getTaskDetail).Filter(s.CookieHandle). Doc("get task detail "). 
@@ -734,6 +746,19 @@ func parseGetLogFileOptions(req *restful.Request) (GetLogFileOptions, error) { return options, nil } +func (s *ServerHandler) getNodeLogStream(req *restful.Request, resp *restful.Response) { + sessionName := req.Attribute(COOKIE_SESSION_NAME_KEY).(string) + + // Streaming only available for live clusters + if sessionName != "live" { + resp.WriteErrorString(http.StatusNotImplemented, "Log streaming only available for live clusters") + return + } + + // Forward to Ray Dashboard + s.redirectRequest(req, resp) +} + func (s *ServerHandler) getTaskSummarize(req *restful.Request, resp *restful.Response) { clusterName := req.Attribute(COOKIE_CLUSTER_NAME_KEY).(string) clusterNamespace := req.Attribute(COOKIE_CLUSTER_NAMESPACE_KEY).(string) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 698ca40696c..2cc70dbda54 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -20,8 +20,9 @@ import ( ) const ( - LiveSessionName = "live" - EndpointLogFile = "/api/v0/logs/file" + LiveSessionName = "live" + EndpointLogFile = "/api/v0/logs/file" + EndpointLogStream = "/api/v0/logs/stream" ) // ansiEscapePattern matches ANSI escape sequences (same pattern as in reader.go) @@ -48,6 +49,10 @@ func TestHistoryServer(t *testing.T) { name: "/v0/logs/file endpoint (dead cluster)", testFunc: testLogFileEndpointDeadCluster, }, + { + name: "/v0/logs/stream endpoint", + testFunc: testLogStreamEndpoint, + }, } for _, tt := range tests { @@ -877,3 +882,77 @@ func getEligibleWorkerPID(g *WithT, client *http.Client, historyServerURL string g.Fail("should find at least one task with valid worker_pid and node_id") return 0, "" } + +// testLogStreamEndpoint verifies the /v0/logs/stream endpoint behavior for both live and dead clusters. +// +// The test case follows these steps: +// 1. Prepare test environment by applying a Ray cluster +// 2. 
Submit a Ray job to the existing cluster +// 3. Apply History Server and get its URL +// 4. Test live cluster: streaming should work (return 200) +// 5. Delete cluster to test dead cluster behavior +// 6. Test dead cluster: streaming should return 501 Not Implemented +// 7. Delete S3 bucket to ensure test isolation +func testLogStreamEndpoint(test Test, g *WithT, namespace *corev1.Namespace, s3Client *s3.S3) { + rayCluster := PrepareTestEnv(test, g, namespace, s3Client) + ApplyRayJobAndWaitForCompletion(test, g, namespace, rayCluster) + ApplyHistoryServer(test, g, namespace) + historyServerURL := GetHistoryServerURL(test, g, namespace) + + // Test 1: Live cluster - streaming should work + LogWithTimestamp(test.T(), "Testing /v0/logs/stream for live cluster") + clusterInfo := getClusterFromList(test, g, historyServerURL, rayCluster.Name, namespace.Name) + g.Expect(clusterInfo.SessionName).To(Equal(LiveSessionName)) + + client := CreateHTTPClientWithCookieJar(g) + setClusterContext(test, g, client, historyServerURL, namespace.Name, rayCluster.Name, clusterInfo.SessionName) + + nodeID := GetOneOfNodeID(g, client, historyServerURL) + filename := "raylet.out" + streamURL := fmt.Sprintf("%s%s?node_id=%s&filename=%s", historyServerURL, EndpointLogStream, nodeID, filename) + + // Test live cluster streaming endpoint + test.T().Run("live cluster", func(t *testing.T) { + g := NewWithT(t) + + resp, err := client.Get(streamURL) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(resp.StatusCode).To(Equal(http.StatusOK), "Live cluster should support log streaming") + + resp.Body.Close() + }) + + // Delete RayCluster to test dead cluster behavior + err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Deleted RayCluster %s/%s", namespace.Name, rayCluster.Name) + + // Wait for cluster to be fully deleted + g.Eventually(func() error { + _, err := 
GetRayCluster(test, namespace.Name, rayCluster.Name) + return err + }, TestTimeoutMedium).Should(WithTransform(k8serrors.IsNotFound, BeTrue())) + + // Get the new cluster info + deadClusterInfo := getClusterFromList(test, g, historyServerURL, rayCluster.Name, namespace.Name) + g.Expect(deadClusterInfo.SessionName).NotTo(Equal(LiveSessionName)) + + // Set context for dead cluster + setClusterContext(test, g, client, historyServerURL, namespace.Name, rayCluster.Name, deadClusterInfo.SessionName) + + // Test dead cluster streaming endpoint - should return 501 + test.T().Run("dead cluster", func(t *testing.T) { + g := NewWithT(t) + resp2, err := client.Get(streamURL) + g.Expect(err).NotTo(HaveOccurred()) + defer resp2.Body.Close() + + g.Expect(resp2.StatusCode).To(Equal(http.StatusNotImplemented), "Dead cluster should return 501 for log streaming") + body, _ := io.ReadAll(resp2.Body) + g.Expect(string(body)).To(ContainSubstring("Log streaming only available for live clusters")) + LogWithTimestamp(test.T(), "Dead cluster correctly returns 501 Not Implemented") + }) + + DeleteS3Bucket(test, g, s3Client) + LogWithTimestamp(test.T(), "Log stream endpoint tests completed") +} From 200d56c3b317ee7025cdb1c50f97613251e56fd7 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 2 Feb 2026 19:59:13 +0800 Subject: [PATCH 28/37] fix: remove redundant status code check Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 2cc70dbda54..8e9b3679f54 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -259,9 +259,6 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names tc.name, tc.expectedStatus, resp.StatusCode, string(body)) } - g.Expect(resp.StatusCode).To(Equal(tc.expectedStatus), - "Test case '%s' failed: expected %d, got %d", 
tc.name, tc.expectedStatus, resp.StatusCode) - if tc.expectedStatus == http.StatusOK { g.Expect(len(body)).To(BeNumerically(">", 0)) } From c41d87436688f2020081f9c1f9a047f93810ec4d Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 18:51:08 +0800 Subject: [PATCH 29/37] fix: update format of worker log in comment Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index 64ff7bb1b1c..e7fe9ccf32c 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -320,15 +320,12 @@ func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID } // Fallback: Find worker log file by worker_id - // Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} - // We need to search for files matching this pattern if sessionID == "" { return "", "", fmt.Errorf( "task %s (attempt %d) has no task_log_info and sessionID is required to search for worker log files", taskID, attemptNumber, ) } - nodeIDHex, logFilename, err := s.findWorkerLogFile(clusterNameID, sessionID, foundTask.NodeID, foundTask.WorkerID, suffix) if err != nil { return "", "", fmt.Errorf( @@ -366,7 +363,6 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI } // Find worker log file by worker_id - // Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} nodeIDHex, logFilename, err := s.findWorkerLogFile( clusterNameID, sessionID, @@ -385,7 +381,8 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI } // findWorkerLogFile searches for a worker log file by worker_id. 
-// Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix} +// Worker log files follow the pattern: worker-{worker_id_hex}-{job_id_hex}-{pid}.{suffix} +// Ref: https://github.com/ray-project/ray/blob/219ee7037bbdc02f66b58a814c9ad2618309c19e/src/ray/core_worker/core_worker_process.cc#L80-L80 // Returns (nodeIDHex, filename, error). func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) { // Convert to hex if not already is From d9b7dbaee56a8139aa378cc1b4a92e8c2c8b7b5b Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:04:49 +0800 Subject: [PATCH 30/37] feat: more robust ConvertBase64ToHex and centralize the logic put in utils and use in reader Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 34 +++-------------------- historyserver/pkg/utils/utils.go | 28 +++++++++++++------ 2 files changed, 24 insertions(+), 38 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index d9c45f39603..eb12c17d739 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -3,7 +3,6 @@ package historyserver import ( "bufio" "context" - "encoding/base64" "encoding/json" "errors" "fmt" @@ -14,9 +13,7 @@ import ( "sort" "strings" - "github.com/emicklei/go-restful/v3" eventtypes "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types" - "github.com/ray-project/kuberay/historyserver/pkg/utils" "github.com/sirupsen/logrus" "github.com/ray-project/kuberay/historyserver/pkg/utils" @@ -44,29 +41,6 @@ func filterAnsiEscapeCodes(content []byte) []byte { return ansiEscapePattern.ReplaceAll(content, []byte("")) } -// decodeBase64ToHex converts an ID to hex format. -// Handles both cases: -// 1. Already hex format - returns as-is -// 2. 
Base64-encoded - decodes to hex -// It tries RawURLEncoding first (Ray's default), falling back to StdEncoding if that fails. -func decodeBase64ToHex(id string) (string, error) { - // Check if already hex (only [0-9a-f]) - if matched, _ := regexp.MatchString("^[0-9a-f]+$", id); matched { - return id, nil - } - - // Try base64 decode - idBytes, err := base64.RawURLEncoding.DecodeString(id) - if err != nil { - // Try standard Base64 if URL-safe fails - idBytes, err = base64.StdEncoding.DecodeString(id) - if err != nil { - return "", fmt.Errorf("failed to decode Base64 ID: %w", err) - } - } - return fmt.Sprintf("%x", idBytes), nil -} - func (s *ServerHandler) listClusters(limit int) []utils.ClusterInfo { // Initial continuation marker logrus.Debugf("Prepare to get list clusters info ...") @@ -244,7 +218,7 @@ func (s *ServerHandler) resolvePidLogFilename(clusterNameID, sessionID, nodeID s } // Convert to hex if not already is - nodeIDHex, err := decodeBase64ToHex(nodeID) + nodeIDHex, err := utils.ConvertBase64ToHex(nodeID) if err != nil { return "", "", fmt.Errorf("failed to decode node_id: %w", err) } @@ -388,13 +362,13 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI // Returns (nodeIDHex, filename, error). 
func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) { // Convert to hex if not already is - nodeIDHex, err := decodeBase64ToHex(nodeID) + nodeIDHex, err := utils.ConvertBase64ToHex(nodeID) if err != nil { return "", "", fmt.Errorf("failed to decode node_id: %w", err) } // Convert Base64 worker_id to hex - workerIDHex, err := decodeBase64ToHex(workerID) + workerIDHex, err := utils.ConvertBase64ToHex(workerID) if err != nil { return "", "", fmt.Errorf("failed to decode worker_id: %w", err) } @@ -500,7 +474,7 @@ func (s *ServerHandler) ipToNodeId(rayClusterNameID, sessionID, nodeIP string) ( } // Convert to hex if not already is - nodeIDHex, err := decodeBase64ToHex(nodeIDBytes) + nodeIDHex, err := utils.ConvertBase64ToHex(nodeIDBytes) if err != nil { logrus.Warnf("Failed to decode node_id %s: %v", nodeIDBytes, err) continue diff --git a/historyserver/pkg/utils/utils.go b/historyserver/pkg/utils/utils.go index e769c88141c..8f0c59a828b 100644 --- a/historyserver/pkg/utils/utils.go +++ b/historyserver/pkg/utils/utils.go @@ -3,10 +3,10 @@ package utils import ( "bytes" "encoding/base64" - "encoding/hex" "fmt" "os" "path" + "regexp" "strings" "time" @@ -243,15 +243,27 @@ func GetRayNodeID() (string, error) { return "", fmt.Errorf("timeout --node_id= not found") } -func ConvertBase64ToHex(input string) (string, error) { - bytes, err := base64.StdEncoding.DecodeString(input) - if err != nil { - return input, err +// ConvertBase64ToHex converts an ID to hex format. +// Handles both cases: +// 1. Already hex format - returns as-is +// 2. Base64-encoded - decodes to hex +// It tries RawURLEncoding first (Ray's default), falling back to StdEncoding if that fails. 
+func ConvertBase64ToHex(id string) (string, error) { + // Check if already hex (only [0-9a-f]) + if matched, _ := regexp.MatchString("^[0-9a-f]+$", id); matched { + return id, nil } - hexStr := hex.EncodeToString(bytes) - - return hexStr, nil + // Try base64 decode + idBytes, err := base64.RawURLEncoding.DecodeString(id) + if err != nil { + // Try standard Base64 if URL-safe fails + idBytes, err = base64.StdEncoding.DecodeString(id) + if err != nil { + return "", fmt.Errorf("failed to decode Base64 ID: %w", err) + } + } + return fmt.Sprintf("%x", idBytes), nil } // BuildClusterSessionKey constructs the key used to identify a specific cluster session. From f1799d19ce360f1f925d6c696f82be837fc9b4e9 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:19:38 +0800 Subject: [PATCH 31/37] fix: return original id if cannot decode Signed-off-by: machichima --- historyserver/pkg/utils/utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/historyserver/pkg/utils/utils.go b/historyserver/pkg/utils/utils.go index 8f0c59a828b..9cab5f36c90 100644 --- a/historyserver/pkg/utils/utils.go +++ b/historyserver/pkg/utils/utils.go @@ -260,7 +260,7 @@ func ConvertBase64ToHex(id string) (string, error) { // Try standard Base64 if URL-safe fails idBytes, err = base64.StdEncoding.DecodeString(id) if err != nil { - return "", fmt.Errorf("failed to decode Base64 ID: %w", err) + return id, fmt.Errorf("failed to decode Base64 ID: %w", err) } } return fmt.Sprintf("%x", idBytes), nil From e848198f574cc415cbb38fd967b95ca82f639451 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:38:44 +0800 Subject: [PATCH 32/37] fix: escape filename correctly Signed-off-by: machichima --- historyserver/pkg/historyserver/router.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index 5526e088f64..3f9cc013ca4 100644 --- 
a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "io/fs" + "mime" "net/http" "strconv" "strings" @@ -765,7 +766,19 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo if downloadFilename == "" { downloadFilename = DEFAULT_DOWNLOAD_FILENAME } - resp.AddHeader("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", downloadFilename)) + // Format filename correct to escape " or \ + disposition := mime.FormatMediaType("attachment", map[string]string{ + "filename": downloadFilename, + }) + + if disposition == "" { + logrus.Errorf("Failed to format Content-Disposition header for filename %q: %v", downloadFilename, err) + + // Fallback to the default filename + disposition = fmt.Sprintf("attachment; filename=\"%s\"", DEFAULT_DOWNLOAD_FILENAME) + } + + resp.AddHeader("Content-Disposition", disposition) resp.Write(content) } From 4e30f185e3194a56a36672f427325eb10ad3f3a0 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:39:19 +0800 Subject: [PATCH 33/37] fix: add sessionID == "" check in resolveActorLogFilename Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index eb12c17d739..b0d955c4ea3 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -338,6 +338,13 @@ func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorI ) } + if sessionID == "" { + return "", "", fmt.Errorf( + "sessionID is required to search for worker log files for actor %s", + actorID, + ) + } + // Find worker log file by worker_id nodeIDHex, logFilename, err := s.findWorkerLogFile( clusterNameID, From 1432037b67e911fa2d5a03b47423ffc864ea819e Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:53:44 +0800 Subject: [PATCH 
34/37] fix: use correct cluster name to query task and actor id Signed-off-by: machichima --- historyserver/pkg/historyserver/reader.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/reader.go b/historyserver/pkg/historyserver/reader.go index b0d955c4ea3..25ecc437240 100644 --- a/historyserver/pkg/historyserver/reader.go +++ b/historyserver/pkg/historyserver/reader.go @@ -241,8 +241,13 @@ func (s *ServerHandler) resolvePidLogFilename(clusterNameID, sessionID, nodeID s // This mirrors Ray Dashboard's _resolve_task_filename logic. // The sessionID parameter is required for searching worker log files when task_log_info is not available. func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID string, attemptNumber int, suffix string) (nodeID, filename string, err error) { + // Construct full cluster session key for event lookup + // We append the sessionID to the clusterNameID (which is "name_namespace") + // to match the key format used by utils.BuildClusterSessionKey. + fullKey := fmt.Sprintf("%s_%s", clusterNameID, sessionID) + // Get task attempts by task ID - taskAttempts, found := s.eventHandler.GetTaskByID(clusterNameID, taskID) + taskAttempts, found := s.eventHandler.GetTaskByID(fullKey, taskID) if !found { return "", "", fmt.Errorf("task not found: task_id=%s", taskID) } @@ -316,8 +321,13 @@ func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID // resolveActorLogFilename resolves log file for an actor by querying actor events. // This mirrors Ray Dashboard's _resolve_actor_filename logic. func (s *ServerHandler) resolveActorLogFilename(clusterNameID, sessionID, actorID, suffix string) (nodeID, filename string, err error) { + // Construct full cluster session key for event lookup + // We append the sessionID to the clusterNameID (which is "name_namespace") + // to match the key format used by utils.BuildClusterSessionKey. 
+ fullKey := fmt.Sprintf("%s_%s", clusterNameID, sessionID) + // Get actor by actor ID - actor, found := s.eventHandler.GetActorByID(clusterNameID, actorID) + actor, found := s.eventHandler.GetActorByID(fullKey, actorID) if !found { return "", "", fmt.Errorf("actor not found: actor_id=%s", actorID) } From aba1b9ed180b84ad8c02b5b02b82e45e4f649806 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 19:54:28 +0800 Subject: [PATCH 35/37] test: filename header use no "" Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 6a627bf1284..2c637d726bc 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -587,7 +587,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) contentDisposition := resp.Header.Get("Content-Disposition") g.Expect(contentDisposition).To(ContainSubstring("attachment")) - g.Expect(contentDisposition).To(ContainSubstring(fmt.Sprintf("filename=\"%s\"", customFilename))) + g.Expect(contentDisposition).To(ContainSubstring(fmt.Sprintf("filename=%s", customFilename))) }) test.T().Run("filter_ansi_code behavior", func(t *testing.T) { From 145ab04fe16aeb7708cde989626e6ca634fc6b4c Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 3 Feb 2026 20:22:31 +0800 Subject: [PATCH 36/37] fix: redundant err and correct regex for base64 Signed-off-by: machichima --- historyserver/pkg/historyserver/router.go | 2 +- historyserver/pkg/utils/utils.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/historyserver/pkg/historyserver/router.go b/historyserver/pkg/historyserver/router.go index 3f9cc013ca4..15d1277df73 100644 --- a/historyserver/pkg/historyserver/router.go +++ b/historyserver/pkg/historyserver/router.go @@ 
-772,7 +772,7 @@ func (s *ServerHandler) getNodeLogFile(req *restful.Request, resp *restful.Respo }) if disposition == "" { - logrus.Errorf("Failed to format Content-Disposition header for filename %q: %v", downloadFilename, err) + logrus.Errorf("Failed to format Content-Disposition header for filename %q", downloadFilename) // Fallback to the default filename disposition = fmt.Sprintf("attachment; filename=\"%s\"", DEFAULT_DOWNLOAD_FILENAME) diff --git a/historyserver/pkg/utils/utils.go b/historyserver/pkg/utils/utils.go index 9cab5f36c90..ba5c8820f26 100644 --- a/historyserver/pkg/utils/utils.go +++ b/historyserver/pkg/utils/utils.go @@ -250,7 +250,7 @@ func GetRayNodeID() (string, error) { // It tries RawURLEncoding first (Ray's default), falling back to StdEncoding if that fails. func ConvertBase64ToHex(id string) (string, error) { // Check if already hex (only [0-9a-f]) - if matched, _ := regexp.MatchString("^[0-9a-f]+$", id); matched { + if matched, _ := regexp.MatchString("^[0-9a-fA-F]+$", id); matched { return id, nil } From 9980d22f8b848f312cf9bb67cafb5204e59af419 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 4 Feb 2026 20:23:12 +0800 Subject: [PATCH 37/37] fix: properly encode url parameter for task and actor id Signed-off-by: machichima --- historyserver/test/e2e/historyserver_test.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/historyserver/test/e2e/historyserver_test.go b/historyserver/test/e2e/historyserver_test.go index 2c637d726bc..c97e70cb06f 100644 --- a/historyserver/test/e2e/historyserver_test.go +++ b/historyserver/test/e2e/historyserver_test.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "net/http" + "net/url" "regexp" "testing" @@ -324,7 +325,7 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names for _, taskID := range taskIDs { LogWithTimestamp(t, "Testing task_id: %s", taskID) - url := fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, taskID) + url := 
fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, url.QueryEscape(taskID)) resp, err := client.Get(url) if err != nil { lastError = fmt.Sprintf("HTTP error for task %s: %v", taskID, err) @@ -363,7 +364,7 @@ func testLogFileEndpointLiveCluster(test Test, g *WithT, namespace *corev1.Names for _, actorID := range actorIDs { LogWithTimestamp(t, "Testing actor_id: %s", actorID) - url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, actorID) + url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, url.QueryEscape(actorID)) resp, err := client.Get(url) if err != nil { lastError = fmt.Sprintf("HTTP error for actor %s: %v", actorID, err) @@ -665,7 +666,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names for _, taskID := range taskIDs { LogWithTimestamp(t, "Testing task_id: %s", taskID) - url := fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, taskID) + url := fmt.Sprintf("%s%s?task_id=%s", historyServerURL, EndpointLogFile, url.QueryEscape(taskID)) resp, err := client.Get(url) if err != nil { lastError = fmt.Sprintf("HTTP error for task %s: %v", taskID, err) @@ -704,7 +705,7 @@ func testLogFileEndpointDeadCluster(test Test, g *WithT, namespace *corev1.Names for _, actorID := range actorIDs { LogWithTimestamp(t, "Testing actor_id: %s", actorID) - url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, actorID) + url := fmt.Sprintf("%s%s?actor_id=%s", historyServerURL, EndpointLogFile, url.QueryEscape(actorID)) resp, err := client.Get(url) if err != nil { lastError = fmt.Sprintf("HTTP error for actor %s: %v", actorID, err)