Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
1eb0afb
test: add more test case for live cluster e2e test
machichima Jan 29, 2026
64df48d
test: more test case for dead cluster e2e test
machichima Jan 29, 2026
4a4f729
refactor: clean up comments
machichima Jan 29, 2026
0d89338
feat: move query options to struct&add more options
machichima Jan 30, 2026
5dc5f12
feat: implement attempt_numbe, download_file, filter_ansi_code function
machichima Jan 30, 2026
6de7601
feat: e2e test for attempt_numbe, download_file, filter_ansi_code
machichima Jan 30, 2026
8d11aff
fix: update live cluster invalide param status code
machichima Jan 30, 2026
ff75e12
feat: add id related param&implement task_id+suffix
machichima Jan 31, 2026
4f56280
test: remove eventual & print body when status code mismatch
machichima Jan 31, 2026
e01c113
feat: logic to find logs based on worker ID
machichima Jan 31, 2026
41e5970
test: for suffix and task_id
machichima Jan 31, 2026
6497225
fix: update rayjob.yaml to ensure produce log.out file
machichima Jan 31, 2026
6b66a30
feat+test: support actor_id query
machichima Jan 31, 2026
d204ef1
feat+test: support pid query
machichima Jan 31, 2026
f739e5a
docs: todo comment for submission_id
machichima Feb 1, 2026
e562958
feat+test: add node_ip support
machichima Feb 1, 2026
f2f0161
test: move pid invalid test to logFileTestCases
machichima Feb 1, 2026
e0151db
fix: add download_filename rather than download_file flag
machichima Feb 1, 2026
892d41a
refactor: Base64 to hex conversion logic to util function
machichima Feb 1, 2026
d254463
fix: skip convert to hex if already is
machichima Feb 1, 2026
ccd8243
fix: close reader to prevent connection leak
machichima Feb 1, 2026
f4dfefb
refactor: remove duplicate import
machichima Feb 1, 2026
a6b76ad
test: fix and refactor test
machichima Feb 1, 2026
4b7a56b
fix: append suffix when deal with filename
machichima Feb 1, 2026
4bd16da
fix: remove duplicate suffix validation
machichima Feb 1, 2026
f8dd9ee
refactor: remove not yet implemented comment
machichima Feb 1, 2026
fe49c92
feat+test: add logs/stream endpoint
machichima Feb 1, 2026
200d56c
fix: remove redundant status code check
machichima Feb 2, 2026
c41d874
fix: update format of worker log in comment
machichima Feb 3, 2026
dd8a77f
Merge branch 'master' of github.com:ray-project/kuberay into improve-…
machichima Feb 3, 2026
d9b7dba
feat: more robust ConvertBase64ToHex and centralize the logic
machichima Feb 3, 2026
f1799d1
fix: return original id if cannot decode
machichima Feb 3, 2026
e848198
fix: escape filename correctly
machichima Feb 3, 2026
4e30f18
fix: add sessionID == "" check in resolveActorLogFilename
machichima Feb 3, 2026
1432037
fix: use correct cluster name to query task and actor id
machichima Feb 3, 2026
aba1b9e
test: filename header use no ""
machichima Feb 3, 2026
145ab04
fix: redundant err and correct regex for base64
machichima Feb 3, 2026
9980d22
fix: properly encode url parameter for task and actor id
machichima Feb 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 202 additions & 4 deletions historyserver/pkg/historyserver/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,18 @@ package historyserver
import (
"bufio"
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net/http"
"path"
"regexp"
"sort"
"strings"

"github.com/emicklei/go-restful/v3"
eventtypes "github.com/ray-project/kuberay/historyserver/pkg/eventserver/types"
"github.com/ray-project/kuberay/historyserver/pkg/utils"
"github.com/sirupsen/logrus"
)
Expand All @@ -26,6 +29,15 @@ const (
MAX_LOG_LIMIT = 10000
)

// ANSI escape code pattern for filtering colored output from logs.
// Matches SGR (Select Graphic Rendition) sequences like \x1b[31m (red),
// \x1b[0m (reset), \x1b[1;32m (bold green), and the parameterless reset
// form \x1b[m — the previous `+` quantifier missed the last one.
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)

// filterAnsiEscapeCodes removes ANSI SGR escape sequences from log content,
// leaving the plain text intact. Non-SGR escape sequences are not touched.
func filterAnsiEscapeCodes(content []byte) []byte {
	return ansiEscapePattern.ReplaceAll(content, nil)
}

func (s *ServerHandler) listClusters(limit int) []utils.ClusterInfo {
// Initial continuation marker
logrus.Debugf("Prepare to get list clusters info ...")
Expand Down Expand Up @@ -73,18 +85,39 @@ func (s *ServerHandler) _getNodeLogs(rayClusterNameID, sessionId, nodeId, dir st
return json.Marshal(ret)
}

func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID, nodeID, filename string, maxLines int) ([]byte, error) {
func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID string, options GetLogFileOptions) ([]byte, error) {
// Resolve node_id and filename based on options
nodeID, filename, err := s.resolveLogFilename(rayClusterNameID, sessionID, options)
if err != nil {
return nil, utils.NewHTTPError(err, http.StatusBadRequest)
}

// Build log path
logPath := path.Join(sessionID, "logs", nodeID, filename)

// Append attempt_number if specified and not using task_id
// (task_id already includes attempt_number in resolution)
if options.AttemptNumber > 0 && options.TaskID == "" {
logPath = fmt.Sprintf("%s.%d", logPath, options.AttemptNumber)
}

reader := s.reader.GetContent(rayClusterNameID, logPath)

if reader == nil {
return nil, utils.NewHTTPError(fmt.Errorf("log file not found: %s", logPath), http.StatusNotFound)
}

maxLines := options.Lines
if maxLines < 0 {
// -1 means read all lines, match Ray Dashboard API behavior
return io.ReadAll(reader)
// -1 means read all lines
content, err := io.ReadAll(reader)
if err != nil {
return nil, err
}
if options.FilterAnsiCode {
content = filterAnsiEscapeCodes(content)
}
return content, nil
}

if maxLines == 0 {
Expand Down Expand Up @@ -132,7 +165,172 @@ func (s *ServerHandler) _getNodeLogFile(rayClusterNameID, sessionID, nodeID, fil
lines = append(buffer[start:], buffer[:start]...)
}

return []byte(strings.Join(lines, "\n")), nil
result := []byte(strings.Join(lines, "\n"))
if options.FilterAnsiCode {
result = filterAnsiEscapeCodes(result)
}

return result, nil
}

// resolveLogFilename resolves the log file node_id and filename based on the provided options.
// This mirrors Ray Dashboard's resolve_filename logic.
// The sessionID parameter is required for task_id resolution to search worker log files.
func (s *ServerHandler) resolveLogFilename(clusterNameID, sessionID string, options GetLogFileOptions) (nodeID, filename string, err error) {
	// The suffix selects between stdout ("out") and stderr ("err") logs and is
	// always validated, even when an explicit filename makes it unused.
	if options.Suffix != "out" && options.Suffix != "err" {
		return "", "", fmt.Errorf("invalid suffix: %s (must be 'out' or 'err')", options.Suffix)
	}

	// Dispatch on whichever identifying option the caller supplied, in the
	// same precedence order as Ray Dashboard: filename, task_id, actor_id, pid.
	switch {
	case options.Filename != "":
		// An explicit filename wins and the suffix is ignored, but the node
		// hosting the file must be named so the log path can be built.
		if options.NodeID == "" {
			return "", "", fmt.Errorf("node_id is required when filename is provided")
		}
		return options.NodeID, options.Filename, nil

	case options.TaskID != "":
		// Resolve node and worker log filename from recorded task events.
		return s.resolveTaskLogFilename(clusterNameID, sessionID, options.TaskID, options.AttemptNumber, options.Suffix)

	case options.ActorID != "":
		// TODO: not implemented
		return "", "", fmt.Errorf("actor_id resolution not yet implemented")

	case options.PID > 0:
		// TODO: not implemented
		return "", "", fmt.Errorf("pid resolution not yet implemented")

	default:
		return "", "", fmt.Errorf("must provide one of: filename, task_id, actor_id, or pid")
	}
}

// resolveTaskLogFilename resolves log file for a task by querying task events.
// This mirrors Ray Dashboard's _resolve_task_filename logic.
// The sessionID parameter is required for searching worker log files when task_log_info is not available.
//
// It returns the node ID (hex when resolved via the worker-file fallback) and
// the log filename for the requested attempt, or an error when the task,
// attempt, node, or worker information needed to locate the log is missing.
func (s *ServerHandler) resolveTaskLogFilename(clusterNameID, sessionID, taskID string, attemptNumber int, suffix string) (nodeID, filename string, err error) {
	// Get all recorded attempts for this task ID.
	taskAttempts, found := s.eventHandler.GetTaskByID(clusterNameID, taskID)
	if !found {
		return "", "", fmt.Errorf("task not found: task_id=%s", taskID)
	}

	// Find the specific attempt. Index into the slice (instead of ranging by
	// value) so we take the address of the stored element, not of a copy.
	var foundTask *eventtypes.Task
	for i := range taskAttempts {
		if taskAttempts[i].AttemptNumber == attemptNumber {
			foundTask = &taskAttempts[i]
			break
		}
	}

	if foundTask == nil {
		return "", "", fmt.Errorf("task attempt not found: task_id=%s, attempt_number=%d", taskID, attemptNumber)
	}

	// A task only gets a node_id once it has been scheduled somewhere.
	if foundTask.NodeID == "" {
		return "", "", fmt.Errorf("task %s (attempt %d) has no node_id (task not scheduled yet)", taskID, attemptNumber)
	}

	// Actor tasks log to the actor's files; redirect the caller to actor_id.
	if foundTask.ActorID != "" {
		return "", "", fmt.Errorf(
			"for actor task, please query actor log for actor(%s) by providing actor_id query parameter",
			foundTask.ActorID,
		)
	}

	// Without a worker_id we cannot match any worker-*.{out,err} file.
	if foundTask.WorkerID == "" {
		return "", "", fmt.Errorf(
			"task %s (attempt %d) has no worker_id",
			taskID, attemptNumber,
		)
	}

	// Try to use task_log_info if available.
	// NOTE: task_log_info is currently not supported in ray export event, so we will always
	// fallback to following logic. (len() on a nil map is 0, so no nil check is needed.)
	if len(foundTask.TaskLogInfo) > 0 {
		filenameKey := "stdout_file"
		if suffix == "err" {
			filenameKey = "stderr_file"
		}

		if logFilename, ok := foundTask.TaskLogInfo[filenameKey]; ok && logFilename != "" {
			return foundTask.NodeID, logFilename, nil
		}
	}

	// Fallback: Find worker log file by worker_id.
	// Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix}
	// We need to search for files matching this pattern, which requires the
	// session directory to list the node's log folder.
	if sessionID == "" {
		return "", "", fmt.Errorf(
			"task %s (attempt %d) has no task_log_info and sessionID is required to search for worker log files",
			taskID, attemptNumber,
		)
	}

	nodeIDHex, logFilename, err := s.findWorkerLogFile(clusterNameID, sessionID, foundTask.NodeID, foundTask.WorkerID, suffix)
	if err != nil {
		return "", "", fmt.Errorf(
			"failed to find worker log file for task %s (attempt %d, worker_id=%s, node_id=%s): %w",
			taskID, attemptNumber, foundTask.WorkerID, foundTask.NodeID, err,
		)
	}

	return nodeIDHex, logFilename, nil
}

// decodeBase64IDToHex converts a Ray ID from its Base64 event representation
// to the lowercase hex form used in the log directory layout. Ray events use
// URL-safe unpadded Base64, so that alphabet is tried first, with standard
// Base64 as a fallback.
func decodeBase64IDToHex(id string) (string, error) {
	raw, err := base64.RawURLEncoding.DecodeString(id)
	if err != nil {
		// Try standard Base64 if URL-safe fails.
		raw, err = base64.StdEncoding.DecodeString(id)
		if err != nil {
			return "", err
		}
	}
	return fmt.Sprintf("%x", raw), nil
}

// findWorkerLogFile searches for a worker log file by worker_id.
// Worker log files follow the pattern: worker-{worker_id_hex}-{pid}-{worker_startup_token}.{suffix}
// Returns (nodeIDHex, filename, error).
func (s *ServerHandler) findWorkerLogFile(clusterNameID, sessionID, nodeID, workerID, suffix string) (string, string, error) {
	// Ray stores IDs in Base64 (URL-safe) in events, but uses hex in the log
	// directory structure, so convert both IDs before touching the filesystem.
	nodeIDHex, err := decodeBase64IDToHex(nodeID)
	if err != nil {
		return "", "", fmt.Errorf("failed to decode node_id: %w", err)
	}

	workerIDHex, err := decodeBase64IDToHex(workerID)
	if err != nil {
		return "", "", fmt.Errorf("failed to decode worker_id: %w", err)
	}

	// List all files in the node's log directory.
	logPath := path.Join(sessionID, "logs", nodeIDHex)
	files := s.reader.ListFiles(clusterNameID, logPath)

	// Search for files matching pattern: worker-{worker_id_hex}-*.{suffix};
	// the middle segment ({pid}-{startup_token}) is unknown, so match on
	// prefix and suffix only. The first match wins.
	workerPrefix := fmt.Sprintf("worker-%s-", workerIDHex)
	workerSuffix := fmt.Sprintf(".%s", suffix)

	for _, file := range files {
		if strings.HasPrefix(file, workerPrefix) && strings.HasSuffix(file, workerSuffix) {
			return nodeIDHex, file, nil
		}
	}

	return "", "", fmt.Errorf("worker log file not found: worker_id=%s (hex=%s), suffix=%s, searched in %s", workerID, workerIDHex, suffix, logPath)
}

func (s *ServerHandler) GetNodes(rayClusterNameID, sessionId string) ([]byte, error) {
Expand Down
Loading
Loading