Skip to content
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
3d9bae4
add meta file
popojk Jan 23, 2026
f2633bf
add metadata collect logic
popojk Jan 26, 2026
a119251
add dataset collect logic
popojk Jan 27, 2026
a620950
minor fix
popojk Jan 27, 2026
896d996
add MetaKeyInfo helper for metadata retrieval.
fweilun Jan 27, 2026
40fd1fd
Merge branch 'master' into add_metadata_write_logic_in_history_server
fweilun Jan 27, 2026
e478fa9
nit fix
popojk Jan 28, 2026
97a442d
Merge remote-tracking branch 'origin/add_metadata_write_logic_in_hist…
popojk Jan 28, 2026
f91d13f
refactor
popojk Jan 28, 2026
f8ce56e
check nil for MetaKeyInfo function
fweilun Jan 28, 2026
1ed1ea0
correct MetaKeyInfo path.
fweilun Jan 28, 2026
1c1c361
fix potential meta collector bug
popojk Jan 29, 2026
688c3eb
Merge remote-tracking branch 'origin/add_metadata_write_logic_in_hist…
popojk Jan 29, 2026
750d7a8
refactor meta.go code
popojk Jan 29, 2026
1f665f6
add meta collect enable and dashboard url envvar
popojk Feb 1, 2026
0f34112
fix url info cache issue
popojk Feb 1, 2026
6e1ce62
fix e2e test error
popojk Feb 1, 2026
e58dae8
resolve conflict
popojk Feb 1, 2026
a2b9fc8
fix meta collect enable logic
popojk Feb 1, 2026
89db5fd
add demo script
popojk Feb 1, 2026
2c49cd6
add enable meta collect config to ray cluster yaml
popojk Feb 1, 2026
d603e09
resolve conflict
popojk Feb 3, 2026
aadca4b
add rayservice test yaml
popojk Feb 5, 2026
3536abe
Merge branch 'master' into add_metadata_write_logic_in_history_server
400Ping Feb 5, 2026
a5cd2a0
fix ci error
400Ping Feb 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 27 additions & 8 deletions historyserver/cmd/collector/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ import (
"context"
"encoding/json"
"flag"
"fmt"
"os"
"path"
"strconv"
"time"

"github.com/sirupsen/logrus"
Expand All @@ -28,6 +30,7 @@ func main() {
logBatching := 1000
eventsPort := 8080
pushInterval := time.Minute
supportUnsupportedData := true
runtimeClassConfigPath := "/var/collector-config/data"

flag.StringVar(&role, "role", "Worker", "")
Expand All @@ -42,6 +45,20 @@ func main() {

flag.Parse()

// Read SUPPORT_RAY_EVENT_UNSUPPORTED_DATA environment variable
if envValue := os.Getenv("SUPPORT_RAY_EVENT_UNSUPPORTED_DATA"); envValue != "" {
if parsed, err := strconv.ParseBool(envValue); err == nil {
supportUnsupportedData = parsed
} else {
logrus.Warnf("Invalid value for SUPPORT_RAY_EVENT_UNSUPPORTED_DATA: %s, using default: %v", envValue, supportUnsupportedData)
}
}

dashboardAddress := os.Getenv("RAY_DASHBOARD_ADDRESS")
if dashboardAddress == "" {
panic(fmt.Errorf("missing RAY_DASHBOARD_ADDRESS in environment variables"))
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worker nodes panic when RAY_DASHBOARD_ADDRESS is unset

Medium Severity

The collector unconditionally panics if RAY_DASHBOARD_ADDRESS is not set, regardless of node role. However, the dashboard address is only needed when SupportRayEventUnSupportData is enabled, which only applies to Head nodes. Worker nodes will crash unnecessarily if this environment variable is missing, even though they never use it for metadata collection.

Fix in Cursor Fix in Web


sessionDir, err := utils.GetSessionDir()
if err != nil {
panic("Failed to get session dir: " + err.Error())
Expand Down Expand Up @@ -73,14 +90,16 @@ func main() {
}

globalConfig := types.RayCollectorConfig{
RootDir: rayRootDir,
SessionDir: sessionDir,
RayNodeName: rayNodeId,
Role: role,
RayClusterName: rayClusterName,
RayClusterID: rayClusterId,
PushInterval: pushInterval,
LogBatching: logBatching,
RootDir: rayRootDir,
SessionDir: sessionDir,
RayNodeName: rayNodeId,
Role: role,
RayClusterName: rayClusterName,
RayClusterID: rayClusterId,
PushInterval: pushInterval,
LogBatching: logBatching,
DashboardAddress: dashboardAddress,
SupportRayEventUnSupportData: supportUnsupportedData,
}
logrus.Info("Using collector config: ", globalConfig)

Expand Down
4 changes: 4 additions & 0 deletions historyserver/config/raycluster-azureblob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ spec:
image: collector:v0.1.0
imagePullPolicy: IfNotPresent
env:
- name: RAY_DASHBOARD_ADDRESS
value: "http://localhost:8265"
# reference: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite#connect-to-the-emulator-by-using-the-azure-storage-explorer
- name: AZURE_STORAGE_CONNECTION_STRING
value: "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite-service.azurite-dev.svc.cluster.local:10000/devstoreaccount1;"
Expand Down Expand Up @@ -161,6 +163,8 @@ spec:
image: collector:v0.1.0
imagePullPolicy: IfNotPresent
env:
- name: RAY_DASHBOARD_ADDRESS
value: "http://raycluster-historyserver-head-svc:8265"
# reference: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite#connect-to-the-emulator-by-using-the-azure-storage-explorer
- name: AZURE_STORAGE_CONNECTION_STRING
value: "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite-service.azurite-dev.svc.cluster.local:10000/devstoreaccount1;"
Expand Down
4 changes: 4 additions & 0 deletions historyserver/config/raycluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ spec:
image: collector:v0.1.0
imagePullPolicy: IfNotPresent
env:
- name: RAY_DASHBOARD_ADDRESS
value: "http://localhost:8265"
- name: S3DISABLE_SSL
value: "true"
- name: AWS_S3ID
Expand Down Expand Up @@ -182,6 +184,8 @@ spec:
image: collector:v0.1.0
imagePullPolicy: IfNotPresent
env:
- name: RAY_DASHBOARD_ADDRESS
value: "http://raycluster-historyserver-head-svc:8265"
- name: AWS_S3ID
value: minioadmin
- name: AWS_S3SECRET
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,34 @@ import (
"github.com/fsnotify/fsnotify"
"github.com/sirupsen/logrus"

"github.com/ray-project/kuberay/historyserver/pkg/collector/types"
"github.com/ray-project/kuberay/historyserver/pkg/storage"
"github.com/ray-project/kuberay/historyserver/pkg/utils"
)

type RayLogHandler struct {
Writer storage.StorageWriter
LogFiles chan string
HttpClient *http.Client
ShutdownChan chan struct{}
logFilePaths map[string]bool
MetaDir string
RayClusterName string
LogDir string
RayNodeName string
RayClusterID string
RootDir string
SessionDir string
prevLogsDir string
persistCompleteLogsDir string
PushInterval time.Duration
LogBatching int
filePathMu sync.Mutex
EnableMeta bool
Writer storage.StorageWriter
LogFiles chan string
HttpClient *http.Client
ShutdownChan chan struct{}
logFilePaths map[string]bool
MetaDir string
RayClusterName string
LogDir string
RayNodeName string
RayClusterID string
RootDir string
SessionDir string
prevLogsDir string
persistCompleteLogsDir string
PushInterval time.Duration
LogBatching int
filePathMu sync.Mutex
EnableMeta bool
DashboardAddress string
SupportRayEventUnSupportData bool
MetaCommonUrlInfo []*types.UrlInfo
JobsUrlInfo *types.UrlInfo
}

func (r *RayLogHandler) Start(stop <-chan struct{}) error {
Expand Down Expand Up @@ -73,6 +78,10 @@ func (r *RayLogHandler) Run(stop <-chan struct{}) error {
if r.EnableMeta {
go r.WatchSessionLatestLoops() // Watch session_latest symlink changes
}
//Todo(alex): This should be removed when Ray core implemented events for placement groups, applications, and datasets
if r.SupportRayEventUnSupportData {
go r.PersistMetaLoop(stop)
}

select {
case <-sigChan:
Expand Down
Loading