-
Notifications
You must be signed in to change notification settings - Fork 32
feat: craned health check #635
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
721a700
cba6664
f594047
1c96465
3a77285
cf02764
38900b7
112f9dc
e8a2a9b
2c9ebf8
78ef006
ba560d2
d0feed7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -648,6 +648,41 @@ void ParseConfig(int argc, char** argv) { | |
| } | ||
| } | ||
|
|
||
| if (config["HealthCheck"]) { | ||
| const auto& health_check_config = config["HealthCheck"]; | ||
| g_config.HealthCheck.Program = | ||
| YamlValueOr(health_check_config["Program"], ""); | ||
| if (g_config.HealthCheck.Program.empty()) { | ||
| CRANE_ERROR("HealthCheckProgram is not configured"); | ||
| std::exit(1); | ||
| } | ||
| g_config.HealthCheck.Interval = | ||
| YamlValueOr<uint64_t>(health_check_config["Interval"], 0L); | ||
| std::string node_state; | ||
| node_state = absl::StripAsciiWhitespace(absl::AsciiStrToLower( | ||
| YamlValueOr(health_check_config["NodeState"], "any"))); | ||
| if (node_state != "any" && node_state != "idle" && | ||
| node_state != "mixed" && node_state != "alloc") { | ||
| CRANE_WARN("HealthCheckNodeState is not valid, reset to any"); | ||
| node_state = "any"; | ||
| } | ||
| if (node_state == "any") { | ||
| g_config.HealthCheck.NodeState = | ||
| Craned::Config::HealthCheckConfig::ANY; | ||
| } else if (node_state == "idle") { | ||
| g_config.HealthCheck.NodeState = | ||
| Craned::Config::HealthCheckConfig::IDLE; | ||
| } else if (node_state == "mixed") { | ||
| g_config.HealthCheck.NodeState = | ||
| Craned::Config::HealthCheckConfig::MIXED; | ||
| } else if (node_state == "alloc") { | ||
| g_config.HealthCheck.NodeState = | ||
| Craned::Config::HealthCheckConfig::ALLOC; | ||
| } | ||
Nativu5 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| g_config.HealthCheck.Cycle = | ||
| YamlValueOr<bool>(health_check_config["Cycle"], false); | ||
| } | ||
|
Comment on lines
+664
to
+684
Contributor
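There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Allow parsing of the `nondrained_idle` node state: the enum defines `NONDRAINED_IDLE` and `CheckNodeState_` handles it, but the config parser treats the string as invalid and resets it to `any`.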
- if (node_state != "any" && node_state != "idle" &&
- node_state != "mixed" && node_state != "alloc") {
+ if (node_state != "any" && node_state != "idle" &&
+ node_state != "mixed" && node_state != "alloc" &&
+ node_state != "nondrained_idle") {
CRANE_WARN("HealthCheckNodeState is not valid, reset to any");
node_state = "any";
}
@@
- } else if (node_state == "alloc") {
+ } else if (node_state == "alloc") {
g_config.HealthCheck.NodeState =
Craned::Config::HealthCheckConfig::ALLOC;
+ } else if (node_state == "nondrained_idle") {
+ g_config.HealthCheck.NodeState =
+ Craned::Config::HealthCheckConfig::NONDRAINED_IDLE;
}🤖 Prompt for AI Agents |
||
|
|
||
| if (config["Plugin"]) { | ||
| const auto& plugin_config = config["Plugin"]; | ||
| g_config.Plugin.Enabled = | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -102,6 +102,15 @@ struct Config { | |
| }; | ||
| ContainerConfig Container; | ||
|
|
||
/// Settings for the periodic node health check, populated from the
/// `HealthCheck` section of the configuration file.
struct HealthCheckConfig {
  /// Node state in which the periodic health check is allowed to run.
  /// Deliberately left as an unscoped enum so that existing references of
  /// the form `HealthCheckConfig::ANY` keep compiling.
  enum NodeStateEnum { IDLE, ALLOC, MIXED, ANY, NONDRAINED_IDLE };

  /// Path of the health check executable. The config parser aborts start-up
  /// when the section is present but this value is empty.
  std::string Program;

  /// Seconds between two periodic checks. The health check thread is only
  /// started when this is greater than zero.
  uint64_t Interval{0};

  /// State filter applied before each periodic run. Defaults to ANY, which
  /// is also what the parser falls back to for a missing or invalid value.
  // NOTE(review): a reviewer asked about selecting several states at once;
  // this field can only hold a single state.
  NodeStateEnum NodeState{ANY};

  /// When true, every wait is drawn uniformly from [1, Interval] seconds
  /// instead of being exactly Interval seconds.
  bool Cycle{false};
};
| HealthCheckConfig HealthCheck; | ||
|
|
||
| struct PluginConfig { | ||
| bool Enabled{false}; | ||
| std::string PlugindSockPath; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -380,6 +380,7 @@ CtldClient::~CtldClient() { | |
| CRANE_TRACE("Waiting for CtldClient thread to finish."); | ||
| if (m_async_send_thread_.joinable()) m_async_send_thread_.join(); | ||
| if (m_uvw_thread_.joinable()) m_uvw_thread_.join(); | ||
| if (m_health_check_thread_.joinable()) m_health_check_thread_.join(); | ||
| } | ||
|
|
||
| void CtldClient::Init() { | ||
|
|
@@ -460,6 +461,24 @@ void CtldClient::InitGrpcChannel(const std::string& server_address) { | |
| // std::unique_ptr will automatically release the dangling stub. | ||
| m_stub_ = CraneCtldForInternal::NewStub(m_ctld_channel_); | ||
|
|
||
| if (g_config.HealthCheck.Interval > 0L) { | ||
| HealthCheck_(); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这个需要马上执行一次吗?这个可能会影响Craned的上线速度吧
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 需求里是需要执行,确保故障节点不会连上。 |
||
| m_health_check_thread_ = std::thread([this] { | ||
| std::mt19937 rng{std::random_device{}()}; | ||
| do { | ||
| uint64_t interval = g_config.HealthCheck.Interval; | ||
| int delay = interval; | ||
| if (g_config.HealthCheck.Cycle) { | ||
| std::uniform_int_distribution<int> dist(1, interval); | ||
| delay = dist(rng); | ||
| } | ||
| std::this_thread::sleep_for(std::chrono::seconds(delay)); | ||
| if (m_stopping_ || !m_stub_) return; | ||
| if (CheckNodeState_()) HealthCheck_(); | ||
| } while (true); | ||
| }); | ||
| } | ||
|
|
||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| m_async_send_thread_ = std::thread([this] { AsyncSendThread_(); }); | ||
| } | ||
|
|
||
|
|
@@ -703,6 +722,119 @@ void CtldClient::SendStatusChanges_() { | |
| } | ||
| } | ||
|
|
||
| void CtldClient::SendHealthCheckResult_(bool is_health) const { | ||
| if (m_stopping_ || !m_stub_) return; | ||
|
|
||
| grpc::ClientContext context; | ||
| crane::grpc::SendHealthCheckResultRequest request; | ||
| google::protobuf::Empty reply; | ||
|
|
||
| request.set_craned_id(g_config.CranedIdOfThisNode); | ||
| request.set_healthy(is_health); | ||
|
|
||
| auto result = m_stub_->SendHealthCheckResult(&context, request, &reply); | ||
| if (!result.ok()) { | ||
| CRANE_ERROR("SendHealthCheckResult failed: is_health={}", is_health); | ||
| } | ||
|
Comment on lines
+725
to
+738
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add an RPC deadline to avoid hanging the health-check thread.
🤖 Prompt for AI Agents |
||
| } | ||
|
|
||
| void CtldClient::HealthCheck_() { | ||
| if (!g_server->ReadyFor(RequestSource::CTLD)) return; | ||
|
|
||
| CRANE_DEBUG("Health checking....."); | ||
L-Xiafeng marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| subprocess_s subprocess{}; | ||
| std::vector<const char*> argv = {g_config.HealthCheck.Program.c_str(), | ||
| nullptr}; | ||
|
|
||
| if (subprocess_create(argv.data(), 0, &subprocess) != 0) { | ||
| CRANE_ERROR( | ||
| "[Craned Subprocess] HealthCheck subprocess creation failed: {}.", | ||
| strerror(errno)); | ||
| SendHealthCheckResult_(false); | ||
| return; | ||
| } | ||
|
|
||
| pid_t pid = subprocess.child; | ||
| int result = 0; | ||
|
|
||
| auto fut = std::async(std::launch::async, | ||
| [pid, &result]() { return waitpid(pid, &result, 0); }); | ||
|
|
||
| bool child_exited = false; | ||
| if (fut.wait_for(std::chrono::milliseconds(MaxHealthCheckWaitTime)) == | ||
| std::future_status::ready) { | ||
| if (fut.get() == pid) { | ||
| child_exited = true; | ||
| } | ||
| } | ||
|
|
||
| auto read_stream = [](std::FILE* f) { | ||
| std::string out; | ||
| char buf[4096]; | ||
| while (std::fgets(buf, sizeof(buf), f)) out.append(buf); | ||
| return out; | ||
| }; | ||
|
|
||
| if (!child_exited) { | ||
| kill(pid, SIGKILL); | ||
| waitpid(pid, &result, 0); | ||
| std::string stdout_str = read_stream(subprocess_stdout(&subprocess)); | ||
| std::string stderr_str = read_stream(subprocess_stderr(&subprocess)); | ||
| CRANE_WARN("HealthCheck: Timeout. stdout: {}, stderr: {}", stdout_str, | ||
| stderr_str); | ||
| SendHealthCheckResult_(false); | ||
| subprocess_destroy(&subprocess); | ||
| return; | ||
| } | ||
|
|
||
| if (subprocess_destroy(&subprocess) != 0) | ||
| CRANE_ERROR("[Craned Subprocess] HealthCheck destroy failed."); | ||
|
|
||
| if (result != 0) { | ||
| std::string stdout_str = read_stream(subprocess_stdout(&subprocess)); | ||
| std::string stderr_str = read_stream(subprocess_stderr(&subprocess)); | ||
| CRANE_WARN("HealthCheck: Failed (exit code:{}). stdout: {}, stderr: {}", | ||
| result, stdout_str, stderr_str); | ||
| SendHealthCheckResult_(false); | ||
| return; | ||
| } | ||
|
|
||
| CRANE_DEBUG("Health check success."); | ||
| SendHealthCheckResult_(true); | ||
| } | ||
|
|
||
| bool CtldClient::CheckNodeState_() { | ||
| if (g_config.HealthCheck.NodeState == Config::HealthCheckConfig::ANY) | ||
| return true; | ||
|
|
||
| grpc::ClientContext context; | ||
| crane::grpc::QueryNodeStateRequest req; | ||
| crane::grpc::QueryNodeStateReply reply; | ||
| req.set_craned_id(g_config.CranedIdOfThisNode); | ||
| auto result = m_stub_->QueryNodeState(&context, req, &reply); | ||
| if (!result.ok() || !reply.ok()) { | ||
| CRANE_ERROR("QueryNodeState failed"); | ||
| return false; | ||
| } | ||
|
|
||
|
Comment on lines
+811
to
+820
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Set a deadline on the QueryNodeState RPC.
🤖 Prompt for AI Agents |
||
| switch (g_config.HealthCheck.NodeState) { | ||
| case Config::HealthCheckConfig::NONDRAINED_IDLE: | ||
| return !reply.drain() && | ||
| reply.state() == crane::grpc::CranedResourceState::CRANE_IDLE; | ||
| case Config::HealthCheckConfig::IDLE: | ||
| return reply.state() == crane::grpc::CranedResourceState::CRANE_IDLE; | ||
| case Config::HealthCheckConfig::MIXED: | ||
| return reply.state() == crane::grpc::CranedResourceState::CRANE_MIX; | ||
| case Config::HealthCheckConfig::ALLOC: | ||
| return reply.state() == crane::grpc::CranedResourceState::CRANE_ALLOC; | ||
| case Config::HealthCheckConfig::ANY: | ||
| break; | ||
| } | ||
|
|
||
| return false; | ||
| } | ||
|
|
||
| bool CtldClient::Ping_() { | ||
| grpc::ClientContext context; | ||
| context.set_deadline(std::chrono::system_clock::now() + | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ALLOC:在 ALLOC 状态(所有 CPU 均已分配)的节点上运行。
ANY:在任意状态的节点上运行。
CYCLE:不是在所有节点上同时运行健康检查程序,而是在整个 HealthCheckInterval 周期内轮流在所有计算节点上执行。可以与其他节点状态选项组合。
IDLE:在 IDLE 状态(空闲)的节点上运行。
NONDRAINED_IDLE:在处于 IDLE 状态且未被 DRAINED 的节点上运行。
MIXED:在 MIXED 状态(部分 CPU 空闲,部分 CPU 已分配)的节点上运行。
START_ONLY:仅在 slurmd 守护进程启动时运行。