-
Notifications
You must be signed in to change notification settings - Fork 32
Add cbatch --signal #781
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Add cbatch --signal #781
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -132,6 +132,11 @@ enum InteractiveTaskType { | |
| Crun = 1; | ||
| } | ||
|
|
||
| message SignalParam { | ||
| int32 signal_number = 1; | ||
| uint32 seconds_before_kill = 2; | ||
| } | ||
|
|
||
| message TaskToCtld { | ||
| /* -------- Fields that are set at the submission time. ------- */ | ||
| google.protobuf.Duration time_limit = 1; | ||
|
|
@@ -181,6 +186,7 @@ message TaskToCtld { | |
| } | ||
| repeated License licenses_count = 40; | ||
| bool is_licenses_or = 41; | ||
| optional SignalParam signal_param = 42; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 同上修改 |
||
| } | ||
|
|
||
| message TaskInEmbeddedDb { | ||
|
|
@@ -335,6 +341,7 @@ message StepToD { | |
| double cpus_per_task = 24; | ||
|
|
||
| bool get_user_env = 25; | ||
| optional SignalParam signal_param = 26; | ||
|
|
||
| // Not used now. | ||
| string extra_attr = 29; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -458,6 +458,15 @@ crane::grpc::StepToD DaemonStepInCtld::GetStepToD( | |
| step_to_d.mutable_container_meta()->CopyFrom( | ||
| crane::grpc::ContainerTaskAdditionalMeta(container_meta.value())); | ||
|
|
||
| if (this->job->TaskToCtld().has_signal_param()) { | ||
| step_to_d.mutable_signal_param()->CopyFrom( | ||
| this->job->TaskToCtld().signal_param()); | ||
| auto signal_param = step_to_d.signal_param(); | ||
| CRANE_INFO(" seconds_before_kill {} signal_num {} ", | ||
| signal_param.seconds_before_kill(), | ||
| signal_param.signal_number()); | ||
|
Comment on lines
+465
to
+467
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 打印这个的目的是? |
||
| } | ||
|
|
||
| return step_to_d; | ||
| } | ||
|
|
||
|
|
@@ -797,6 +806,14 @@ crane::grpc::StepToD CommonStepInCtld::GetStepToD( | |
| auto* mutable_meta = step_to_d.mutable_container_meta(); | ||
| mutable_meta->CopyFrom(StepToCtld().container_meta()); | ||
| } | ||
| if (this->job->TaskToCtld().has_signal_param()) { | ||
| step_to_d.mutable_signal_param()->CopyFrom( | ||
| this->job->TaskToCtld().signal_param()); | ||
| auto signal_param = step_to_d.signal_param(); | ||
| CRANE_INFO(" common seconds_before_kill {} signal_num {} ", | ||
| signal_param.seconds_before_kill(), | ||
| signal_param.signal_number()); | ||
|
Comment on lines
+813
to
+815
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 同上,这种没必要打印 |
||
| } | ||
|
|
||
| return step_to_d; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1593,6 +1593,7 @@ void TaskManager::TaskFinish_(task_id_t task_id, | |
| bool orphaned = m_step_.orphaned; | ||
| if (m_step_.AllTaskFinished()) { | ||
| DelTerminationTimer_(); | ||
| DelSignalTimer_(); | ||
| m_step_.StopCforedClient(); | ||
| if (!orphaned) { | ||
| g_craned_client->StepStatusChangeAsync(new_status, exit_code, | ||
|
|
@@ -1952,6 +1953,7 @@ void TaskManager::EvCleanChangeTaskTimeLimitQueueCb_() { | |
| } | ||
| // Delete the old timer. | ||
| DelTerminationTimer_(); | ||
| DelSignalTimer_(); | ||
|
|
||
| absl::Time start_time = | ||
| absl::FromUnixSeconds(m_step_.GetStep().start_time().seconds()); | ||
|
|
@@ -1965,8 +1967,26 @@ void TaskManager::EvCleanChangeTaskTimeLimitQueueCb_() { | |
| m_terminate_task_async_handle_->send(); | ||
| } else { | ||
| // If the task haven't timed out, set up a new timer. | ||
| AddTerminationTimer_( | ||
| ToInt64Seconds((new_time_limit - (absl::Now() - start_time)))); | ||
| int64_t new_sec = | ||
| ToInt64Seconds(new_time_limit - (absl::Now() - start_time)); | ||
| AddTerminationTimer_(new_sec); | ||
|
|
||
| if (m_step_.GetStep().has_signal_param()) { | ||
| auto signal_param = m_step_.GetStep().signal_param(); | ||
| int64_t signal_sec = new_sec - signal_param.seconds_before_kill(); | ||
| if (signal_sec > 0) { | ||
| int signal_num = signal_param.signal_number(); | ||
| AddSignalTimer_(signal_sec, signal_num); | ||
| CRANE_INFO( | ||
| "Add a new signal timer of seconds_before_kill {} signal_num " | ||
| "{} new seconds {}, time_limit {}", | ||
| signal_param.seconds_before_kill(), signal_param.signal_number(), | ||
| signal_sec, new_sec); | ||
|
Comment on lines
+1981
to
+1984
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这个日志不对啊,改一下吧: |
||
| } else { | ||
| CRANE_WARN("Signal offset {} >= time_limit {}, skipping signal timer", | ||
| signal_param.seconds_before_kill(), new_sec); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| elem.ok_prom.set_value(CraneErrCode::SUCCESS); | ||
|
|
@@ -2000,7 +2020,27 @@ void TaskManager::EvGrpcExecuteTaskCb_() { | |
| // so we move it outside the multithreading part. | ||
| int64_t sec = m_step_.GetStep().time_limit().seconds(); | ||
| AddTerminationTimer_(sec); | ||
| CRANE_INFO("Add a timer of {} seconds", sec); | ||
| CRANE_INFO("Add a timer of {} seconds {}", sec, | ||
| m_step_.GetStep().has_signal_param()); | ||
|
Comment on lines
+2023
to
+2024
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这个日志被改错了,这个地方是打印 timelimit,怎么把 Signal 加进去了?保留原样,下面的分支会打印 signal 相关的信息了。 |
||
| if (m_step_.GetStep().has_signal_param()) { | ||
| auto signal_param = m_step_.GetStep().signal_param(); | ||
| int64_t signal_sec = sec - signal_param.seconds_before_kill(); | ||
| if (signal_sec > 0) { | ||
| int signal_num = signal_param.signal_number(); | ||
| AddSignalTimer_(signal_sec, signal_num); | ||
| CRANE_INFO( | ||
| "Add a signal timer of seconds_before_kill {} signal_num {} for " | ||
| "job #{}", | ||
| signal_param.seconds_before_kill(), signal_param.signal_number(), | ||
|
Comment on lines
+2032
to
+2034
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 日志的问题同上。 |
||
| m_step_.GetStep().job_id()); | ||
|
Comment on lines
+2031
to
+2035
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 前面已经有signal_sec和signal_num了,后面没必要再从 proto 里面拿了。另外 jobid 直接用 g_config.JobId 更方便。 |
||
| } else { | ||
| CRANE_WARN( | ||
| "Signal offset {} >= time_limit {} for job #{}, skipping signal " | ||
| "timer", | ||
| signal_param.seconds_before_kill(), sec, | ||
| m_step_.GetStep().job_id()); | ||
| } | ||
| } | ||
|
|
||
| m_step_.pwd.Init(m_step_.uid); | ||
| if (!m_step_.pwd.Valid()) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SignalOption