Skip to content

[core][refactor] Always use SetTaskStatus for task status transitions #52637

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 45 additions & 43 deletions src/ray/core_worker/task_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -888,10 +888,11 @@ void TaskManager::CompletePendingTask(const TaskID &task_id,
it->second.num_successful_executions++;

if (is_application_error) {
SetTaskStatus(it->second,
rpc::TaskStatus::FAILED,
gcs::GetRayErrorInfo(rpc::ErrorType::TASK_EXECUTION_EXCEPTION,
reply.task_execution_error()));
SetTaskStatus(
it->second,
rpc::TaskStatus::FAILED,
worker::TaskStatusEvent::TaskStateUpdate(gcs::GetRayErrorInfo(
rpc::ErrorType::TASK_EXECUTION_EXCEPTION, reply.task_execution_error())));
} else {
SetTaskStatus(it->second, rpc::TaskStatus::FINISHED);
}
Expand Down Expand Up @@ -1060,12 +1061,14 @@ void TaskManager::FailPendingTask(const TaskID &task_id,
// to exit and not be marked as failure.
SetTaskStatus(it->second, rpc::TaskStatus::FINISHED);
} else {
auto error_info =
(ray_error_info == nullptr
? gcs::GetRayErrorInfo(error_type,
(status != nullptr ? status->ToString() : ""))
: *ray_error_info);
SetTaskStatus(it->second,
rpc::TaskStatus::FAILED,
(ray_error_info == nullptr
? gcs::GetRayErrorInfo(
error_type, (status != nullptr ? status->ToString() : ""))
: *ray_error_info));
worker::TaskStatusEvent::TaskStateUpdate(error_info));
}
submissible_tasks_.erase(it);
num_pending_tasks_--;
Expand Down Expand Up @@ -1417,15 +1420,9 @@ void TaskManager::MarkTaskWaitingForExecution(const TaskID &task_id,
RAY_CHECK(it->second.GetStatus() == rpc::TaskStatus::PENDING_NODE_ASSIGNMENT)
<< ", task ID = " << it->first << ", status = " << it->second.GetStatus();
it->second.SetNodeId(node_id);
it->second.SetStatus(rpc::TaskStatus::SUBMITTED_TO_WORKER);
RAY_UNUSED(task_event_buffer_.RecordTaskStatusEventIfNeeded(
it->second.spec.TaskId(),
it->second.spec.JobId(),
it->second.spec.AttemptNumber(),
it->second.spec,
rpc::TaskStatus::SUBMITTED_TO_WORKER,
/* include_task_info */ false,
worker::TaskStatusEvent::TaskStateUpdate(node_id, worker_id)));
SetTaskStatus(it->second,
rpc::TaskStatus::SUBMITTED_TO_WORKER,
worker::TaskStatusEvent::TaskStateUpdate(node_id, worker_id));
}

void TaskManager::MarkTaskRetryOnResubmit(TaskEntry &task_entry) {
Expand All @@ -1435,51 +1432,56 @@ void TaskManager::MarkTaskRetryOnResubmit(TaskEntry &task_entry) {
task_entry.MarkRetry();

// Mark the new status and also include task spec info for the new attempt.
task_entry.SetStatus(rpc::TaskStatus::PENDING_ARGS_AVAIL);
//
// NOTE(rickyx): We only increment the AttemptNumber on the task spec when
// `retry_task_callback_` is invoked. In order to record the correct status change for
// the new task attempt, we pass the attempt number explicitly.
RAY_UNUSED(task_event_buffer_.RecordTaskStatusEventIfNeeded(
task_entry.spec.TaskId(),
task_entry.spec.JobId(),
task_entry.spec.AttemptNumber() + 1,
task_entry.spec,
rpc::TaskStatus::PENDING_ARGS_AVAIL,
/* include_task_info */ true));
SetTaskStatus(task_entry,
rpc::TaskStatus::PENDING_ARGS_AVAIL,
/* state_update */ std::nullopt,
/* include_task_info */ true,
task_entry.spec.AttemptNumber() + 1);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a follow-up PR, let's move this logic into retry_task_callback so that we don't need to manually add 1 to AttemptNumber().

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

follow up: #52695

}

void TaskManager::MarkTaskRetryOnFailed(TaskEntry &task_entry,
const rpc::RayErrorInfo &error_info) {
RAY_CHECK(task_entry.IsPending());

// Record the old attempt status as FAILED.
SetTaskStatus(task_entry, rpc::TaskStatus::FAILED, error_info);
SetTaskStatus(task_entry,
rpc::TaskStatus::FAILED,
worker::TaskStatusEvent::TaskStateUpdate(error_info));
task_entry.MarkRetry();

// Mark the new status and also include task spec info for the new attempt.
task_entry.SetStatus(rpc::TaskStatus::PENDING_ARGS_AVAIL);
RAY_UNUSED(task_event_buffer_.RecordTaskStatusEventIfNeeded(
task_entry.spec.TaskId(),
task_entry.spec.JobId(),
task_entry.spec.AttemptNumber() + 1,
task_entry.spec,
rpc::TaskStatus::PENDING_ARGS_AVAIL,
/* include_task_info */ true));
SetTaskStatus(task_entry,
rpc::TaskStatus::PENDING_ARGS_AVAIL,
/* state_update */ std::nullopt,
/* include_task_info */ true,
task_entry.spec.AttemptNumber() + 1);
}

void TaskManager::SetTaskStatus(
TaskEntry &task_entry,
rpc::TaskStatus status,
const std::optional<const rpc::RayErrorInfo> &error_info) {
std::optional<worker::TaskStatusEvent::TaskStateUpdate> state_update,
bool include_task_info,
std::optional<int32_t> attempt_number) {
RAY_LOG(DEBUG).WithField(task_entry.spec.TaskId())
<< "Setting task status from " << task_entry.GetStatus() << " to " << status;
task_entry.SetStatus(status);
RAY_UNUSED(task_event_buffer_.RecordTaskStatusEventIfNeeded(
task_entry.spec.TaskId(),
task_entry.spec.JobId(),
task_entry.spec.AttemptNumber(),
task_entry.spec,
status,
/* include_task_info */ false,
worker::TaskStatusEvent::TaskStateUpdate(error_info)));

int32_t attempt_number_to_record =
attempt_number.value_or(task_entry.spec.AttemptNumber());
auto state_update_to_record =
state_update.value_or(worker::TaskStatusEvent::TaskStateUpdate());
RAY_UNUSED(task_event_buffer_.RecordTaskStatusEventIfNeeded(task_entry.spec.TaskId(),
task_entry.spec.JobId(),
attempt_number_to_record,
task_entry.spec,
status,
include_task_info,
state_update_to_record));
}

std::unordered_map<rpc::LineageReconstructionTask, uint64_t>
Expand Down
11 changes: 9 additions & 2 deletions src/ray/core_worker/task_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -734,11 +734,18 @@ class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterfa
///
/// \param task_entry corresponding TaskEntry of a task to record the event.
/// \param status new status.
/// \param error_info Optional error info for task execution.
/// \param state_update The state update for the task status change event.
/// \param include_task_info Whether to include task info in the task status change
/// event.
/// \param attempt_number The attempt number to record in the task status change
/// event. If not specified, the current attempt number of the task is used.
void SetTaskStatus(
TaskEntry &task_entry,
rpc::TaskStatus status,
const std::optional<const rpc::RayErrorInfo> &error_info = absl::nullopt);
std::optional<worker::TaskStatusEvent::TaskStateUpdate> state_update = std::nullopt,
bool include_task_info = false,
std::optional<int32_t> attempt_number = std::nullopt);

/// Update the task entry for the task attempt to reflect retry on resubmit.
///
Expand Down