-
Notifications
You must be signed in to change notification settings - Fork 6.6k
[core][refactor] Move to_resubmit_
from CoreWorker to TaskManager to avoid an abstraction leak
#52779
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[core][refactor] Move to_resubmit_
from CoreWorker to TaskManager to avoid an abstraction leak
#52779
Changes from 5 commits
640cf4f
a12d6dd
69e0dfc
72ab5ca
097fb78
969cb2c
0c282df
2bec81c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -719,25 +719,14 @@ CoreWorker::CoreWorker(CoreWorkerOptions options, const WorkerID &worker_id) | |
RAY_CHECK_OK(PutInLocalPlasmaStore(object, object_id, /*pin_object=*/true)); | ||
}, | ||
/* retry_task_callback= */ | ||
[this](TaskSpecification &spec, bool object_recovery, uint32_t delay_ms) { | ||
spec.GetMutableMessage().set_attempt_number(spec.AttemptNumber() + 1); | ||
if (!object_recovery) { | ||
// Retry after a delay to emulate the existing Raylet reconstruction | ||
// behaviour. TODO(ekl) backoff exponentially. | ||
RAY_LOG(INFO) << "Will resubmit task after a " << delay_ms | ||
<< "ms delay: " << spec.DebugString(); | ||
absl::MutexLock lock(&mutex_); | ||
TaskToRetry task_to_retry{current_time_ms() + delay_ms, spec}; | ||
to_resubmit_.push(std::move(task_to_retry)); | ||
[this](TaskSpecification &spec) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be renamed to |
||
if (spec.IsActorTask()) { | ||
auto actor_handle = actor_manager_->GetActorHandle(spec.ActorId()); | ||
actor_handle->SetResubmittedActorTaskSpec(spec); | ||
RAY_CHECK_OK(actor_task_submitter_->SubmitTask(spec)); | ||
} else { | ||
if (spec.IsActorTask()) { | ||
auto actor_handle = actor_manager_->GetActorHandle(spec.ActorId()); | ||
actor_handle->SetResubmittedActorTaskSpec(spec); | ||
RAY_CHECK_OK(actor_task_submitter_->SubmitTask(spec)); | ||
} else { | ||
RAY_CHECK(spec.IsNormalTask()); | ||
RAY_CHECK_OK(normal_task_submitter_->SubmitTask(spec)); | ||
} | ||
RAY_CHECK(spec.IsNormalTask()); | ||
RAY_CHECK_OK(normal_task_submitter_->SubmitTask(spec)); | ||
} | ||
}, | ||
push_error_callback, | ||
|
@@ -1358,18 +1347,9 @@ void CoreWorker::ExitIfParentRayletDies() { | |
|
||
void CoreWorker::InternalHeartbeat() { | ||
// Retry tasks. | ||
std::vector<TaskToRetry> tasks_to_resubmit; | ||
{ | ||
absl::MutexLock lock(&mutex_); | ||
const auto current_time = current_time_ms(); | ||
while (!to_resubmit_.empty() && current_time > to_resubmit_.top().execution_time_ms) { | ||
tasks_to_resubmit.emplace_back(to_resubmit_.top()); | ||
to_resubmit_.pop(); | ||
} | ||
} | ||
std::vector<TaskSpecification> tasks_to_retry = task_manager_->PopTasksToRetry(); | ||
|
||
for (auto &task_to_retry : tasks_to_resubmit) { | ||
auto &spec = task_to_retry.task_spec; | ||
for (auto &spec : tasks_to_retry) { | ||
if (spec.IsActorTask()) { | ||
auto actor_handle = actor_manager_->GetActorHandle(spec.ActorId()); | ||
actor_handle->SetResubmittedActorTaskSpec(spec); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1038,12 +1038,26 @@ void TaskManager::RetryTask(TaskEntry *task_entry, | |
bool object_recovery, | ||
uint32_t delay_ms) { | ||
RAY_CHECK(task_entry != nullptr); | ||
|
||
auto &spec = task_entry->spec; | ||
spec.GetMutableMessage().set_attempt_number(spec.AttemptNumber() + 1); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. copy from |
||
SetTaskStatus(*task_entry, | ||
rpc::TaskStatus::PENDING_ARGS_AVAIL, | ||
/* state_update */ std::nullopt, | ||
/* include_task_info */ true, | ||
task_entry->spec.AttemptNumber() + 1); | ||
retry_task_callback_(task_entry->spec, object_recovery, delay_ms); | ||
/* include_task_info */ true); | ||
|
||
if (!object_recovery) { | ||
// Retry after a delay to emulate the existing Raylet reconstruction | ||
// behaviour. TODO(ekl) backoff exponentially. | ||
Comment on lines
+1039
to
+1040
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this TODO still relevant? Can we figure it out and either create a ticket/issue or remove it? |
||
RAY_LOG(INFO) << "Will resubmit task after a " << delay_ms | ||
<< "ms delay: " << spec.DebugString(); | ||
absl::MutexLock lock(&mu_); | ||
TaskToRetry task_to_retry{current_time_ms() + delay_ms, spec}; | ||
to_resubmit_.push(std::move(task_to_retry)); | ||
return; | ||
} | ||
|
||
retry_task_callback_(spec); | ||
} | ||
|
||
void TaskManager::FailPendingTask(const TaskID &task_id, | ||
|
@@ -1585,5 +1599,17 @@ ObjectID TaskManager::TaskGeneratorId(const TaskID &task_id) const { | |
return it->second.spec.ReturnId(0); | ||
} | ||
|
||
std::vector<TaskSpecification> TaskManager::PopTasksToRetry() { | ||
absl::MutexLock lock(&mu_); | ||
std::vector<TaskSpecification> tasks_to_retry; | ||
const auto current_time = current_time_ms(); | ||
while (!to_resubmit_.empty() && current_time > to_resubmit_.top().execution_time_ms) { | ||
tasks_to_retry.emplace_back(to_resubmit_.top().task_spec); | ||
to_resubmit_.pop(); | ||
} | ||
|
||
return tasks_to_retry; | ||
} | ||
|
||
} // namespace core | ||
} // namespace ray |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,8 @@ | |
|
||
#pragma once | ||
|
||
#include <deque> | ||
#include <queue> | ||
#include <string> | ||
#include <tuple> | ||
#include <unordered_map> | ||
|
@@ -48,8 +50,7 @@ class TaskResubmissionInterface { | |
using TaskStatusCounter = CounterMap<std::tuple<std::string, rpc::TaskStatus, bool>>; | ||
using PutInLocalPlasmaCallback = | ||
std::function<void(const RayObject &object, const ObjectID &object_id)>; | ||
using RetryTaskCallback = | ||
std::function<void(TaskSpecification &spec, bool object_recovery, uint32_t delay_ms)>; | ||
using RetryTaskCallback = std::function<void(TaskSpecification &spec)>; | ||
using ReconstructObjectCallback = std::function<void(const ObjectID &object_id)>; | ||
using PushErrorCallback = std::function<Status(const JobID &job_id, | ||
const std::string &type, | ||
|
@@ -172,6 +173,29 @@ class ObjectRefStream { | |
int64_t total_num_object_consumed_{}; | ||
}; | ||
|
||
/// Represents a task that needs to be retried at a specific time. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. copy from core_worker.h |
||
struct TaskToRetry { | ||
/// Time when the task should be retried. | ||
int64_t execution_time_ms{}; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would remove the comment and change the name of the variable to something that describes its purpose more accurately. |
||
|
||
/// The details of the task. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider removing comment. It doesn't add anything. |
||
TaskSpecification task_spec; | ||
}; | ||
|
||
/// Sorts TaskToRetry in descending order of the execution time. | ||
/// Priority queue naturally sorts elements in descending order, | ||
/// in order to have the tasks ordered by execution time in | ||
/// ascending order we use a comparator that sorts elements in | ||
/// descending order. Per docs "Priority queues are a type of container | ||
/// adaptors, specifically designed such that its first element is always | ||
/// the greatest of the elements it contains". | ||
Comment on lines
+185
to
+191
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since you're refactoring, do you think we can make this comment more concise? |
||
class TaskToRetryDescComparator { | ||
public: | ||
bool operator()(const TaskToRetry &left, const TaskToRetry &right) { | ||
return left.execution_time_ms > right.execution_time_ms; | ||
} | ||
}; | ||
|
||
class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterface { | ||
public: | ||
TaskManager(CoreWorkerMemoryStore &in_memory_store, | ||
|
@@ -578,6 +602,11 @@ class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterfa | |
/// Record OCL metrics. | ||
void RecordMetrics(); | ||
|
||
/// Returns a vector of task specifications that are ready to be retried. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. new function |
||
/// This function removes tasks from the to_resubmit_ queue whose execution time | ||
/// has passed and returns them. | ||
std::vector<TaskSpecification> PopTasksToRetry(); | ||
|
||
private: | ||
struct TaskEntry { | ||
TaskEntry(TaskSpecification spec_arg, | ||
|
@@ -865,6 +894,10 @@ class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterfa | |
/// error). | ||
worker::TaskEventBuffer &task_event_buffer_; | ||
|
||
/// Queue of tasks to retry, ordered by their execution time. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. copy from core_worker.h |
||
std::priority_queue<TaskToRetry, std::deque<TaskToRetry>, TaskToRetryDescComparator> | ||
to_resubmit_ ABSL_GUARDED_BY(mu_); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're using resubmit and retry interchangeably. E.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use a separate mutex to guard this priority_queue since it's updated independently of any other state in task_manager? |
||
|
||
friend class TaskManagerTest; | ||
}; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not from your PR, but from before. Binding
this
to the scope of a lambda is also an abstraction leak too. Can we try to clean this up?