@@ -546,15 +546,21 @@ bool LocalTaskManager::PoppedWorkerHandler(
546546 not_detached_with_owner_failed = true ;
547547 }
548548
549- if (!canceled) {
550- const auto &required_resource =
551- task.GetTaskSpecification ().GetRequiredResources ().GetResourceMap ();
552- for (auto &entry : required_resource) {
553- // This is to make sure PG resource is not deleted during popping worker
554- // unless the lease request is cancelled.
555- RAY_CHECK (cluster_resource_scheduler_->GetLocalResourceManager ().ResourcesExist (
556- scheduling::ResourceID (entry.first )))
557- << entry.first ;
549+ const auto &required_resource =
550+ task.GetTaskSpecification ().GetRequiredResources ().GetResourceMap ();
551+ for (auto &entry : required_resource) {
552+ if (!cluster_resource_scheduler_->GetLocalResourceManager ().ResourcesExist (
553+ scheduling::ResourceID (entry.first ))) {
554+ RAY_CHECK (task.GetTaskSpecification ().PlacementGroupBundleId ().first !=
555+ PlacementGroupID::Nil ());
556+ RAY_LOG (DEBUG) << " The placement group: "
557+ << task.GetTaskSpecification ().PlacementGroupBundleId ().first
558+ << " was removed when poping workers for task: " << task_id
559+ << " , will cancel the task." ;
560+ CancelTask (
561+ task_id,
562+ rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_PLACEMENT_GROUP_REMOVED);
563+ canceled = true ;
558564 }
559565 }
560566
@@ -849,7 +855,7 @@ void LocalTaskManager::ReleaseTaskArgs(const TaskID &task_id) {
849855}
850856
851857namespace {
852- void ReplyCancelled (const std::shared_ptr<internal::Work> &work,
858+ void ReplyCancelled (std::shared_ptr<internal::Work> &work,
853859 rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type,
854860 const std::string &scheduling_failure_message) {
855861 auto reply = work->reply ;
@@ -861,67 +867,55 @@ void ReplyCancelled(const std::shared_ptr<internal::Work> &work,
861867}
862868} // namespace
863869
864- bool LocalTaskManager::CancelTasks (
865- std::function< bool ( const std::shared_ptr<internal::Work> &)> predicate ,
870+ bool LocalTaskManager::CancelTask (
871+ const TaskID &task_id ,
866872 rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type,
867873 const std::string &scheduling_failure_message) {
868- bool tasks_cancelled = false ;
869-
870- ray::erase_if<SchedulingClass, std::shared_ptr<internal::Work>>(
871- tasks_to_dispatch_, [&](const std::shared_ptr<internal::Work> &work) {
872- if (predicate (work)) {
873- const TaskID task_id = work->task .GetTaskSpecification ().TaskId ();
874- RAY_LOG (DEBUG) << " Canceling task " << task_id << " from dispatch queue." ;
875- ReplyCancelled (work, failure_type, scheduling_failure_message);
876- if (work->GetState () == internal::WorkStatus::WAITING_FOR_WORKER) {
877- // We've already acquired resources so we need to release them.
878- cluster_resource_scheduler_->GetLocalResourceManager ().ReleaseWorkerResources (
879- work->allocated_instances );
880- // Release pinned task args.
881- ReleaseTaskArgs (task_id);
882- }
883- if (!work->task .GetTaskSpecification ().GetDependencies ().empty ()) {
884- task_dependency_manager_.RemoveTaskDependencies (
885- work->task .GetTaskSpecification ().TaskId ());
886- }
887- RemoveFromRunningTasksIfExists (work->task );
888- work->SetStateCancelled ();
889- tasks_cancelled = true ;
890- return true ;
891- } else {
892- return false ;
874+ for (auto shapes_it = tasks_to_dispatch_.begin (); shapes_it != tasks_to_dispatch_.end ();
875+ shapes_it++) {
876+ auto &work_queue = shapes_it->second ;
877+ for (auto work_it = work_queue.begin (); work_it != work_queue.end (); work_it++) {
878+ const auto &task = (*work_it)->task ;
879+ if (task.GetTaskSpecification ().TaskId () == task_id) {
880+ RAY_LOG (DEBUG) << " Canceling task " << task_id << " from dispatch queue." ;
881+ ReplyCancelled (*work_it, failure_type, scheduling_failure_message);
882+ if ((*work_it)->GetState () == internal::WorkStatus::WAITING_FOR_WORKER) {
883+ // We've already acquired resources so we need to release them.
884+ cluster_resource_scheduler_->GetLocalResourceManager ().ReleaseWorkerResources (
885+ (*work_it)->allocated_instances );
886+ // Release pinned task args.
887+ ReleaseTaskArgs (task_id);
893888 }
894- });
895-
896- ray::erase_if<std::shared_ptr<internal::Work>>(
897- waiting_task_queue_, [&](const std::shared_ptr<internal::Work> &work) {
898- if (predicate (work)) {
899- ReplyCancelled (work, failure_type, scheduling_failure_message);
900- if (!work->task .GetTaskSpecification ().GetDependencies ().empty ()) {
901- task_dependency_manager_.RemoveTaskDependencies (
902- work->task .GetTaskSpecification ().TaskId ());
903- }
904- waiting_tasks_index_.erase (work->task .GetTaskSpecification ().TaskId ());
905- tasks_cancelled = true ;
906- return true ;
907- } else {
908- return false ;
889+ if (!task.GetTaskSpecification ().GetDependencies ().empty ()) {
890+ task_dependency_manager_.RemoveTaskDependencies (
891+ task.GetTaskSpecification ().TaskId ());
892+ }
893+ RemoveFromRunningTasksIfExists (task);
894+ (*work_it)->SetStateCancelled ();
895+ work_queue.erase (work_it);
896+ if (work_queue.empty ()) {
897+ tasks_to_dispatch_.erase (shapes_it);
909898 }
910- });
899+ return true ;
900+ }
901+ }
902+ }
911903
912- return tasks_cancelled;
913- }
904+ auto iter = waiting_tasks_index_.find (task_id);
905+ if (iter != waiting_tasks_index_.end ()) {
906+ const auto &task = (*iter->second )->task ;
907+ ReplyCancelled (*iter->second , failure_type, scheduling_failure_message);
908+ if (!task.GetTaskSpecification ().GetDependencies ().empty ()) {
909+ task_dependency_manager_.RemoveTaskDependencies (
910+ task.GetTaskSpecification ().TaskId ());
911+ }
912+ waiting_task_queue_.erase (iter->second );
913+ waiting_tasks_index_.erase (iter);
914914
915- bool LocalTaskManager::CancelTask (
916- const TaskID &task_id,
917- rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type,
918- const std::string &scheduling_failure_message) {
919- return CancelTasks (
920- [task_id](const std::shared_ptr<internal::Work> &work) {
921- return work->task .GetTaskSpecification ().TaskId () == task_id;
922- },
923- failure_type,
924- scheduling_failure_message);
915+ return true ;
916+ }
917+
918+ return false ;
925919}
926920
927921bool LocalTaskManager::AnyPendingTasksForResourceAcquisition (
0 commit comments