@@ -1469,16 +1469,15 @@ void NodeManager::HandleTaskBlocked(const std::shared_ptr<LocalClientConnection>
       // Get the CPU resources required by the running task.
       const auto required_resources = task.GetTaskSpecification().GetRequiredResources();
       double required_cpus = required_resources.GetNumCpus();
-      std::unordered_map<std::string, double> cpu_resources;
-      if (required_cpus > 0) {
-        cpu_resources[kCPU_ResourceLabel] = required_cpus;
-      }
+      const std::unordered_map<std::string, double> cpu_resources = {
+          {kCPU_ResourceLabel, required_cpus}};
 
       // Release the CPU resources.
       auto const cpu_resource_ids = worker->ReleaseTaskCpuResources();
       local_available_resources_.Release(cpu_resource_ids);
-      cluster_resource_map_[gcs_client_->client_table().GetLocalClientId()].Release(
-          ResourceSet(cpu_resources));
+      RAY_CHECK(
+          cluster_resource_map_[gcs_client_->client_table().GetLocalClientId()].Release(
+              ResourceSet(cpu_resources)));
       worker->MarkBlocked();
 
       // Try dispatching tasks since we may have released some resources.
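A note on the first change in this hunk: replacing the conditional insertion with a braced initializer means `cpu_resources` now always contains a CPU entry, even when the task requests zero CPUs, whereas the old code left the map empty in that case. The short sketch below shows that this is the only observable difference in how the map is built; it assumes, purely for illustration, that `kCPU_ResourceLabel` is the string "CPU", and it presumes the surrounding Release/Acquire calls tolerate zero-quantity entries.

// Sketch comparing the old and new ways this diff builds the CPU-resource map.
// kCPU_ResourceLabel is assumed here to be the string "CPU" purely for illustration.
#include <cassert>
#include <string>
#include <unordered_map>

int main() {
  const std::string kCPU_ResourceLabel = "CPU";
  double required_cpus = 0.0;  // A task that requests no CPUs.

  // Old construction: the entry is only present when required_cpus > 0.
  std::unordered_map<std::string, double> old_style;
  if (required_cpus > 0) {
    old_style[kCPU_ResourceLabel] = required_cpus;
  }

  // New construction: the entry is always present, possibly with quantity 0.
  const std::unordered_map<std::string, double> new_style = {
      {kCPU_ResourceLabel, required_cpus}};

  assert(old_style.count(kCPU_ResourceLabel) == 0);
  assert(new_style.count(kCPU_ResourceLabel) == 1);
  assert(new_style.at(kCPU_ResourceLabel) == 0.0);
  return 0;
}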
@@ -1522,11 +1521,9 @@ void NodeManager::HandleTaskUnblocked(
       // Get the CPU resources required by the running task.
       const auto required_resources = task.GetTaskSpecification().GetRequiredResources();
       double required_cpus = required_resources.GetNumCpus();
-      std::unordered_map<std::string, double> cpu_resources_map;
-      if (required_cpus > 0) {
-        cpu_resources_map[kCPU_ResourceLabel] = required_cpus;
-      }
-      const ResourceSet cpu_resources(cpu_resources_map);
+      const ResourceSet cpu_resources(
+          std::unordered_map<std::string, double>({{kCPU_ResourceLabel, required_cpus}}));
+
       // Check if we can reacquire the CPU resources.
       bool oversubscribed = !local_available_resources_.Contains(cpu_resources);
 
@@ -1536,8 +1533,9 @@ void NodeManager::HandleTaskUnblocked(
         // reacquire here may be different from the ones that the task started with.
         auto const resource_ids = local_available_resources_.Acquire(cpu_resources);
         worker->AcquireTaskCpuResources(resource_ids);
-        cluster_resource_map_[gcs_client_->client_table().GetLocalClientId()].Acquire(
-            cpu_resources);
+        RAY_CHECK(
+            cluster_resource_map_[gcs_client_->client_table().GetLocalClientId()].Acquire(
+                cpu_resources));
       } else {
         // In this case, we simply don't reacquire the CPU resources for the worker.
         // The worker can keep running and when the task finishes, it will simply
@@ -1629,7 +1627,7 @@ bool NodeManager::AssignTask(const Task &task) {
   auto acquired_resources =
       local_available_resources_.Acquire(spec.GetRequiredResources());
   const auto &my_client_id = gcs_client_->client_table().GetLocalClientId();
-  cluster_resource_map_[my_client_id].Acquire(spec.GetRequiredResources());
+  RAY_CHECK(cluster_resource_map_[my_client_id].Acquire(spec.GetRequiredResources()));
 
   if (spec.IsActorCreationTask()) {
     // Check that we are not placing an actor creation task on a node with 0 CPUs.
@@ -1743,8 +1741,8 @@ void NodeManager::FinishAssignedTask(Worker &worker) {
   // Release task's resources. The worker's lifetime resources are still held.
   auto const &task_resources = worker.GetTaskResourceIds();
   local_available_resources_.Release(task_resources);
-  cluster_resource_map_[gcs_client_->client_table().GetLocalClientId()].Release(
-      task_resources.ToResourceSet());
+  RAY_CHECK(cluster_resource_map_[gcs_client_->client_table().GetLocalClientId()].Release(
+      task_resources.ToResourceSet()));
   worker.ResetTaskResourceIds();
 
   // If this was an actor or actor creation task, handle the actor's new state.
@@ -2036,9 +2034,6 @@ void NodeManager::ForwardTaskOrResubmit(const Task &task,
 
     RAY_LOG(INFO) << "Failed to forward task " << task_id << " to node manager "
                   << node_manager_id;
-
-    // TODO(romilb): We should probably revert the load subtraction from
-    // SchedulingPolicy::Schedule()
     // Mark the failed task as pending to let other raylets know that we still
     // have the task. TaskDependencyManager::TaskPending() is assumed to be
     // idempotent.
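The common thread across these hunks is that `Acquire` and `Release` on the `cluster_resource_map_` entries return a success flag, and the diff now wraps those calls in `RAY_CHECK` so that a failed resource transaction aborts the process immediately instead of leaving the cluster's resource accounting silently inconsistent. Below is a minimal, self-contained sketch of that assert-on-the-return-value pattern; `ToyResourceSet` and the `CHECK` macro are simplified stand-ins invented for illustration and are not Ray's `ResourceSet` or `RAY_CHECK`.

// A minimal, self-contained sketch of the pattern introduced in this diff: treat the
// boolean result of resource Acquire/Release as an invariant and abort when it fails.
// ToyResourceSet and the CHECK macro below are simplified stand-ins for illustration
// only; they are not Ray's ResourceSet or RAY_CHECK.
#include <cstdlib>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

#define CHECK(condition)                                        \
  do {                                                          \
    if (!(condition)) {                                         \
      std::cerr << "Check failed: " #condition << std::endl;    \
      std::abort();                                             \
    }                                                           \
  } while (0)

class ToyResourceSet {
 public:
  explicit ToyResourceSet(std::unordered_map<std::string, double> amounts)
      : amounts_(std::move(amounts)) {}

  // Subtracts `other` from this set. Returns false (instead of going negative)
  // when the request cannot be satisfied, so callers can assert on the result.
  bool Acquire(const ToyResourceSet &other) {
    for (const auto &entry : other.amounts_) {
      if (amounts_[entry.first] < entry.second) {
        return false;
      }
    }
    for (const auto &entry : other.amounts_) {
      amounts_[entry.first] -= entry.second;
    }
    return true;
  }

  // Adds `other` back to this set. Always succeeds in this toy version, but the
  // boolean return keeps the call sites symmetric with Acquire.
  bool Release(const ToyResourceSet &other) {
    for (const auto &entry : other.amounts_) {
      amounts_[entry.first] += entry.second;
    }
    return true;
  }

 private:
  std::unordered_map<std::string, double> amounts_;
};

int main() {
  ToyResourceSet node_resources({{"CPU", 4.0}});
  const ToyResourceSet task_request({{"CPU", 2.0}});

  // Mirrors the diff: assert on the return value so a bookkeeping bug fails fast
  // rather than leaving the availability map silently inconsistent.
  CHECK(node_resources.Acquire(task_request));
  CHECK(node_resources.Release(task_request));

  std::cout << "resource accounting stayed consistent" << std::endl;
  return 0;
}

As far as the diff shows, the intent is that a failed update to the cluster-wide resource view is a programming error worth surfacing at the call site, rather than later as confusing scheduling behavior.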