Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 7 additions & 97 deletions cudax/include/cuda/experimental/__stf/graph/graph_task.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -248,52 +248,12 @@ public:
return *this;
}

void populate_deps_scheduling_info() const
{
// Error checking copied from acquire() in acquire_release()

int index = 0;
const auto& deps = get_task_deps();
for (const auto& dep : deps)
{
if (!dep.get_data().is_initialized())
{
fprintf(stderr, "Error: dependency number %d is an uninitialized logical data.\n", index);
abort();
}
dep.set_symbol(dep.get_data().get_symbol());
dep.set_data_footprint(dep.get_data().get_data_interface().data_footprint());
index++;
}
}

/**
* @brief Use the scheduler to assign a device to this task
*
* @return returns true if the task's time needs to be recorded
* @brief Determine if the task's time needs to be recorded (for DOT visualization)
*/
bool schedule_task()
bool should_record_time()
{
auto& dot = *ctx.get_dot();
auto& statistics = reserved::task_statistics::instance();

const bool is_auto = get_exec_place().affine_data_place() == data_place::device_auto();
bool calibrate = false;

// We need to know the data footprint if scheduling or calibrating tasks
if (is_auto || statistics.is_calibrating())
{
populate_deps_scheduling_info();
}

if (is_auto)
{
auto [place, needs_calibration] = ctx.schedule_task(*this);
set_exec_place(place);
calibrate = needs_calibration;
}

return dot.is_timing() || (calibrate && statistics.is_calibrating());
return ctx.get_dot()->is_timing();
}

// Only valid if we have defined a capture stream
Expand All @@ -312,48 +272,23 @@ public:
template <typename Fun>
void operator->*(Fun&& f)
{
auto& dot = *ctx.get_dot();
auto& statistics = reserved::task_statistics::instance();

// cudaEvent_t start_event, end_event;
auto& dot = *ctx.get_dot();

bool record_time = schedule_task();

if (statistics.is_calibrating_to_file())
{
record_time = true;
}
bool record_time = should_record_time();

start();

if (record_time)
{
// Events must be created here to avoid issues with multi-gpu
// cuda_safe_call(cudaEventCreate(&start_event));
// cuda_safe_call(cudaEventCreate(&end_event));
// cuda_safe_call(cudaEventRecord(start_event));
}

SCOPE(exit)
{
end_uncleared();
if (record_time)
{
// cuda_safe_call(cudaEventRecord(end_event));
// cuda_safe_call(cudaEventSynchronize(end_event));

float milliseconds = 0;
// cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event));

if (dot.is_tracing())
{
dot.template add_vertex_timing<task>(*this, milliseconds);
}

if (statistics.is_calibrating())
{
statistics.log_task_time(*this, milliseconds);
}
}
clear();
};
Expand Down Expand Up @@ -569,48 +504,23 @@ public:
template <typename Fun>
void operator->*(Fun&& f)
{
auto& dot = *ctx.get_dot();
auto& statistics = reserved::task_statistics::instance();

// cudaEvent_t start_event, end_event;

bool record_time = schedule_task();
auto& dot = *ctx.get_dot();

if (statistics.is_calibrating_to_file())
{
record_time = true;
}
bool record_time = should_record_time();

start();

if (record_time)
{
// Events must be created here to avoid issues with multi-gpu
// cuda_safe_call(cudaEventCreate(&start_event));
// cuda_safe_call(cudaEventCreate(&end_event));
// cuda_safe_call(cudaEventRecord(start_event));
}

SCOPE(exit)
{
end_uncleared();
if (record_time)
{
// cuda_safe_call(cudaEventRecord(end_event));
// cuda_safe_call(cudaEventSynchronize(end_event));

float milliseconds = 0;
// cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event));

if (dot.is_tracing())
{
dot.template add_vertex_timing<task>(*this, milliseconds);
}

if (statistics.is_calibrating())
{
statistics.log_task_time(*this, milliseconds);
}
}
clear();
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ namespace cuda::experimental::stf
* generated during the acquisition of dependencies. This list represents the
* prerequisites for the task to start execution.
*
* @note The function `EXPECT`s the task to be in the setup phase and the execution place
* not to be `exec_place::device_auto()`.
* @note The function `EXPECT`s the task to be in the setup phase.
* @note Dependencies are sorted by logical data addresses to prevent deadlocks.
* @note For tasks with multiple dependencies on the same logical data, only one
* instance of the data is used, and its access mode is determined by combining
Expand All @@ -61,7 +60,6 @@ inline event_list task::acquire(backend_ctx_untyped& ctx)
EXPECT(get_task_phase() == task::phase::setup);

const auto eplace = get_exec_place();
_CCCL_ASSERT(eplace != exec_place::device_auto(), "");
// If there are any extra dependencies to fulfill
auto result = get_input_events();

Expand Down
30 changes: 2 additions & 28 deletions cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@
#include <cuda/experimental/__stf/internal/execution_policy.cuh> // backend_ctx<T>::launch() uses execution_policy
#include <cuda/experimental/__stf/internal/interpreted_execution_policy.cuh>
#include <cuda/experimental/__stf/internal/machine.cuh> // backend_ctx_untyped::impl uses machine
#include <cuda/experimental/__stf/internal/reorderer.cuh> // backend_ctx_untyped::impl uses reorderer
#include <cuda/experimental/__stf/internal/repeat.cuh>
#include <cuda/experimental/__stf/internal/scheduler.cuh> // backend_ctx_untyped::impl uses scheduler
#include <cuda/experimental/__stf/internal/slice.cuh> // backend_ctx<T> uses shape_of
#include <cuda/experimental/__stf/internal/thread_hierarchy.cuh>
#include <cuda/experimental/__stf/internal/void_interface.cuh>
Expand Down Expand Up @@ -114,9 +112,7 @@ protected:
friend class backend_ctx_untyped;

impl(async_resources_handle async_resources = async_resources_handle())
: auto_scheduler(reserved::scheduler::make(getenv("CUDASTF_SCHEDULE")))
, auto_reorderer(reserved::reorderer::make(getenv("CUDASTF_TASK_ORDER")))
, async_resources(async_resources ? mv(async_resources) : async_resources_handle())
: async_resources(async_resources ? mv(async_resources) : async_resources_handle())
{
// Forces init
cudaError_t ret = cudaFree(0);
Expand Down Expand Up @@ -320,7 +316,7 @@ protected:
void cleanup()
{
attached_allocators.clear();
// Leave custom_allocator, auto_scheduler, and auto_reorderer as they were.
// Leave custom_allocator as it was.
}

/* Current context-wide allocator (same as default_allocator unless it is changed) */
Expand All @@ -333,8 +329,6 @@ protected:
::std::vector<block_allocator_untyped> attached_allocators;
reserved::composite_slice_cache composite_cache;

::std::unique_ptr<reserved::scheduler> auto_scheduler;
::std::unique_ptr<reserved::reorderer> auto_reorderer;
// Stats-related stuff
::std::unordered_map<::std::pair<int, int>,
::std::pair<size_t, size_t>,
Expand Down Expand Up @@ -664,31 +658,11 @@ public:
return pimpl->async_resources;
}

bool reordering_tasks() const
{
assert(pimpl);
return pimpl->auto_reorderer != nullptr;
}

auto& get_composite_cache()
{
return pimpl->composite_cache;
}

::std::pair<exec_place, bool> schedule_task(const task& t) const
{
assert(pimpl);
assert(pimpl->auto_scheduler);
return pimpl->auto_scheduler->schedule_task(t);
}

void reorder_tasks(::std::vector<int>& tasks, ::std::unordered_map<int, reserved::reorderer_payload>& task_map)
{
assert(pimpl);
assert(pimpl->auto_reorderer);
pimpl->auto_reorderer->reorder_tasks(tasks, task_map);
}

void increment_task_count()
{
++pimpl->total_task_cnt;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@

#include <cuda/experimental/__stf/internal/backend_ctx.cuh>
#include <cuda/experimental/__stf/internal/task_dep.cuh>
#include <cuda/experimental/__stf/internal/task_statistics.cuh>

namespace cuda::experimental::stf
{
Expand Down Expand Up @@ -353,9 +352,7 @@ public:
t.set_symbol(symbol);
}

// Do we need to measure the duration of the kernel(s) ?
auto& statistics = reserved::task_statistics::instance();
record_time = t.schedule_task() || statistics.is_calibrating_to_file();
record_time = t.should_record_time();
record_time_device = -1;

t.start();
Expand Down Expand Up @@ -401,12 +398,6 @@ public:
{
dot.template add_vertex_timing<typename Ctx::task_type>(t, milliseconds, record_time_device);
}

auto& statistics = reserved::task_statistics::instance();
if (statistics.is_calibrating())
{
statistics.log_task_time(t, milliseconds);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
#include <cuda/experimental/__stf/internal/backend_ctx.cuh>
#include <cuda/experimental/__stf/internal/ctx_resource.cuh>
#include <cuda/experimental/__stf/internal/task_dep.cuh>
#include <cuda/experimental/__stf/internal/task_statistics.cuh>
#include <cuda/experimental/__stf/internal/thread_hierarchy.cuh>
#include <cuda/experimental/__stf/internal/void_interface.cuh>

Expand Down Expand Up @@ -114,8 +113,7 @@ public:
template <typename Fun>
void operator->*(Fun&& f)
{
auto& dot = *ctx.get_dot();
auto& statistics = reserved::task_statistics::instance();
auto& dot = *ctx.get_dot();

auto t = ctx.task(exec_place::host());
t.add_deps(deps);
Expand All @@ -125,7 +123,7 @@ public:
}

cudaEvent_t start_event, end_event;
const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file();
const bool record_time = t.should_record_time();

t.start();

Expand Down Expand Up @@ -156,11 +154,6 @@ public:
{
dot.template add_vertex_timing<typename Ctx::task_type>(t, milliseconds, -1);
}

if (statistics.is_calibrating())
{
statistics.log_task_time(t, milliseconds);
}
}
}
t.clear();
Expand Down
25 changes: 3 additions & 22 deletions cudax/include/cuda/experimental/__stf/internal/launch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
#include <cuda/experimental/__stf/internal/execution_policy.cuh> // launch_impl() uses execution_policy
#include <cuda/experimental/__stf/internal/interpreted_execution_policy_impl.cuh>
#include <cuda/experimental/__stf/internal/task_dep.cuh>
#include <cuda/experimental/__stf/internal/task_statistics.cuh>
#include <cuda/experimental/__stf/internal/thread_hierarchy.cuh>
#include <cuda/experimental/__stf/utility/scope_guard.cuh> // graph_launch_impl() uses SCOPE

Expand Down Expand Up @@ -322,21 +321,14 @@ public:

EXPECT(e_place != exec_place::host(), "Attempt to run a launch on the host.");

auto& dot = *ctx.get_dot();
auto& statistics = reserved::task_statistics::instance();
auto& dot = *ctx.get_dot();

auto t = ctx.task(e_place);

_CCCL_ASSERT(e_place.affine_data_place() == t.get_affine_data_place(), "Affine data places must match");

/*
* If we have a grid (including 1-element grids), the implicit affine partitioner is the blocked_partition.
*
* An explicit composite data place is required per data dependency to customize this behaviour.
*/
if (e_place.size() > 1)
{
// Create a composite data place defined by the grid of places + the partitioning function
t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid()));
}

Expand All @@ -346,14 +338,8 @@ public:
t.set_symbol(symbol);
}

bool record_time = t.schedule_task();
// Execution place may have changed during scheduling task
e_place = t.get_exec_place();

if (statistics.is_calibrating_to_file())
{
record_time = true;
}
bool record_time = t.should_record_time();
e_place = t.get_exec_place();

nvtx_range nr(t.get_symbol().c_str());
t.start();
Expand Down Expand Up @@ -415,11 +401,6 @@ public:
{
dot.template add_vertex_timing<stream_task<>>(t, milliseconds, device);
}

if (statistics.is_calibrating())
{
statistics.log_task_time(t, milliseconds);
}
}
}

Expand Down
Loading