Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 7 additions & 97 deletions cudax/include/cuda/experimental/__stf/graph/graph_task.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -248,52 +248,12 @@ public:
return *this;
}

void populate_deps_scheduling_info() const
{
// Error checking copied from acquire() in acquire_release()

int index = 0;
const auto& deps = get_task_deps();
for (const auto& dep : deps)
{
if (!dep.get_data().is_initialized())
{
fprintf(stderr, "Error: dependency number %d is an uninitialized logical data.\n", index);
abort();
}
dep.set_symbol(dep.get_data().get_symbol());
dep.set_data_footprint(dep.get_data().get_data_interface().data_footprint());
index++;
}
}

/**
* @brief Use the scheduler to assign a device to this task
*
* @return returns true if the task's time needs to be recorded
* @brief Determine if the task's time needs to be recorded (for DOT visualization)
*/
bool schedule_task()
bool should_record_time()
{
auto& dot = *ctx.get_dot();
auto& statistics = reserved::task_statistics::instance();

const bool is_auto = get_exec_place().affine_data_place() == data_place::device_auto();
bool calibrate = false;

// We need to know the data footprint if scheduling or calibrating tasks
if (is_auto || statistics.is_calibrating())
{
populate_deps_scheduling_info();
}

if (is_auto)
{
auto [place, needs_calibration] = ctx.schedule_task(*this);
set_exec_place(place);
calibrate = needs_calibration;
}

return dot.is_timing() || (calibrate && statistics.is_calibrating());
return ctx.get_dot()->is_timing();
}

// Only valid if we have defined a capture stream
Expand All @@ -312,48 +272,23 @@ public:
template <typename Fun>
void operator->*(Fun&& f)
{
auto& dot = *ctx.get_dot();
auto& statistics = reserved::task_statistics::instance();

// cudaEvent_t start_event, end_event;
auto& dot = *ctx.get_dot();

bool record_time = schedule_task();

if (statistics.is_calibrating_to_file())
{
record_time = true;
}
bool record_time = should_record_time();

start();

if (record_time)
{
// Events must be created here to avoid issues with multi-gpu
// cuda_safe_call(cudaEventCreate(&start_event));
// cuda_safe_call(cudaEventCreate(&end_event));
// cuda_safe_call(cudaEventRecord(start_event));
}

SCOPE(exit)
{
end_uncleared();
if (record_time)
{
// cuda_safe_call(cudaEventRecord(end_event));
// cuda_safe_call(cudaEventSynchronize(end_event));

float milliseconds = 0;
// cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event));

if (dot.is_tracing())
{
dot.template add_vertex_timing<task>(*this, milliseconds);
}

if (statistics.is_calibrating())
{
statistics.log_task_time(*this, milliseconds);
}
}
clear();
};
Expand Down Expand Up @@ -569,48 +504,23 @@ public:
template <typename Fun>
void operator->*(Fun&& f)
{
auto& dot = *ctx.get_dot();
auto& statistics = reserved::task_statistics::instance();

// cudaEvent_t start_event, end_event;

bool record_time = schedule_task();
auto& dot = *ctx.get_dot();

if (statistics.is_calibrating_to_file())
{
record_time = true;
}
bool record_time = should_record_time();

start();

if (record_time)
{
// Events must be created here to avoid issues with multi-gpu
// cuda_safe_call(cudaEventCreate(&start_event));
// cuda_safe_call(cudaEventCreate(&end_event));
// cuda_safe_call(cudaEventRecord(start_event));
}

SCOPE(exit)
{
end_uncleared();
if (record_time)
{
// cuda_safe_call(cudaEventRecord(end_event));
// cuda_safe_call(cudaEventSynchronize(end_event));

float milliseconds = 0;
// cuda_safe_call(cudaEventElapsedTime(&milliseconds, start_event, end_event));

if (dot.is_tracing())
{
dot.template add_vertex_timing<task>(*this, milliseconds);
}

if (statistics.is_calibrating())
{
statistics.log_task_time(*this, milliseconds);
}
}
clear();
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ namespace cuda::experimental::stf
* generated during the acquisition of dependencies. This list represents the
* prerequisites for the task to start execution.
*
* @note The function `EXPECT`s the task to be in the setup phase and the execution place
* not to be `exec_place::device_auto()`.
* @note The function `EXPECT`s the task to be in the setup phase.
* @note Dependencies are sorted by logical data addresses to prevent deadlocks.
* @note For tasks with multiple dependencies on the same logical data, only one
* instance of the data is used, and its access mode is determined by combining
Expand All @@ -61,7 +60,6 @@ inline event_list task::acquire(backend_ctx_untyped& ctx)
EXPECT(get_task_phase() == task::phase::setup);

const auto eplace = get_exec_place();
_CCCL_ASSERT(eplace != exec_place::device_auto(), "");
// If there are any extra dependencies to fulfill
auto result = get_input_events();

Expand Down
30 changes: 2 additions & 28 deletions cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@
#include <cuda/experimental/__stf/internal/execution_policy.cuh> // backend_ctx<T>::launch() uses execution_policy
#include <cuda/experimental/__stf/internal/interpreted_execution_policy.cuh>
#include <cuda/experimental/__stf/internal/machine.cuh> // backend_ctx_untyped::impl uses machine
#include <cuda/experimental/__stf/internal/reorderer.cuh> // backend_ctx_untyped::impl uses reorderer
#include <cuda/experimental/__stf/internal/repeat.cuh>
#include <cuda/experimental/__stf/internal/scheduler.cuh> // backend_ctx_untyped::impl uses scheduler
#include <cuda/experimental/__stf/internal/slice.cuh> // backend_ctx<T> uses shape_of
#include <cuda/experimental/__stf/internal/thread_hierarchy.cuh>
#include <cuda/experimental/__stf/internal/void_interface.cuh>
Expand Down Expand Up @@ -114,9 +112,7 @@ protected:
friend class backend_ctx_untyped;

impl(async_resources_handle async_resources = async_resources_handle())
: auto_scheduler(reserved::scheduler::make(getenv("CUDASTF_SCHEDULE")))
, auto_reorderer(reserved::reorderer::make(getenv("CUDASTF_TASK_ORDER")))
, async_resources(async_resources ? mv(async_resources) : async_resources_handle())
: async_resources(async_resources ? mv(async_resources) : async_resources_handle())
{
// Forces init
cudaError_t ret = cudaFree(0);
Expand Down Expand Up @@ -320,7 +316,7 @@ protected:
void cleanup()
{
attached_allocators.clear();
// Leave custom_allocator, auto_scheduler, and auto_reorderer as they were.
// Leave custom_allocator as it was.
}

/* Current context-wide allocator (same as default_allocator unless it is changed) */
Expand All @@ -333,8 +329,6 @@ protected:
::std::vector<block_allocator_untyped> attached_allocators;
reserved::composite_slice_cache composite_cache;

::std::unique_ptr<reserved::scheduler> auto_scheduler;
::std::unique_ptr<reserved::reorderer> auto_reorderer;
// Stats-related stuff
::std::unordered_map<::std::pair<int, int>,
::std::pair<size_t, size_t>,
Expand Down Expand Up @@ -664,31 +658,11 @@ public:
return pimpl->async_resources;
}

bool reordering_tasks() const
{
assert(pimpl);
return pimpl->auto_reorderer != nullptr;
}

auto& get_composite_cache()
{
return pimpl->composite_cache;
}

::std::pair<exec_place, bool> schedule_task(const task& t) const
{
assert(pimpl);
assert(pimpl->auto_scheduler);
return pimpl->auto_scheduler->schedule_task(t);
}

void reorder_tasks(::std::vector<int>& tasks, ::std::unordered_map<int, reserved::reorderer_payload>& task_map)
{
assert(pimpl);
assert(pimpl->auto_reorderer);
pimpl->auto_reorderer->reorder_tasks(tasks, task_map);
}

void increment_task_count()
{
++pimpl->total_task_cnt;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@

#include <cuda/experimental/__stf/internal/backend_ctx.cuh>
#include <cuda/experimental/__stf/internal/task_dep.cuh>
#include <cuda/experimental/__stf/internal/task_statistics.cuh>

namespace cuda::experimental::stf
{
Expand Down Expand Up @@ -353,9 +352,7 @@ public:
t.set_symbol(symbol);
}

// Do we need to measure the duration of the kernel(s) ?
auto& statistics = reserved::task_statistics::instance();
record_time = t.schedule_task() || statistics.is_calibrating_to_file();
record_time = t.should_record_time();
record_time_device = -1;

t.start();
Expand Down Expand Up @@ -401,12 +398,6 @@ public:
{
dot.template add_vertex_timing<typename Ctx::task_type>(t, milliseconds, record_time_device);
}

auto& statistics = reserved::task_statistics::instance();
if (statistics.is_calibrating())
{
statistics.log_task_time(t, milliseconds);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
#include <cuda/experimental/__stf/internal/backend_ctx.cuh>
#include <cuda/experimental/__stf/internal/ctx_resource.cuh>
#include <cuda/experimental/__stf/internal/task_dep.cuh>
#include <cuda/experimental/__stf/internal/task_statistics.cuh>
#include <cuda/experimental/__stf/internal/thread_hierarchy.cuh>
#include <cuda/experimental/__stf/internal/void_interface.cuh>

Expand Down Expand Up @@ -114,8 +113,7 @@ public:
template <typename Fun>
void operator->*(Fun&& f)
{
auto& dot = *ctx.get_dot();
auto& statistics = reserved::task_statistics::instance();
auto& dot = *ctx.get_dot();

auto t = ctx.task(exec_place::host());
t.add_deps(deps);
Expand All @@ -125,7 +123,7 @@ public:
}

cudaEvent_t start_event, end_event;
const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file();
const bool record_time = t.should_record_time();

t.start();

Expand Down Expand Up @@ -156,11 +154,6 @@ public:
{
dot.template add_vertex_timing<typename Ctx::task_type>(t, milliseconds, -1);
}

if (statistics.is_calibrating())
{
statistics.log_task_time(t, milliseconds);
}
}
}
t.clear();
Expand Down
25 changes: 3 additions & 22 deletions cudax/include/cuda/experimental/__stf/internal/launch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
#include <cuda/experimental/__stf/internal/execution_policy.cuh> // launch_impl() uses execution_policy
#include <cuda/experimental/__stf/internal/interpreted_execution_policy_impl.cuh>
#include <cuda/experimental/__stf/internal/task_dep.cuh>
#include <cuda/experimental/__stf/internal/task_statistics.cuh>
#include <cuda/experimental/__stf/internal/thread_hierarchy.cuh>
#include <cuda/experimental/__stf/utility/scope_guard.cuh> // graph_launch_impl() uses SCOPE

Expand Down Expand Up @@ -322,21 +321,14 @@ public:

EXPECT(e_place != exec_place::host(), "Attempt to run a launch on the host.");

auto& dot = *ctx.get_dot();
auto& statistics = reserved::task_statistics::instance();
auto& dot = *ctx.get_dot();

auto t = ctx.task(e_place);

_CCCL_ASSERT(e_place.affine_data_place() == t.get_affine_data_place(), "Affine data places must match");

/*
* If we have a grid (including 1-element grids), the implicit affine partitioner is the blocked_partition.
*
* An explicit composite data place is required per data dependency to customize this behaviour.
*/
if (e_place.size() > 1)
{
// Create a composite data place defined by the grid of places + the partitioning function
t.set_affine_data_place(data_place::composite(blocked_partition(), e_place.as_grid()));
}

Expand All @@ -346,14 +338,8 @@ public:
t.set_symbol(symbol);
}

bool record_time = t.schedule_task();
// Execution place may have changed during scheduling task
e_place = t.get_exec_place();

if (statistics.is_calibrating_to_file())
{
record_time = true;
}
bool record_time = t.should_record_time();
e_place = t.get_exec_place();

nvtx_range nr(t.get_symbol().c_str());
t.start();
Expand Down Expand Up @@ -415,11 +401,6 @@ public:
{
dot.template add_vertex_timing<stream_task<>>(t, milliseconds, device);
}

if (statistics.is_calibrating())
{
statistics.log_task_time(t, milliseconds);
}
}
}

Expand Down
Loading