#include "xsched/xsched.h"
#include "xsched/cuda/hal.h" // use the hal header of the target platform
// create two cuda streams and two XQueues
cudaStream_t stream_1, stream_2;
cudaStreamCreate(&stream_1);
cudaStreamCreate(&stream_2);
HwQueueHandle hwq_1, hwq_2;
CudaQueueCreate(&hwq_1, stream_1);
CudaQueueCreate(&hwq_2, stream_2);
XQueueHandle xq_1, xq_2;
XQueueCreate(&xq_1, hwq_1, kPreemptLevelBlock, kQueueCreateFlagNone);
XQueueCreate(&xq_2, hwq_2, kPreemptLevelBlock, kQueueCreateFlagNone);
// use hints to set the priority of each XQueue
XHintPriority(xq_1, 1); // lower priority
XHintPriority(xq_2, 2); // higher priority
// Set the scheduler to local scheduler so that the process will use
// the in-process scheduler, rather than the xserver.
// Set the policy of the local scheduler to highest priority first.
XHintSetScheduler(kSchedulerLocal, kPolicyHighestPriorityFirst);
// Use the stream to launch commands (kernels & memory copies) as usual.
// The task on stream_2 can preempt the task on stream_1 as it has higher priority.
// e.g., on thread 1:
kernel<<<grid, block, 0, stream_1>>>(...);
// e.g., on thread 2:
kernel<<<grid, block, 0, stream_2>>>(...);The usage on HIP platform is similar.
# go to the root directory of XSched and build XSched with CUDA support
cd xsched
# by default, XSched will be installed to xsched/output
# on CUDA platform
make cuda INSTALL_PATH=<install_path>
# on HIP platform
make hip INSTALL_PATH=<install_path>
# build the app
cd examples/3_intra_process_sched
# on CUDA platform
make cuda
# on HIP platform
make hipPlease refer to 2_give_hints for guidance to link XSched with your own app.
# Intercept the CUDA calls using the shim library
export LD_LIBRARY_PATH=<install_path>/lib:$LD_LIBRARY_PATH
# Intercept the HIP calls using the shim library
export LD_PRELOAD=<install_path>/lib/libshimhip.so:$LD_PRELOAD
./app_concurrentYou will see the output like this:
Prio 1 Task 0 completed in 152 ms
Prio 2 Task 10 completed in 67 ms
Prio 2 Task 11 completed in 71 ms
Prio 1 Task 1 completed in 177 ms
Prio 2 Task 12 completed in 67 ms
Prio 2 Task 13 completed in 69 ms
Prio 1 Task 2 completed in 182 ms
Prio 2 Task 14 completed in 67 ms
Prio 2 Task 15 completed in 69 ms
The tasks are identical, but the higher-priority tasks have significant lower latencies compared to the lower-priority ones.