SWDEV-521647 - Fix tracking of hw_event (#206)

saleelk · web-flow · commit 072fb0804e2a · 2025-04-25T08:46:44.000-07:00
- When a command may have two packets (like the device heap
  initializer) and there is no signal on the main kernel packet, the
  tracking was broken because it recorded the first packet's signal as
  the HW event of the command.
- Make sure that if no completion signal is attached to the second
  packet, the HW event for the command is cleared.
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
@@ -2863,7 +2863,11 @@ bool Device::IsHwEventReady(const amd::Event& event, bool wait, uint32_t hip_eve
     bool active_wait = !(hip_event_flags & kHipEventBlockingSync) && ActiveWait();
     return WaitForSignal(reinterpret_cast<ProfilingSignal*>(hw_event)->signal_, active_wait);
   }
-  return (hsa_signal_load_relaxed(reinterpret_cast<ProfilingSignal*>(hw_event)->signal_) == 0);
+
+  auto signal = reinterpret_cast<ProfilingSignal*>(hw_event)->signal_;
+  ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Check HW event = 0x%lx", signal.handle);
+
+  return (hsa_signal_load_relaxed(signal) == 0);
 }
 
 // ================================================================================================
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
@@ -422,7 +422,20 @@ bool VirtualGPU::HwQueueTracker::Create() {
 
 // ================================================================================================
 hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
-    hsa_signal_value_t init_val, Timestamp* ts) {
+    hsa_signal_value_t init_val, Timestamp* ts, bool attach_signal) {
+
+  amd::Command* cmd = gpu_.command();
+  // If no signal is needed, decrement the refcount and clear the hw_event of current command
+  if (!attach_signal) {
+    if (nullptr != cmd) {
+      if (cmd->HwEvent() != nullptr) {
+        reinterpret_cast<ProfilingSignal*>(cmd->HwEvent())->release();
+      }
+      cmd->SetHwEvent(nullptr);
+    }
+    return hsa_signal_t {0};
+  }
+
   bool new_signal = false;
 
   // Peep signal +2 ahead to see if its done
@@ -503,8 +516,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
   prof_signal->engine_ = engine_;
   prof_signal->flags_.isPacketDispatch_ = false;
 
-  // Store the HW event
-  amd::Command* cmd = gpu_.command();
+
   if (nullptr != cmd) {
     // Release any existing HwEvent before setting new one for the same command
     if (cmd->HwEvent() != nullptr) {
@@ -1026,24 +1038,25 @@ bool VirtualGPU::dispatchGenericAqlPacket(
 
   fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
 
-  if (timestamp_ != nullptr || attach_signal) {
-    // Get active signal for current dispatch if profiling is necessary
-    packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
-
-    if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value) {
-      // If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
-      // retrieve this correlation ID to attribute waves to specific dispatch locations.
-      if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
-        auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
-        dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
-      }
-
-      ProfilingSignal* current_signal = Barriers().GetLastSignal();
-      current_signal->flags_.isPacketDispatch_ = true;
+  bool attachSignal = timestamp_ != nullptr || attach_signal;
+  // Get active signal for current dispatch if profiling is necessary
+  packet->completion_signal = Barriers().ActiveSignal(kInitSignalValueOne,
+                                                      timestamp_, attachSignal);
 
+  if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value
+      && timestamp_ != nullptr) {
+    // If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
+    // retrieve this correlation ID to attribute waves to specific dispatch locations.
+    if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH) ) {
+      auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
+      dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
     }
+
+    ProfilingSignal* current_signal = Barriers().GetLastSignal();
+    current_signal->flags_.isPacketDispatch_ = true;
   }
 
+
   // Make sure the slot is free for usage
   while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) {
     amd::Os::yield();
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
@@ -254,7 +254,7 @@ class VirtualGPU : public device::VirtualDevice {
 
     //! Finds a free signal for the upcomming operation
     hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
-                              Timestamp* ts = nullptr);
+                              Timestamp* ts = nullptr, bool attach_signal = true);
 
     //! Wait for the curent active signal. Can idle the queue
     bool WaitCurrent() {
diff --git a/rocclr/platform/command.cpp b/rocclr/platform/command.cpp
@@ -277,7 +277,6 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
         notified_.clear();
         return false;
       }
-      ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
       command->enqueue();
       // Save notification, associated with the current event
       notify_event_ = command;
@@ -290,7 +289,6 @@ bool Event::notifyCmdQueue(bool cpu_wait) {
         notified_.clear();
         return false;
       }
-      ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);
       command->enqueue();
       command->release();
     }
@@ -356,8 +354,8 @@ void Command::enqueue() {
     Agent::postEventCreate(as_cl(static_cast<Event*>(this)), type_);
   }
 
-  ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) enqueued: %p",
-          amd::activity_prof::getOclCommandKindString(this->type()), this);
+  ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) enqueued: %p to queue: %p",
+          amd::activity_prof::getOclCommandKindString(this->type()), this, queue_);
 
   // Direct dispatch logic below will submit the command immediately, but the command status
   // update will occur later after flush() with a wait
diff --git a/rocclr/platform/command.hpp b/rocclr/platform/command.hpp
@@ -213,7 +213,7 @@ class Event : public RuntimeObject {
   //! Returns the callback for this event
   const CallBackEntry* Callback() const { return callbacks_; }
 
-  // Saves HW event, associated with the current command
+  //! Saves HW event, associated with the current command
   void SetHwEvent(void* hw_event) { hw_event_ = hw_event; }
 
   //! Returns HW event, associated with the current command

Original file line number	Diff line number	Diff line change
`@@ -2863,7 +2863,11 @@ bool Device::IsHwEventReady(const amd::Event& event, bool wait, uint32_t hip_eve`
`2863`	`2863`	`bool active_wait = !(hip_event_flags & kHipEventBlockingSync) && ActiveWait();`
`2864`	`2864`	`return WaitForSignal(reinterpret_cast<ProfilingSignal*>(hw_event)->signal_, active_wait);`
`2865`	`2865`	`}`
`2866`		`- return (hsa_signal_load_relaxed(reinterpret_cast<ProfilingSignal*>(hw_event)->signal_) == 0);`
	`2866`	`+`
	`2867`	`+ auto signal = reinterpret_cast<ProfilingSignal*>(hw_event)->signal_;`
	`2868`	`+ ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Check HW event = 0x%lx", signal.handle);`
	`2869`	`+`
	`2870`	`+ return (hsa_signal_load_relaxed(signal) == 0);`
`2867`	`2871`	`}`
`2868`	`2872`
`2869`	`2873`	`// ================================================================================================`
Original file line number	Diff line number	Diff line change
`@@ -277,7 +277,6 @@ bool Event::notifyCmdQueue(bool cpu_wait) {`
`277`	`277`	`notified_.clear();`
`278`	`278`	`return false;`
`279`	`279`	`}`
`280`		`- ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);`
`281`	`280`	`command->enqueue();`
`282`	`281`	`// Save notification, associated with the current event`
`283`	`282`	`notify_event_ = command;`
`@@ -290,7 +289,6 @@ bool Event::notifyCmdQueue(bool cpu_wait) {`
`290`	`289`	`notified_.clear();`
`291`	`290`	`return false;`
`292`	`291`	`}`
`293`		`- ClPrint(LOG_DEBUG, LOG_CMD, "Queue marker to command queue: %p", queue);`
`294`	`292`	`command->enqueue();`
`295`	`293`	`command->release();`
`296`	`294`	`}`
`@@ -356,8 +354,8 @@ void Command::enqueue() {`
`356`	`354`	`Agent::postEventCreate(as_cl(static_cast<Event*>(this)), type_);`
`357`	`355`	`}`
`358`	`356`
`359`		`- ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) enqueued: %p",`
`360`		`- amd::activity_prof::getOclCommandKindString(this->type()), this);`
	`357`	`+ ClPrint(LOG_DEBUG, LOG_CMD, "Command (%s) enqueued: %p to queue: %p",`
	`358`	`+ amd::activity_prof::getOclCommandKindString(this->type()), this, queue_);`
`361`	`359`
`362`	`360`	`// Direct dispatch logic below will submit the command immediately, but the command status`
`363`	`361`	`// update will occur later after flush() with a wait`