Skip to content

Commit 18cd0b0

Browse files
winstonzhang-intel and winston.zhang authored
[UR][L0] Fix barrier event cleanup on urEventRelease (intel#21340)
Event pools (created with zeEventPoolCreate) are cached but never destroyed during execution: zeEventPoolDestroy only runs in Context::finalize() at exit. As a result, long-running apps exhaust vm.max_map_count (~65K) in under 2 hours. Root cause: event caching holds pool slots indefinitely, which prevents decrementUnreleasedEventsInPool() from freeing pools. Fix: (1) destroy the underlying ZeEvent when caching an event, so that its pool slot is released; (2) re-allocate a fresh ZeEvent when a cached event is reused; (3) destroy a fully-empty pool immediately when the cache already has an available pool. Tested on PVC (120s Grid): pools destroyed during execution went from 0/1584 to 10099/11674 (86.5% recycled), bounding the steady-state number of live pools at ~1575. Co-authored-by: winston.zhang <winstonz@borealis-admin1.hpe.jf.intel.com>
1 parent 552daa3 commit 18cd0b0

File tree

1 file changed

+35
-3
lines changed

1 file changed

+35
-3
lines changed

unified-runtime/source/adapters/level_zero/context.cpp

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ ur_result_t ur_context_handle_t_::finalize() {
306306
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
307307
for (auto &EventCache : EventCaches) {
308308
for (auto &Event : EventCache) {
309-
if (checkL0LoaderTeardown()) {
309+
if (Event->ZeEvent && checkL0LoaderTeardown()) {
310310
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
311311
// Gracefully handle the case that L0 was already unloaded.
312312
if (ZeResult && (ZeResult != ZE_RESULT_ERROR_UNINITIALIZED &&
@@ -516,6 +516,21 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
516516
ze_event_pool_handle_t *ZePool = &ZePoolCache->front();
517517
Index = 0;
518518
// Create one event ZePool per MaxNumEventsPerPool events
519+
if (*ZePool == nullptr) {
520+
// Before creating a new pool, scan the cache tail for a fully-recycled
521+
// pool that can be reused (all events released, all slots available).
522+
for (auto it = std::next(ZePoolCache->begin()); it != ZePoolCache->end();
523+
++it) {
524+
if (*it != nullptr && NumEventsUnreleasedInEventPool.count(*it) &&
525+
NumEventsUnreleasedInEventPool[*it] == 0 &&
526+
NumEventsAvailableInEventPool.count(*it) &&
527+
NumEventsAvailableInEventPool[*it] == MaxNumEventsPerPool) {
528+
ZePoolCache->front() = *it;
529+
ZePoolCache->erase(it);
530+
break;
531+
}
532+
}
533+
}
519534
if (*ZePool == nullptr) {
520535
ze_event_pool_counter_based_exp_desc_t counterBasedExt = {
521536
ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC, nullptr, 0};
@@ -682,9 +697,26 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) {
682697
die("Invalid event release: event pool doesn't have unreleased events");
683698
if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) {
684699
if (ZePoolCache->front() != Event->ZeEventPool) {
685-
ZePoolCache->push_back(Event->ZeEventPool);
700+
bool hasFrontPool =
701+
!ZePoolCache->empty() && ZePoolCache->front() != nullptr;
702+
if (hasFrontPool && checkL0LoaderTeardown()) {
703+
ZE_CALL_NOCHECK(zeEventPoolDestroy, (Event->ZeEventPool));
704+
NumEventsAvailableInEventPool.erase(Event->ZeEventPool);
705+
NumEventsUnreleasedInEventPool.erase(Event->ZeEventPool);
706+
// Remove the destroyed pool handle from the cache to prevent
707+
// double-free in finalize().
708+
ZePoolCache->remove(Event->ZeEventPool);
709+
Event->ZeEventPool = nullptr;
710+
} else if (!ZePoolCache->empty() && ZePoolCache->front() == nullptr) {
711+
ZePoolCache->front() = Event->ZeEventPool;
712+
NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool;
713+
} else {
714+
ZePoolCache->push_back(Event->ZeEventPool);
715+
NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool;
716+
}
717+
} else {
718+
NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool;
686719
}
687-
NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool;
688720
}
689721

690722
return UR_RESULT_SUCCESS;

0 commit comments

Comments
 (0)