-
Notifications
You must be signed in to change notification settings - Fork 120
Add fast path for teardown check with a shared thread #321
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from 5 commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
20fcafe
Add fast path for teardown check with a shared thread
nrspruit bb6af5e
Add a timeout for when the stability thread becomes lost
nrspruit 68fe276
Allocate the global stability variables with the context and catch ex…
nrspruit d5432a9
Fix teardown of stability resources
nrspruit 485c23b
Fix stabilityCheck to avoid data race at close
nrspruit c038c59
Updated thread state values, increased polling interval and updated t…
nrspruit File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
|
|
||
| /* | ||
| * | ||
| * Copyright (C) 2019-2021 Intel Corporation | ||
|
|
@@ -27,6 +28,11 @@ namespace ze_lib | |
| } | ||
| } | ||
| bool delayContextDestruction = false; | ||
| std::mutex *stabilityMutex = nullptr; | ||
| std::promise<int> *stabilityPromiseResult = nullptr; | ||
| std::future<int> *resultFutureResult = nullptr; | ||
| std::atomic<int> *stabilityCheckThreadStarted = nullptr; | ||
| std::thread *stabilityThread = nullptr; | ||
| #endif | ||
| bool destruction = false; | ||
|
|
||
|
|
@@ -43,6 +49,35 @@ namespace ze_lib | |
| if (loader) { | ||
| FREE_DRIVER_LIBRARY( loader ); | ||
| } | ||
| if (ze_lib::stabilityCheckThreadStarted) | ||
| ze_lib::stabilityCheckThreadStarted->store(-1); | ||
| try { | ||
| if (stabilityThread && stabilityThread->joinable()) { | ||
| stabilityThread->join(); | ||
| } | ||
| } catch (...) { | ||
| // Ignore any exceptions from thread join | ||
| } | ||
| if (stabilityThread) { | ||
| delete stabilityThread; | ||
| stabilityThread = nullptr; | ||
| } | ||
| if (stabilityMutex) { | ||
| delete stabilityMutex; | ||
| stabilityMutex = nullptr; | ||
| } | ||
| if (stabilityPromiseResult) { | ||
| delete stabilityPromiseResult; | ||
| stabilityPromiseResult = nullptr; | ||
| } | ||
| if (resultFutureResult) { | ||
| delete resultFutureResult; | ||
| resultFutureResult = nullptr; | ||
| } | ||
| if (stabilityCheckThreadStarted) { | ||
| delete stabilityCheckThreadStarted; | ||
| stabilityCheckThreadStarted = nullptr; | ||
| } | ||
| #endif | ||
| ze_lib::destruction = true; | ||
| }; | ||
|
|
@@ -149,6 +184,10 @@ namespace ze_lib | |
| std::string version_message = "Loader API Version to be requested is v" + std::to_string(ZE_MAJOR_VERSION(version)) + "." + std::to_string(ZE_MINOR_VERSION(version)); | ||
| debug_trace_message(version_message, ""); | ||
| loaderDriverGet = reinterpret_cast<ze_pfnDriverGet_t>(GET_FUNCTION_PTR(loader, "zeDriverGet")); | ||
| stabilityMutex = new std::mutex(); | ||
| stabilityPromiseResult = new std::promise<int>(); | ||
| resultFutureResult = new std::future<int>(stabilityPromiseResult->get_future()); | ||
| stabilityCheckThreadStarted = new std::atomic<int>(0); | ||
| #else | ||
| result = zeLoaderInit(); | ||
| if( ZE_RESULT_SUCCESS == result ) { | ||
|
|
@@ -410,61 +449,53 @@ zelSetDelayLoaderContextTeardown() | |
| #define ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL 1 | ||
| #define ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED 2 | ||
| #define ZEL_STABILITY_CHECK_RESULT_EXCEPTION 3 | ||
| // The stability check thread timeout in milliseconds | ||
| #define ZEL_STABILITY_CHECK_THREAD_TIMEOUT 100 | ||
|
|
||
| /** | ||
| * @brief Performs a stability check for the Level Zero loader. | ||
| * | ||
| * This function checks the stability of the Level Zero loader by verifying | ||
| * the presence of the loader module, the validity of the `zeDriverGet` function | ||
| * pointer, and the ability to retrieve driver information. The result of the | ||
| * stability check is communicated through the provided promise. | ||
| * This function verifies the stability of the Level Zero loader by checking: | ||
| * - The presence of the loader module. | ||
| * - The validity of the `zeDriverGet` function pointer. | ||
| * - The ability to retrieve driver information. | ||
| * | ||
| * The result of the stability check is returned as an integer, with the following possible values: | ||
| * - `ZEL_STABILITY_CHECK_RESULT_SUCCESS`: The stability check was successful. | ||
| * - `ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL`: The `zeDriverGet` function pointer is invalid. | ||
| * - `ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED`: The loader failed to retrieve driver information. | ||
| * - `ZEL_STABILITY_CHECK_RESULT_EXCEPTION`: An exception occurred during the stability check. | ||
| * | ||
| * @param stabilityPromise A promise object used to communicate the result of | ||
| * the stability check. The promise is set with one of | ||
| * the following values: | ||
| * - ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL: The | ||
| * `zeDriverGet` function pointer is invalid. | ||
| * - ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED: The | ||
| * loader failed to retrieve driver information. | ||
| * - ZEL_STABILITY_CHECK_RESULT_EXCEPTION: An | ||
| * exception occurred during the stability check. | ||
| * - ZEL_STABILITY_CHECK_RESULT_SUCCESS: The stability | ||
| * check was successful. | ||
| * If debug tracing is enabled, debug messages are logged for each failure scenario. | ||
| * | ||
| * @note If debug tracing is enabled, debug messages are logged for each failure | ||
| * scenario. | ||
| * @note If the Loader is completely torn down, this thread is expected to be killed | ||
| * due to invalid memory access and the stability check will determine a failure. | ||
| * @return An integer indicating the result of the stability check. | ||
| * | ||
| * @exception This function catches all exceptions internally and does not throw. | ||
| * @note If the loader is completely torn down, this function may fail due to invalid memory access. | ||
| * @note This function catches all exceptions internally and does not throw. | ||
| */ | ||
| void stabilityCheck(std::promise<int> stabilityPromise) { | ||
| int stabilityCheck() { | ||
| try { | ||
| if (!ze_lib::context->loaderDriverGet) { | ||
| if (ze_lib::context->debugTraceEnabled) { | ||
| std::string message = "LoaderDriverGet is a bad pointer. Exiting stability checker thread."; | ||
| std::string message = "LoaderDriverGet is a bad pointer. Exiting stability checker."; | ||
| ze_lib::context->debug_trace_message(message, ""); | ||
| } | ||
| stabilityPromise.set_value(ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL); | ||
| return; | ||
| return ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL; | ||
| } | ||
|
|
||
| uint32_t driverCount = 0; | ||
| ze_result_t result = ZE_RESULT_ERROR_UNINITIALIZED; | ||
| result = ze_lib::context->loaderDriverGet(&driverCount, nullptr); | ||
| if (result != ZE_RESULT_SUCCESS || driverCount == 0) { | ||
| if (ze_lib::context->debugTraceEnabled) { | ||
| std::string message = "Loader stability check failed. Exiting stability checker thread."; | ||
| std::string message = "Loader stability check failed. Exiting stability checker."; | ||
| ze_lib::context->debug_trace_message(message, ""); | ||
| } | ||
| stabilityPromise.set_value(ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED); | ||
| return; | ||
| return ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED; | ||
| } | ||
| stabilityPromise.set_value(ZEL_STABILITY_CHECK_RESULT_SUCCESS); | ||
| return; | ||
| return ZEL_STABILITY_CHECK_RESULT_SUCCESS; | ||
| } catch (...) { | ||
| stabilityPromise.set_value(ZEL_STABILITY_CHECK_RESULT_EXCEPTION); | ||
| return; | ||
| return ZEL_STABILITY_CHECK_RESULT_EXCEPTION; | ||
| } | ||
| } | ||
| #endif | ||
|
|
@@ -490,18 +521,60 @@ zelCheckIsLoaderInTearDown() { | |
| return true; | ||
| } | ||
| #ifdef DYNAMIC_LOAD_LOADER | ||
| std::promise<int> stabilityPromise; | ||
| std::future<int> resultFuture = stabilityPromise.get_future(); | ||
| int result = -1; | ||
| static bool unstable = false; | ||
| int threadResult = -1; | ||
| if (unstable) { | ||
| return true; | ||
| } | ||
| try { | ||
| // Launch the stability checker thread | ||
| std::thread stabilityThread(stabilityCheck, std::move(stabilityPromise)); | ||
| result = resultFuture.get(); // Blocks until the result is available | ||
| if (ze_lib::context->debugTraceEnabled) { | ||
| std::string message = "Stability checker thread completed with result: " + std::to_string(result); | ||
| ze_lib::context->debug_trace_message(message, ""); | ||
| // Launch the stability checker thread on the first call | ||
| static std::once_flag stabilityThreadFlag; | ||
| std::lock_guard<std::mutex> lock(*ze_lib::stabilityMutex); | ||
| *ze_lib::stabilityPromiseResult = std::promise<int>(); | ||
| *ze_lib::resultFutureResult = ze_lib::stabilityPromiseResult->get_future(); | ||
| ze_lib::stabilityCheckThreadStarted->store(1); | ||
|
||
| std::call_once(stabilityThreadFlag, []() { | ||
| ze_lib::stabilityThread = new std::thread([]() { | ||
| while (true) { | ||
| while(ze_lib::stabilityCheckThreadStarted && ze_lib::stabilityCheckThreadStarted->load() == 0) { | ||
| std::this_thread::sleep_for(std::chrono::milliseconds(1)); | ||
| } | ||
| if (ze_lib::destruction || ze_lib::context == nullptr) { | ||
| break; | ||
| } | ||
| if (!ze_lib::stabilityCheckThreadStarted) { | ||
| break; | ||
| } | ||
| if (ze_lib::stabilityCheckThreadStarted->load() == -1) { | ||
| break; | ||
| } | ||
| ze_lib::stabilityCheckThreadStarted->store(0); | ||
| int result = stabilityCheck(); | ||
| if (result != ZEL_STABILITY_CHECK_RESULT_SUCCESS) { | ||
| if (ze_lib::context->debugTraceEnabled) { | ||
| std::string message = "Loader stability check thread failed with result: " + std::to_string(result); | ||
| ze_lib::context->debug_trace_message(message, ""); | ||
| } | ||
| if (ze_lib::stabilityPromiseResult) { | ||
| ze_lib::stabilityPromiseResult->set_value(result); | ||
| } | ||
| break; // Exit the thread if stability check fails | ||
| } | ||
| if (ze_lib::stabilityPromiseResult) { | ||
| ze_lib::stabilityPromiseResult->set_value(result); | ||
| } | ||
| } | ||
| }); | ||
| }); | ||
| if (ze_lib::resultFutureResult->wait_for(std::chrono::milliseconds(ZEL_STABILITY_CHECK_THREAD_TIMEOUT)) == std::future_status::timeout) { | ||
| if (ze_lib::context->debugTraceEnabled) { | ||
| std::string message = "Stability Thread timeout, assuming thread has crashed"; | ||
| ze_lib::context->debug_trace_message(message, ""); | ||
| } | ||
| threadResult = ZEL_STABILITY_CHECK_RESULT_EXCEPTION; | ||
| } else { | ||
| threadResult = ze_lib::resultFutureResult->get(); | ||
| } | ||
| stabilityThread.join(); | ||
| } catch (const std::exception& e) { | ||
| if (ze_lib::context->debugTraceEnabled) { | ||
| std::string message = "Exception caught in parent thread: " + std::string(e.what()); | ||
|
|
@@ -513,11 +586,12 @@ zelCheckIsLoaderInTearDown() { | |
| ze_lib::context->debug_trace_message(message, ""); | ||
| } | ||
| } | ||
| if (result != ZEL_STABILITY_CHECK_RESULT_SUCCESS) { | ||
| if (threadResult != ZEL_STABILITY_CHECK_RESULT_SUCCESS) { | ||
| if (ze_lib::context->debugTraceEnabled) { | ||
| std::string message = "Loader stability check failed with result: " + std::to_string(result); | ||
| std::string message = "Loader stability check failed with result: " + std::to_string(threadResult); | ||
| ze_lib::context->debug_trace_message(message, ""); | ||
| } | ||
| unstable = true; | ||
| return true; | ||
| } | ||
| #endif | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Whitespace promotes readability?