Skip to content

Commit 64fa011

Browse files
Merge branch 'main' into adrianl/SessionQueryPartitionInfo_Revival
2 parents e63fe9e + de99059 commit 64fa011

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+1804
-377
lines changed

cmake/CMakeLists.txt

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -539,25 +539,6 @@ if(onnxruntime_USE_KLEIDIAI)
539539
set(${is_supported_var} FALSE PARENT_SCOPE)
540540
return()
541541
endif()
542-
543-
# check for compiler support
544-
if(MSVC)
545-
# TODO detect on MSVC
546-
else()
547-
check_cxx_compiler_flag(-march=armv8.2-a+dotprod HAS_ARM64_DOTPROD)
548-
check_cxx_compiler_flag(-march=armv8.2-a+i8mm HAS_ARM64_I8MM)
549-
if(NOT HAS_ARM64_DOTPROD)
550-
message(WARNING "The compiler doesn't support dotprod instructions.")
551-
endif()
552-
if(NOT HAS_ARM64_I8MM)
553-
message(WARNING "The compiler doesn't support i8mm instructions.")
554-
endif()
555-
if(NOT HAS_ARM64_DOTPROD OR NOT HAS_ARM64_I8MM)
556-
set(${is_supported_var} FALSE PARENT_SCOPE)
557-
return()
558-
endif()
559-
endif()
560-
561542
set(${is_supported_var} TRUE PARENT_SCOPE)
562543
endfunction()
563544

cmake/adjust_global_compile_flags.cmake

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,10 +208,8 @@ endif()
208208

209209

210210
macro(check_nvcc_compiler_flag _FLAG _RESULT)
211-
execute_process(COMMAND ${CUDAToolkit_BIN_DIR}/nvcc "${_FLAG}" RESULT_VARIABLE NVCC_OUT ERROR_VARIABLE NVCC_ERROR)
212-
message("NVCC_ERROR = ${NVCC_ERROR}")
213-
message("NVCC_OUT = ${NVCC_OUT}")
214-
if ("${NVCC_OUT}" MATCHES "0")
211+
execute_process(COMMAND ${CMAKE_CUDA_COMPILER} --compiler-options "${_FLAG}" -c ${REPO_ROOT}/cmake/empty.c -o ${CMAKE_CURRENT_BINARY_DIR}/empty.o RESULT_VARIABLE NVCC_OUT ERROR_QUIET OUTPUT_QUIET)
212+
if (NVCC_OUT EQUAL 0)
215213
set(${_RESULT} 1)
216214
else()
217215
set(${_RESULT} 0)

cmake/empty.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
// This file is used by the check_nvcc_compiler_flag macro in adjust_global_compile_flags.cmake to test nvcc compiler flags.
2+
void empty() {}

cmake/onnxruntime_providers_cuda.cmake

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,17 @@
257257
target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples ${cutlass_SOURCE_DIR}/tools/util/include)
258258
target_link_libraries(${target} PRIVATE Eigen3::Eigen)
259259
target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
260+
261+
# Handle CUDA 13.0 CCCL header directory move
262+
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
263+
foreach(inc_dir ${CUDAToolkit_INCLUDE_DIRS})
264+
if (EXISTS "${inc_dir}/cccl")
265+
# Add the cccl subdirectory to the include path so <cuda/std/utility> can be found
266+
target_include_directories(${target} PRIVATE "${inc_dir}/cccl")
267+
endif()
268+
endforeach()
269+
endif()
270+
260271
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
261272
set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA)
262273
set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime")

include/onnxruntime/core/graph/graph.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1726,7 +1726,8 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
17261726
#if !defined(ORT_MINIMAL_BUILD)
17271727
// Build and verify node connection (edges).
17281728
// Verify NodeArg name/type/shape matching correctly.
1729-
common::Status BuildConnections(std::unordered_set<std::string>& outer_scope_node_args_consumed);
1729+
common::Status BuildConnections(std::unordered_set<std::string>& outer_scope_node_args_consumed,
1730+
bool& removed_node_with_subgraph);
17301731

17311732
common::Status VerifyNoDuplicateName();
17321733

include/onnxruntime/core/session/environment.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,17 @@ class Environment {
146146
return execution_devices_;
147147
}
148148

149+
/// Get hardware device incompatibility details for a specific EP.
150+
/// @param ep_name The name of the execution provider to check.
151+
/// @param hw The hardware device to check for incompatibility.
152+
/// @param details Output: Incompatibility details including reasons for incompatibility if any.
153+
/// @returns Status indicating success or failure.
154+
Status GetHardwareDeviceEpIncompatibilityDetails(const std::string& ep_name,
155+
const OrtHardwareDevice* hw,
156+
std::unique_ptr<OrtDeviceEpIncompatibilityDetails>& details) const;
157+
158+
const std::vector<const OrtHardwareDevice*>& GetSortedOrtHardwareDevices() const;
159+
149160
Status CreateSharedAllocator(const OrtEpDevice& ep_device,
150161
OrtDeviceMemoryType mem_type, OrtAllocatorType allocator_type,
151162
const OrtKeyValuePairs* allocator_options, OrtAllocator** allocator);

include/onnxruntime/core/session/onnxruntime_c_api.h

Lines changed: 129 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) Microsoft Corporation. All rights reserved.
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
22
// Licensed under the MIT License.
33

44
// See docs\c_cxx\README.md on generating the Doxygen documentation from this file
@@ -333,6 +333,7 @@ ORT_RUNTIME_CLASS(ExternalInitializerInfo);
333333
ORT_RUNTIME_CLASS(ExternalResourceImporter); // Capability object for external resource import
334334
ORT_RUNTIME_CLASS(ExternalMemoryHandle); // EP-imported view of shared external allocation
335335
ORT_RUNTIME_CLASS(ExternalSemaphoreHandle); // EP-imported view of shared external semaphore
336+
ORT_RUNTIME_CLASS(DeviceEpIncompatibilityDetails);
336337
ORT_RUNTIME_CLASS(EpAssignedSubgraph);
337338
ORT_RUNTIME_CLASS(EpAssignedNode);
338339

@@ -512,6 +513,16 @@ typedef enum OrtExecutionProviderDevicePolicy {
512513
OrtExecutionProviderDevicePolicy_MIN_OVERALL_POWER,
513514
} OrtExecutionProviderDevicePolicy;
514515

516+
/** \brief Reasons why an execution provider might not be compatible with a device
517+
*/
518+
typedef enum OrtDeviceEpIncompatibilityReason {
519+
OrtDeviceEpIncompatibility_NONE = 0,  // no reason bits set
520+
OrtDeviceEpIncompatibility_DRIVER_INCOMPATIBLE = 1 << 0,  // bit 0
521+
OrtDeviceEpIncompatibility_DEVICE_INCOMPATIBLE = 1 << 1,  // bit 1
522+
OrtDeviceEpIncompatibility_MISSING_DEPENDENCY = 1 << 2,  // bit 2
523+
OrtDeviceEpIncompatibility_UNKNOWN = 1 << 31  // bit 31. NOTE(review): shifting 1 into bit 31 overflows a 32-bit signed int in C; confirm this is portable across the supported compilers
524+
} OrtDeviceEpIncompatibilityReason;  // values are single-bit flags; the accessor API exposes them combined as a uint32_t bitmask
525+
515526
/** \brief Delegate to allow providing custom OrtEpDevice selection logic
516527
*
517528
* This delegate is called by the EP selection code to allow the user to provide custom device selection logic.
@@ -6786,6 +6797,123 @@ struct OrtApi {
67866797
ORT_API2_STATUS(SessionGetEpDeviceForOutputs, _In_ const OrtSession* session,
67876798
_Out_writes_(num_outputs) const OrtEpDevice** outputs_ep_devices,
67886799
_In_ size_t num_outputs);
6800+
/** \brief Get the number of available hardware devices.
6801+
*
6802+
* Returns the count of hardware devices discovered on the system.
6803+
* Use this to allocate an array before calling GetHardwareDevices().
6804+
*
6805+
* \param[in] env The OrtEnv instance where device discovery results are stored.
6806+
* \param[out] num_devices The number of OrtHardwareDevice instances available.
6807+
*
6808+
* \snippet{doc} snippets.dox OrtStatus Return Value
6809+
*
6810+
* \since Version 1.24.
6811+
*/
6812+
ORT_API2_STATUS(GetNumHardwareDevices, _In_ const OrtEnv* env, _Out_ size_t* num_devices);
6813+
6814+
/** \brief Get the list of available hardware devices.
6815+
*
6816+
* Enumerates hardware devices available on the system.
6817+
* Populates a user-provided array with pointers to OrtHardwareDevice instances. The caller is responsible
6818+
* for allocating the array with sufficient space (use GetNumHardwareDevices() to get the count).
6819+
*
6820+
* The returned pointers reference internal ORT data structures that are discovered once at process
6821+
* startup and remain valid for the lifetime of the OrtEnv. The caller does not need to release these
6822+
* pointers, but should not use them after calling ReleaseEnv().
6823+
*
6824+
* \param[in] env The OrtEnv instance where device discovery results are stored.
6825+
* \param[out] devices User-allocated array to receive pointers to OrtHardwareDevice instances.
6826+
* The array must have space for at least num_devices elements.
6827+
* \param[in] num_devices The size of the user-allocated devices array.
6828+
*
6829+
* \snippet{doc} snippets.dox OrtStatus Return Value
6830+
*
6831+
* \since Version 1.24.
6832+
*/
6833+
ORT_API2_STATUS(GetHardwareDevices, _In_ const OrtEnv* env,
6834+
_Out_writes_(num_devices) const OrtHardwareDevice** devices,
6835+
_In_ size_t num_devices);
6836+
6837+
/** \brief Check for known incompatibility issues between hardware device and a specific execution provider.
6838+
*
6839+
* This function checks for known incompatibility issues between the specified hardware device
6840+
* and a specific execution provider.
6841+
* If the returned incompatibility details have non-zero reasons, the device is not compatible.
6842+
* However, if the returned details have reasons == 0, this does not guarantee 100% compatibility for all models,
6843+
* as models may have specific requirements.
6844+
*
6845+
* Note: This method should only be called when the OrtEnv has been initialized with execution
6846+
* providers (after RegisterExecutionProviderLibrary is called).
6847+
*
6848+
* \param[in] env The OrtEnv instance with registered execution providers.
6849+
* \param[in] ep_name The name of the execution provider to check. Required and cannot be null or empty.
6850+
* \param[in] hw The hardware device to check for incompatibility.
6851+
* \param[out] details Incompatibility details, including the reasons for incompatibility, if any.
6852+
* Must be freed with OrtApi::ReleaseDeviceEpIncompatibilityDetails.
6853+
*
6854+
* \snippet{doc} snippets.dox OrtStatus Return Value
6855+
*
6856+
* \since Version 1.24.
6857+
*/
6858+
ORT_API2_STATUS(GetHardwareDeviceEpIncompatibilityDetails, _In_ const OrtEnv* env,
6859+
_In_ const char* ep_name,
6860+
_In_ const OrtHardwareDevice* hw,
6861+
_Outptr_ OrtDeviceEpIncompatibilityDetails** details);
6862+
6863+
/// \name OrtDeviceEpIncompatibilityDetails
6864+
/// Accessor functions for device incompatibility details
6865+
/// @{
6866+
6867+
/** \brief Get the incompatibility reasons bitmask from OrtDeviceEpIncompatibilityDetails.
6868+
*
6869+
* \param[in] details The OrtDeviceEpIncompatibilityDetails instance to query.
6870+
* \param[out] reasons_bitmask Pointer to store the bitmask of incompatibility reasons.
6871+
*
6872+
* \snippet{doc} snippets.dox OrtStatus Return Value
6873+
*
6874+
* \since Version 1.24.
6875+
*/
6876+
ORT_API2_STATUS(DeviceEpIncompatibilityDetails_GetReasonsBitmask,
6877+
_In_ const OrtDeviceEpIncompatibilityDetails* details,
6878+
_Out_ uint32_t* reasons_bitmask);
6879+
6880+
/** \brief Get the notes from OrtDeviceEpIncompatibilityDetails.
6881+
*
6882+
* \param[in] details The OrtDeviceEpIncompatibilityDetails instance to query.
6883+
* \param[out] notes Pointer to the notes string. May be nullptr if no notes are available.
6884+
* The returned string is owned by the details object and should not be freed.
6885+
*
6886+
* \snippet{doc} snippets.dox OrtStatus Return Value
6887+
*
6888+
* \since Version 1.24.
6889+
*/
6890+
ORT_API2_STATUS(DeviceEpIncompatibilityDetails_GetNotes,
6891+
_In_ const OrtDeviceEpIncompatibilityDetails* details,
6892+
_Outptr_result_maybenull_ const char** notes);
6893+
6894+
/** \brief Get the execution provider error code from OrtDeviceEpIncompatibilityDetails.
6895+
*
6896+
* This allows Independent Hardware Vendors (IHVs) to define their own error codes
6897+
* to provide additional details about device incompatibility.
6898+
*
6899+
* \param[in] details The OrtDeviceEpIncompatibilityDetails instance to query.
6900+
* \param[out] error_code Pointer to store the EP-specific error code. A value of 0 indicates no error code was set.
6901+
*
6902+
* \snippet{doc} snippets.dox OrtStatus Return Value
6903+
*
6904+
* \since Version 1.24.
6905+
*/
6906+
ORT_API2_STATUS(DeviceEpIncompatibilityDetails_GetErrorCode,
6907+
_In_ const OrtDeviceEpIncompatibilityDetails* details,
6908+
_Out_ int32_t* error_code);
6909+
6910+
/** \brief Release an OrtDeviceEpIncompatibilityDetails instance.
6911+
*
6912+
* \since Version 1.24.
6913+
*/
6914+
ORT_CLASS_RELEASE(DeviceEpIncompatibilityDetails);
6915+
6916+
/// @}
67896917

67906918
/** \brief Get information about the subgraphs assigned to each EP and the nodes within.
67916919
*

include/onnxruntime/core/session/onnxruntime_ep_c_api.h

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,6 +1308,27 @@ struct OrtEpApi {
13081308
*/
13091309
ORT_API2_STATUS(KernelInfo_GetEp, _In_ const OrtKernelInfo* info, _Outptr_ const OrtEp** ep);
13101310

1311+
/** \brief Set the details of an OrtDeviceEpIncompatibilityDetails instance.
1312+
*
1313+
* Used by execution provider factories to set incompatibility details in their
1314+
* GetHardwareDeviceIncompatibilityDetails implementation. ORT creates and initializes the object
1315+
* before passing it to the EP, so calling this function is optional. The EP uses this function
1316+
* to set incompatibility information when the device is not compatible.
1317+
*
1318+
* \param[in,out] details The OrtDeviceEpIncompatibilityDetails instance to update.
1319+
* \param[in] reasons_bitmask Bitmask of OrtDeviceEpIncompatibilityReason values. (0 = no incompatibility).
1320+
* \param[in] error_code Optional EP-specific error code (0 = no error).
1321+
* \param[in] notes Optional human-readable notes. Can be null.
1322+
*
1323+
* \snippet{doc} snippets.dox OrtStatus Return Value
1324+
*
1325+
* \since Version 1.24.
1326+
*/
1327+
ORT_API2_STATUS(DeviceEpIncompatibilityDetails_SetDetails, _Inout_ OrtDeviceEpIncompatibilityDetails* details,
1328+
_In_ uint32_t reasons_bitmask,
1329+
_In_ int32_t error_code,
1330+
_In_opt_z_ const char* notes);
1331+
13111332
/** \brief Creates an OrtKernelImpl instance for an If operator.
13121333
*
13131334
* Control flow operators require access to ORT session internals to orchestrate subgraph operations.
@@ -1990,6 +2011,30 @@ struct OrtEpFactory {
19902011
*/
19912012
ORT_API2_STATUS(SetEnvironmentOptions, _In_ OrtEpFactory* this_ptr, _In_ const OrtKeyValuePairs* options);
19922013

2014+
/** \brief Check for known incompatibility reasons between a hardware device and this execution provider.
2015+
*
2016+
* This function allows an execution provider to check if a specific hardware device is compatible
2017+
* with the execution provider. The EP can set specific incompatibility reasons via the
2018+
* OrtDeviceEpIncompatibilityDetails parameter using OrtEpApi::DeviceEpIncompatibilityDetails_SetDetails.
2019+
*
2020+
* \param[in] this_ptr The OrtEpFactory instance.
2021+
* \param[in] hw The hardware device to check for incompatibility.
2022+
* \param[in,out] details Pre-allocated incompatibility details object created and initialized by ORT.
2023+
* The EP can use OrtEpApi::DeviceEpIncompatibilityDetails_SetDetails to set
2024+
* incompatibility information. If the device is compatible, the EP can
2025+
* leave the object unchanged (it defaults to no incompatibility).
2026+
*
2027+
* \note Implementation of this function is optional.
2028+
* If not implemented, ORT will assume the device is compatible with this EP.
2029+
*
2030+
* \snippet{doc} snippets.dox OrtStatus Return Value
2031+
*
2032+
* \since Version 1.24.
2033+
*/
2034+
ORT_API2_STATUS(GetHardwareDeviceIncompatibilityDetails, _In_ OrtEpFactory* this_ptr,
2035+
_In_ const OrtHardwareDevice* hw,
2036+
_Inout_ OrtDeviceEpIncompatibilityDetails* details);
2037+
19932038
/** \brief Create an OrtExternalResourceImporterImpl for external resource import.
19942039
*
19952040
* This is used to create an external resource importer that enables zero-copy import of

js/node/src/session_options_helper.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,15 +143,15 @@ void ParseExecutionProviders(const Napi::Array epList, Ort::SessionOptions& sess
143143
#ifdef USE_CUDA
144144
} else if (name == "cuda") {
145145
OrtCUDAProviderOptionsV2* options;
146-
Ort::GetApi().CreateCUDAProviderOptions(&options);
146+
Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options));
147147
options->device_id = deviceId;
148148
sessionOptions.AppendExecutionProvider_CUDA_V2(*options);
149149
Ort::GetApi().ReleaseCUDAProviderOptions(options);
150150
#endif
151151
#ifdef USE_TENSORRT
152152
} else if (name == "tensorrt") {
153153
OrtTensorRTProviderOptionsV2* options;
154-
Ort::GetApi().CreateTensorRTProviderOptions(&options);
154+
Ort::ThrowOnError(Ort::GetApi().CreateTensorRTProviderOptions(&options));
155155
options->device_id = deviceId;
156156
sessionOptions.AppendExecutionProvider_TensorRT_V2(*options);
157157
Ort::GetApi().ReleaseTensorRTProviderOptions(options);

onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
163163

164164
Status Compute(OpKernelContext* context) const override;
165165

166-
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
166+
#if defined(USE_KLEIDIAI)
167167
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
168168
/*out*/ bool& is_packed,
169169
/*out*/ PrePackedWeights* prepacked_weights) override {
@@ -307,7 +307,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
307307
private:
308308
// Indicates when MlasDynamicQGemmBatch() can be used
309309
bool can_use_dynamic_quant_mlas_{false};
310-
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
310+
#if defined(USE_KLEIDIAI)
311311
// Indicates that the biases are a constant input and thus already quantized / packed
312312
bool dynamic_quant_mlas_bias_data_was_packed_{false};
313313
#endif
@@ -382,7 +382,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
382382
}
383383
// Guard against KleidiAI functions being called in non kleidi builds
384384
// TODO: migrate to a suitable override function call for kleidi dynamic qgemm function calls
385-
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
385+
#if defined(USE_KLEIDIAI)
386386
else {
387387
MatMulComputeHelper helper;
388388
ORT_RETURN_IF_ERROR(helper.Compute(ctx->Input<Tensor>(IN_A)->Shape(),

0 commit comments

Comments
 (0)