36 changes: 0 additions & 36 deletions csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props.xml
@@ -28,14 +28,7 @@
</Link>
</ItemDefinitionGroup>

<ItemDefinitionGroup Condition="'$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true')">
<Link>
<AdditionalDependencies>$(MSBuildThisFileDirectory)../../runtimes/win-x86/native/onnxruntime.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>

<PropertyGroup>
<EnginePlatform Condition="'$(Platform)' == 'Win32'">x86</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' == 'ARM64'">arm64</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' == 'ARM'">arm</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' != 'Win32' AND '$(Platform)' != 'ARM64'">$(Platform)</EnginePlatform>
@@ -147,34 +140,5 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>

<!-- x86 -->
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\onnxruntime.dll"
Condition="('$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true'))">
<Link>onnxruntime.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\dnnl.dll"
Condition="('$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true')) AND
Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\dnnl.dll')">
<Link>dnnl.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\mklml.dll"
Condition="('$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true')) AND
Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\mklml.dll')">
<Link>mklml.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\libiomp5md.dll"
Condition="('$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true')) AND
Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\libiomp5md.dll')">
<Link>libiomp5md.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
</ItemGroup>
</Project>
@@ -28,14 +28,7 @@
</Link>
</ItemDefinitionGroup>

<ItemDefinitionGroup Condition="'$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true')">
<Link>
<AdditionalDependencies>$(MSBuildThisFileDirectory)../../runtimes/win-x86/native/onnxruntime.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>

<PropertyGroup>
<EnginePlatform Condition="'$(Platform)' == 'Win32'">x86</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' == 'ARM64'">arm64</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' == 'ARM'">arm</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' != 'Win32' AND '$(Platform)' != 'ARM64'">$(Platform)</EnginePlatform>
@@ -91,13 +84,5 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>

<!-- x86 -->
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\onnxruntime.dll"
Condition="('$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true'))">
<Link>onnxruntime.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
</ItemGroup>
</Project>
4 changes: 3 additions & 1 deletion onnxruntime/contrib_ops/webgpu/moe/moe.h
@@ -3,6 +3,8 @@

#pragma once

#include <limits>

#include "core/providers/webgpu/program.h"
#include "core/providers/webgpu/webgpu_kernel.h"

@@ -31,7 +33,7 @@ class MoE : public WebGpuKernel {
activation_alpha_ = static_cast<float>(info.GetAttrOrDefault<float>("activation_alpha", 1.0));
activation_beta_ = static_cast<float>(info.GetAttrOrDefault<float>("activation_beta", 1.0));
swiglu_fusion_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("swiglu_fusion", 0));
swiglu_limit_ = info.GetAttrOrDefault<float>("swiglu_limit", 0);
swiglu_limit_ = info.GetAttrOrDefault<float>("swiglu_limit", std::numeric_limits<float>::infinity());
k_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("k", 4));
normalize_routing_weights_ = info.GetAttrOrDefault<int64_t>("normalize_routing_weights", 0) == 1;
use_sparse_mixer_ = info.GetAttrOrDefault<int64_t>("use_sparse_mixer", 0) == 1;
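The behavioral change above: `swiglu_limit` previously defaulted to `0`, and the limit is typically applied as a clamp on the SwiGLU inputs, so an absent attribute would flatten every activation; defaulting to `+inf` makes the clamp a no-op instead. A minimal sketch of that clamp semantics, assuming the gpt-oss-style convention (the WebGPU kernel body is not shown in this hunk, so `apply_swiglu_limit` is illustrative):

```python
import numpy as np

def apply_swiglu_limit(gate, linear, limit=np.inf):
    # Illustrative clamp only: with limit=np.inf (the new default) both
    # operations are no-ops; with the old default of 0 every gate value
    # would be forced down to <= 0 before the activation.
    return np.minimum(gate, limit), np.clip(linear, -limit, limit)

g, l = apply_swiglu_limit(np.array([1.5, -0.3]), np.array([2.0, -2.0]))
# limit omitted -> inputs pass through unchanged
```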
35 changes: 33 additions & 2 deletions onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
@@ -395,6 +395,12 @@ static std::shared_ptr<const void*[]> LhsPtrFill(const size_t ci, const size_t i
auto lhs_ptrs = std::shared_ptr<const void*[]>(new const void*[lhs_ptrs_k * lhs_ptrs_m],
std::default_delete<const void*[]>());

// Initialize all padding entries. For partial tiles (m < m_step),
// the kai LHS packing kernel may still read pointer entries beyond the logically
// filled 'm' positions. Leaving these uninitialized can cause non-deterministic
// reads and corrupt packed LHS data.
auto lhs_ptrs_ = lhs_ptrs.get();
std::fill(lhs_ptrs_, lhs_ptrs_ + (lhs_ptrs_k * lhs_ptrs_m), reinterpret_cast<const void*>(&pad_ptr[0]));

auto ih_out_size = ComputeConvOutSize(ih, kh, padding, 1);
auto iw_out_size = ComputeConvOutSize(iw, kw, padding, 1);
@@ -430,7 +436,6 @@
};

size_t m_{0};
auto lhs_ptrs_ = lhs_ptrs.get();
for (size_t ih_ = 0; ih_ < ih_out_size; ih_ += sh) {
for (size_t iw_ = 0; iw_ < iw_out_size; iw_ += sw, ++m_) {
size_t k_{0};
@@ -460,7 +465,23 @@ static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const s
// figure out how many blocks needed to correctly fill padding
padsize = ((ci + padsize - 1) / padsize) * padsize;
}
static std::vector<float>pad_ptr(padsize, 0.f);

// pad_ptr must be at least 'ci' floats for padding pixels.
// Using a thread_local grow-only buffer to avoid cross-thread interference and ensure sizing is correct.
thread_local std::vector<float> pad_ptr;
const float* old_pad_ptr = pad_ptr.data();
bool has_pad_ptr_changed = false;

if (pad_ptr.size() < padsize) {
pad_ptr.resize(padsize, 0.f);
if (pad_ptr.data() != old_pad_ptr) {
has_pad_ptr_changed = true;
}
} else {
// Ensure any previously-used region remains zeroed (grow-only means it should already be zeros,
// but keep this explicit for safety).
std::fill(pad_ptr.begin(), pad_ptr.end(), 0.f);
}

LhsCacheKey key = {
ci, ih, iw,
@@ -481,6 +502,16 @@ static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const s
// Cache of computed lhs ptr offsets. thread_local to prevent interference from parallel sessions.
thread_local std::unordered_map<LhsCacheKey, std::shared_ptr<const void*[]>> lhs_ptrs_cache;

if (has_pad_ptr_changed)
{
// If the pad buffer was resized and a re-allocation has occurred, the cached lhs ptrs are invalid as they
// would be referencing the old pad buffer.
// See discussion in https://github.com/microsoft/onnxruntime/pull/27214.
// TODO(hasesh / JonathanC-ARM): A better approach would be to include the pad buffer address in the cache key
// or any other approach that would reduce unnecessary cache invalidations.
lhs_ptrs_cache.clear();
}

std::shared_ptr<const void*[]> lhs_ptrs;
if (auto found = lhs_ptrs_cache.find(key); found != lhs_ptrs_cache.end()) {
lhs_ptrs = found->second;
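The invariant enforced above: entries in the cached pointer tables point into `pad_ptr`'s storage, so a reallocation of that storage makes every cached table dangle, and the cache must be dropped wholesale. A rough Python model of the same invariant (illustrative names, not the MLAS API; views stand in for raw pointers):

```python
lhs_ptrs_cache: dict = {}   # models the thread_local lhs_ptrs_cache
pad_buffer = bytearray(16)  # models the thread_local pad_ptr vector

def build_ptr_table(key, buf):
    # Stand-in for LhsPtrFill: every table entry references the pad buffer.
    return [memoryview(buf)] * 4

def get_lhs_ptrs(key, required_pad_size):
    global pad_buffer
    if required_pad_size > len(pad_buffer):
        # Grow-only "reallocation". In C++ the old storage is freed, so any
        # cached table still pointing at it dangles; mirror the fix by
        # clearing the whole cache. (Python keeps the old bytearray alive,
        # but cached views would still reference the wrong buffer.)
        pad_buffer = bytearray(required_pad_size)
        lhs_ptrs_cache.clear()
    if key not in lhs_ptrs_cache:
        lhs_ptrs_cache[key] = build_ptr_table(key, pad_buffer)
    return lhs_ptrs_cache[key]

table = get_lhs_ptrs(("conv", 3, 3), required_pad_size=64)
```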
36 changes: 18 additions & 18 deletions onnxruntime/core/mlas/lib/rotary_embedding_kernel_neon_fp16.cpp
@@ -150,8 +150,8 @@ RopeKernel_Fp16_Impl<true>(
if (i + 15 < dim) {
float16x8_t x0 = MlasLoadFloat16x8(input + i);
float16x8_t x1 = MlasLoadFloat16x8(input + i + 8);
float16x8_t sin_val = MlasLoadFloat16x8(sin + i);
float16x8_t cos_val = MlasLoadFloat16x8(cos + i);
float16x8_t sin_val = MlasLoadFloat16x8(sin + i / 2);
float16x8_t cos_val = MlasLoadFloat16x8(cos + i / 2);
for (; i + 31 < dim; i += 16) {
float16x8_t real = vuzp1q_f16(x0, x1);
float16x8_t imag = vuzp2q_f16(x0, x1);
@@ -163,8 +163,8 @@ RopeKernel_Fp16_Impl<true>(
MlasStoreFloat16x8(output + i + 8, y1);
x0 = MlasLoadFloat16x8(input + i + 16);
x1 = MlasLoadFloat16x8(input + i + 24);
sin_val = MlasLoadFloat16x8(sin + i + 16);
cos_val = MlasLoadFloat16x8(cos + i + 16);
sin_val = MlasLoadFloat16x8(sin + (i + 16) / 2);
cos_val = MlasLoadFloat16x8(cos + (i + 16) / 2);
}
float16x8_t real = vuzp1q_f16(x0, x1);
float16x8_t imag = vuzp2q_f16(x0, x1);
@@ -181,8 +181,8 @@ RopeKernel_Fp16_Impl<true>(
float16x4_t x1 = MlasLoadFloat16x4(input + i + 4);
float16x4_t real = vuzp1_f16(x0, x1);
float16x4_t imag = vuzp2_f16(x0, x1);
float16x4_t sin_val = MlasLoadFloat16x4(sin + i);
float16x4_t cos_val = MlasLoadFloat16x4(cos + i);
float16x4_t sin_val = MlasLoadFloat16x4(sin + i / 2);
float16x4_t cos_val = MlasLoadFloat16x4(cos + i / 2);
float16x4_t real_out = vfms_f16(vmul_f16(real, cos_val), imag, sin_val);
float16x4_t imag_out = vfma_f16(vmul_f16(real, sin_val), imag, cos_val);
float16x4_t y0 = vzip1_f16(real_out, imag_out);
@@ -201,12 +201,12 @@ RopeKernel_Fp16_Impl<true>(
imag = MlasLoadLaneFloat16x4<1>(input + i + 3, imag);
real = MlasLoadLaneFloat16x4<2>(input + i + 4, real);
imag = MlasLoadLaneFloat16x4<2>(input + i + 5, imag);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i, sin_val);
sin_val = MlasLoadLaneFloat16x4<1>(sin + i + 1, sin_val);
sin_val = MlasLoadLaneFloat16x4<2>(sin + i + 2, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i, cos_val);
cos_val = MlasLoadLaneFloat16x4<1>(cos + i + 1, cos_val);
cos_val = MlasLoadLaneFloat16x4<2>(cos + i + 2, cos_val);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i / 2, sin_val);
sin_val = MlasLoadLaneFloat16x4<1>(sin + i / 2 + 1, sin_val);
sin_val = MlasLoadLaneFloat16x4<2>(sin + i / 2 + 2, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i / 2, cos_val);
cos_val = MlasLoadLaneFloat16x4<1>(cos + i / 2 + 1, cos_val);
cos_val = MlasLoadLaneFloat16x4<2>(cos + i / 2 + 2, cos_val);
float16x4_t real_out = vfms_f16(vmul_f16(real, cos_val), imag, sin_val);
float16x4_t imag_out = vfma_f16(vmul_f16(real, sin_val), imag, cos_val);
MlasStoreLaneFloat16x4<0>(output + i, real_out);
@@ -224,10 +224,10 @@ RopeKernel_Fp16_Impl<true>(
imag = MlasLoadLaneFloat16x4<0>(input + i + 1, imag);
real = MlasLoadLaneFloat16x4<1>(input + i + 2, real);
imag = MlasLoadLaneFloat16x4<1>(input + i + 3, imag);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i, sin_val);
sin_val = MlasLoadLaneFloat16x4<1>(sin + i + 1, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i, cos_val);
cos_val = MlasLoadLaneFloat16x4<1>(cos + i + 1, cos_val);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i / 2, sin_val);
sin_val = MlasLoadLaneFloat16x4<1>(sin + i / 2 + 1, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i / 2, cos_val);
cos_val = MlasLoadLaneFloat16x4<1>(cos + i / 2 + 1, cos_val);
float16x4_t real_out = vfms_f16(vmul_f16(real, cos_val), imag, sin_val);
float16x4_t imag_out = vfma_f16(vmul_f16(real, sin_val), imag, cos_val);
MlasStoreLaneFloat16x4<0>(output + i, real_out);
@@ -241,8 +241,8 @@ RopeKernel_Fp16_Impl<true>(
float16x4_t cos_val = MlasZeroFloat16x4();
real = MlasLoadLaneFloat16x4<0>(input + i, real);
imag = MlasLoadLaneFloat16x4<0>(input + i + 1, imag);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i, cos_val);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i / 2, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i / 2, cos_val);
float16x4_t real_out = vfms_f16(vmul_f16(real, cos_val), imag, sin_val);
float16x4_t imag_out = vfma_f16(vmul_f16(real, sin_val), imag, cos_val);
MlasStoreLaneFloat16x4<0>(output + i, real_out);
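The pattern in every hunk above is the same: `sin + i` / `cos + i` becomes `sin + i / 2` / `cos + i / 2`. In the interleaved layout the input holds `dim` values as (real, imag) pairs while the sin/cos caches hold only `dim / 2` angles, one per pair, so indexing them with `i` read past the valid data. A scalar reference of the rotation, as a sketch (`sin_half` / `cos_half` stand in for the half-length caches):

```python
import numpy as np

def rope_interleaved_ref(x, sin_half, cos_half):
    # x: length-dim vector of interleaved (real, imag) pairs;
    # sin_half/cos_half: length dim/2, one angle per pair, which is
    # exactly why the SIMD kernel indexes them with i/2.
    out = np.empty_like(x)
    for j in range(x.shape[-1] // 2):
        re, im = x[2 * j], x[2 * j + 1]
        out[2 * j] = re * cos_half[j] - im * sin_half[j]      # vfms path
        out[2 * j + 1] = re * sin_half[j] + im * cos_half[j]  # vfma path
    return out

x = np.arange(8, dtype=np.float32)
theta = np.arange(4, dtype=np.float32)
y = rope_interleaved_ref(x, np.sin(theta), np.cos(theta))
```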
31 changes: 24 additions & 7 deletions onnxruntime/python/onnxruntime_inference_collection.py
@@ -521,8 +521,25 @@ def __init__(
def _create_inference_session(self, providers, provider_options, disabled_optimizers=None):
available_providers = C.get_available_providers()

# Tensorrt can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
if "TensorrtExecutionProvider" in available_providers:
# Validate that TensorrtExecutionProvider and NvTensorRTRTXExecutionProvider are not both specified
if providers:
has_tensorrt = any(
provider == "TensorrtExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
for provider in providers
)
has_tensorrt_rtx = any(
provider == "NvTensorRTRTXExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "NvTensorRTRTXExecutionProvider")
for provider in providers
)
if has_tensorrt and has_tensorrt_rtx:
raise ValueError(
"Cannot enable both 'TensorrtExecutionProvider' and 'NvTensorRTRTXExecutionProvider' "
"in the same session."
)
# Tensorrt and TensorRT RTX can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
if "NvTensorRTRTXExecutionProvider" in available_providers:
if (
providers
and any(
@@ -531,15 +548,15 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi
for provider in providers
)
and any(
provider == "TensorrtExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
provider == "NvTensorRTRTXExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "NvTensorRTRTXExecutionProvider")
for provider in providers
)
):
self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
self._fallback_providers = ["CPUExecutionProvider"]
if "NvTensorRTRTXExecutionProvider" in available_providers:
elif "TensorrtExecutionProvider" in available_providers:
if (
providers
and any(
@@ -548,8 +565,8 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi
for provider in providers
)
and any(
provider == "NvTensorRTRTXExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "NvExecutionProvider")
provider == "TensorrtExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
for provider in providers
)
):
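A usage sketch of the new validation above: requesting both TensorRT variants now raises immediately instead of producing an ambiguously configured session (`model.onnx` is a placeholder path; provider availability depends on the local build):

```python
import onnxruntime as ort

try:
    sess = ort.InferenceSession(
        "model.onnx",  # placeholder path
        providers=["TensorrtExecutionProvider", "NvTensorRTRTXExecutionProvider"],
    )
except ValueError as err:
    print(err)  # "Cannot enable both 'TensorrtExecutionProvider' and ..."
```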
14 changes: 8 additions & 6 deletions onnxruntime/python/tools/transformers/machine_info.py
@@ -6,6 +6,7 @@
# It is used to dump machine information for Notebooks

import argparse
import importlib.metadata
import json
import logging
import platform
@@ -122,10 +123,7 @@ def get_gpu_info_by_nvml(self) -> dict:
return result

def get_related_packages(self) -> list[str]:
import pkg_resources # noqa: PLC0415

installed_packages = pkg_resources.working_set
related_packages = [
related_packages = {
"onnxruntime-gpu",
"onnxruntime",
"onnx",
@@ -137,8 +135,12 @@ def get_related_packages(self) -> list[str]:
"flatbuffers",
"numpy",
"onnxconverter-common",
]
related_packages_list = {i.key: i.version for i in installed_packages if i.key in related_packages}
}
related_packages_list = {}
for dist in importlib.metadata.distributions():
if dist.metadata["Name"].lower() in related_packages:
related_packages_list[dist.metadata["Name"].lower()] = dist.version

return related_packages_list

def get_onnxruntime_info(self) -> dict:
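The rewrite replaces the deprecated `pkg_resources` API with the standard-library `importlib.metadata`, and the helper now builds a name-to-version mapping. The same lookup pattern in isolation, as a minimal sketch:

```python
import importlib.metadata

wanted = {"numpy", "onnx", "onnxruntime"}
versions = {}
for dist in importlib.metadata.distributions():
    name = dist.metadata["Name"]
    if name and name.lower() in wanted:
        versions[name.lower()] = dist.version
print(versions)  # e.g. {'numpy': '2.1.0'}, depending on the environment
```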