Skip to content

Commit e0b66ca

Browse files
amarin16, jambayk, fs-eire, quic-ashigarg, Ashish Garg
authored
Round 2 of cherry-picks into rel-1.21.0 (#23899)
The second round of cherry-picks into [rel-1.21.0](https://github.com/microsoft/onnxruntime/tree/rel-1.21.0). The first one was done in #23846. - #23779 - #23856 - #23827 - #23834 - #23876 - #23892 --------- Co-authored-by: Jambay Kinley <[email protected]> Co-authored-by: Yulong Wang <[email protected]> Co-authored-by: Ashish Garg <[email protected]> Co-authored-by: Ashish Garg <[email protected]>
1 parent beb1a92 commit e0b66ca

File tree

17 files changed

+225
-39
lines changed

17 files changed

+225
-39
lines changed

ThirdPartyNotices.txt

+35
Original file line numberDiff line numberDiff line change
@@ -6045,3 +6045,38 @@ https://github.com/intel/neural-speed
60456045
terms, and open source software license terms. These separate license terms
60466046
govern your use of the third party programs as set forth in the
60476047
"THIRD-PARTY-PROGRAMS" file.
6048+
6049+
_____
6050+
6051+
dawn
6052+
6053+
https://dawn.googlesource.com/dawn
6054+
6055+
BSD 3-Clause License
6056+
6057+
Copyright 2017-2023 The Dawn & Tint Authors
6058+
6059+
Redistribution and use in source and binary forms, with or without
6060+
modification, are permitted provided that the following conditions are met:
6061+
6062+
1. Redistributions of source code must retain the above copyright notice, this
6063+
list of conditions and the following disclaimer.
6064+
6065+
2. Redistributions in binary form must reproduce the above copyright notice,
6066+
this list of conditions and the following disclaimer in the documentation
6067+
and/or other materials provided with the distribution.
6068+
6069+
3. Neither the name of the copyright holder nor the names of its
6070+
contributors may be used to endorse or promote products derived from
6071+
this software without specific prior written permission.
6072+
6073+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
6074+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
6075+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
6076+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
6077+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
6078+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
6079+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
6080+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
6081+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
6082+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cmake/onnxruntime_framework.cmake

+1-4
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,7 @@ elseif(onnxruntime_ENABLE_TRITON)
3636
endif()
3737

3838
if (onnxruntime_MINIMAL_BUILD)
39-
set(onnxruntime_framework_src_exclude
40-
"${ONNXRUNTIME_ROOT}/core/framework/fallback_cpu_capability.h"
41-
"${ONNXRUNTIME_ROOT}/core/framework/fallback_cpu_capability.cc"
42-
)
39+
set(onnxruntime_framework_src_exclude)
4340

4441
# custom ops support must be explicitly enabled in a minimal build. exclude if not.
4542
if (NOT onnxruntime_MINIMAL_BUILD_CUSTOM_OPS)

cmake/onnxruntime_providers_js.cmake

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
# Copyright (c) Microsoft Corporation. All rights reserved.
22
# Licensed under the MIT License.
33

4+
if (onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD)
5+
message(FATAL_ERROR "JSEP can not be used in a basic minimal build. Please build with '--minimal_build extended'")
6+
endif()
7+
48
add_compile_definitions(USE_JSEP=1)
59

610
file(GLOB_RECURSE onnxruntime_providers_js_cc_srcs
@@ -18,4 +22,4 @@
1822
onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers Boost::mp11 Eigen3::Eigen
1923
)
2024

21-
add_dependencies(onnxruntime_providers_js ${onnxruntime_EXTERNAL_DEPENDENCIES})
25+
add_dependencies(onnxruntime_providers_js ${onnxruntime_EXTERNAL_DEPENDENCIES})

js/common/lib/tensor-impl-type-mapping.ts

+3-6
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,6 @@ export const NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP = new Map<SupportedTypedArray
4444
[Uint32Array, 'uint32'],
4545
]);
4646

47-
// a dummy type declaration for Float16Array in case any polyfill is available.
48-
declare global {
49-
// eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-explicit-any
50-
const Float16Array: any;
51-
}
52-
5347
// the following code allows delaying execution of BigInt/Float16Array checking. This allows lazy initialization for
5448
// NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP and NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, which allows BigInt/Float16Array
5549
// polyfill if available.
@@ -59,6 +53,9 @@ export const checkTypedArray = () => {
5953
isTypedArrayChecked = true;
6054
const isBigInt64ArrayAvailable = typeof BigInt64Array !== 'undefined' && BigInt64Array.from;
6155
const isBigUint64ArrayAvailable = typeof BigUint64Array !== 'undefined' && BigUint64Array.from;
56+
57+
// eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-explicit-any
58+
const Float16Array = (globalThis as any).Float16Array;
6259
const isFloat16ArrayAvailable = typeof Float16Array !== 'undefined' && Float16Array.from;
6360

6461
if (isBigInt64ArrayAvailable) {

js/common/lib/tensor-impl.ts

+7
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,13 @@ export class Tensor implements TensorInterface {
261261
} else {
262262
throw new TypeError(`A Uint8ClampedArray tensor's data must be type of uint8`);
263263
}
264+
} else if (arg0 === 'float16' && arg1 instanceof Uint16Array && typedArrayConstructor !== Uint16Array) {
265+
// when Float16Array is available and data is of type Uint16Array.
266+
// We allow Uint16Array to be passed in as data for 'float16' tensor until Float16Array is generally
267+
// supported in JavaScript environment.
268+
269+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
270+
data = new (globalThis as any).Float16Array(arg1.buffer, arg1.byteOffset, arg1.length);
264271
} else {
265272
throw new TypeError(`A ${type} tensor's data must be type of ${typedArrayConstructor}`);
266273
}

js/common/package.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
"build": "node ./build.js",
1616
"prepare": "npm run build",
1717
"pretest": "tsc --build ./test",
18-
"test": "mocha ./test/**/*.js --timeout 30000"
18+
"test": "mocha \"./test/**/*.js\" --timeout 30000",
19+
"test:f16": "mocha -n js-float16array \"./test/**/*.js\" --timeout 30000"
1920
},
2021
"devDependencies": {
2122
"typedoc": "^0.25.7"

js/common/test/unit-tests/common.ts

+3-2
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,10 @@ export const NUMBER_COMPATIBLE_NUMERICAL_TYPES = [
2929
export const BIGINT_TYPES = [['int64', BigInt64Array, true] as const, ['uint64', BigUint64Array, true] as const];
3030

3131
/**
32-
* float16 type, data represented by Uint16Array
32+
* float16 type, data represented by Uint16Array/Float16Array
3333
*/
34-
export const FLOAT16_TYPE = ['float16', Uint16Array, false] as const;
34+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
35+
export const FLOAT16_TYPE = ['float16', (globalThis as any).Float16Array ?? Uint16Array, false] as const;
3536

3637
/**
3738
* A list of all numerical types.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
import assert from 'assert/strict';
5+
import { Tensor } from 'onnxruntime-common';
6+
7+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
8+
const globalF16 = (globalThis as any).Float16Array;
9+
10+
(globalF16 ? describe : describe.skip)('Tensor Constructor Tests - check type float16 (Float16Array available)', () => {
11+
it("[float16] new Tensor('float16', numbers, dims): allow number array when Float16Array is available", () => {
12+
const tensor = new Tensor('float16', [1, 2, 3, 4], [2, 2]);
13+
assert.equal(tensor.type, 'float16', "tensor.type should be 'float16'");
14+
assert(tensor.data instanceof globalF16, "tensor.data should be an instance of 'Float16Array'");
15+
assert.equal(tensor.data[0], 1, 'tensor.data[0] should be 1');
16+
assert.equal(tensor.data[1], 2, 'tensor.data[1] should be 2');
17+
assert.equal(tensor.data[2], 3, 'tensor.data[2] should be 3');
18+
assert.equal(tensor.data[3], 4, 'tensor.data[3] should be 4');
19+
assert.equal(tensor.data.length, 4, 'tensor.data.length should be 4');
20+
});
21+
22+
it("[float16] new Tensor('float16', float16array, dims): allow Float16Array when Float16Array is available", () => {
23+
const tensor = new Tensor('float16', new globalF16([1, 2, 3, 4]), [2, 2]);
24+
assert.equal(tensor.type, 'float16', "tensor.type should be 'float16'");
25+
assert(tensor.data instanceof globalF16, "tensor.data should be an instance of 'Float16Array'");
26+
assert.equal(tensor.data[0], 1, 'tensor.data[0] should be 1');
27+
assert.equal(tensor.data[1], 2, 'tensor.data[1] should be 2');
28+
assert.equal(tensor.data[2], 3, 'tensor.data[2] should be 3');
29+
assert.equal(tensor.data[3], 4, 'tensor.data[3] should be 4');
30+
assert.equal(tensor.data.length, 4, 'tensor.data.length should be 4');
31+
});
32+
33+
it("[float16] new Tensor('float16', uint16array, dims): allow Uint16Array when Float16Array is available", () => {
34+
const tensor = new Tensor('float16', new Uint16Array([15360, 16384, 16896, 17408]), [2, 2]);
35+
assert.equal(tensor.type, 'float16', "tensor.type should be 'float16'");
36+
assert(tensor.data instanceof globalF16, "tensor.data should be an instance of 'Float16Array'");
37+
assert.equal(tensor.data[0], 1, 'tensor.data[0] should be 1');
38+
assert.equal(tensor.data[1], 2, 'tensor.data[1] should be 2');
39+
assert.equal(tensor.data[2], 3, 'tensor.data[2] should be 3');
40+
assert.equal(tensor.data[3], 4, 'tensor.data[3] should be 4');
41+
assert.equal(tensor.data.length, 4, 'tensor.data.length should be 4');
42+
});
43+
});
44+
45+
(globalF16 ? describe.skip : describe)(
46+
'Tensor Constructor Tests - check type float16 (Float16Array not available)',
47+
() => {
48+
it(
49+
"[float16] new Tensor('float16', numbers, dims): " +
50+
"expect to throw because it's not allowed to construct 'float16' tensor from number array",
51+
() => {
52+
assert.throws(() => new Tensor('float16', [1, 2, 3, 4], [2, 2]), TypeError);
53+
},
54+
);
55+
56+
it("[float16] new Tensor('float16', uint16array, dims): allow Uint16Array", () => {
57+
const tensor = new Tensor('float16', new Uint16Array([15360, 16384, 16896, 17408]), [2, 2]);
58+
assert.equal(tensor.type, 'float16', "tensor.type should be 'float16'");
59+
assert(tensor.data instanceof Uint16Array, "tensor.data should be an instance of 'Uint16Array'");
60+
});
61+
},
62+
);

js/common/test/unit-tests/tensor/constructor-type.ts

-8
Original file line numberDiff line numberDiff line change
@@ -105,14 +105,6 @@ describe('Tensor Constructor Tests - check types', () => {
105105
assert(tensor.data instanceof Uint8Array, "tensor.data should be an instance of 'Uint8Array'");
106106
});
107107

108-
it(
109-
"[float16] new Tensor('float16', numbers, dims): " +
110-
"expect to throw because it's not allowed to construct 'float16' tensor from number array",
111-
() => {
112-
assert.throws(() => new Tensor('float16', [1, 2, 3, 4], [2, 2]), TypeError);
113-
},
114-
);
115-
116108
it("[badtype] new Tensor('a', numbers, dims): expect to throw because 'a' is an invalid type", () => {
117109
assert.throws(() => new TensorAny('a', [1, 2, 3, 4], [2, 2]), TypeError);
118110
});

onnxruntime/core/framework/fallback_cpu_capability.cc

+4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
// Copyright (c) Microsoft Corporation. All rights reserved.
22
// Licensed under the MIT License.
33

4+
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
5+
46
#include "core/framework/fallback_cpu_capability.h"
57
#include "core/common/inlined_containers.h"
68

@@ -176,3 +178,5 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const onnxruntime::GraphViewe
176178
}
177179

178180
} // namespace onnxruntime
181+
182+
#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)

onnxruntime/core/framework/fallback_cpu_capability.h

+4
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
#pragma once
55

6+
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
7+
68
#include <gsl/gsl>
79
#include "core/common/inlined_containers_fwd.h"
810
#include "core/framework/execution_provider.h" // for IExecutionProvider::IKernelLookup
@@ -26,3 +28,5 @@ std::unordered_set<NodeIndex> GetCpuPreferredNodes(const GraphViewer& graph,
2628
const logging::Logger& logger);
2729

2830
} // namespace onnxruntime
31+
32+
#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)

onnxruntime/core/providers/qnn/qnn_allocator.cc

+3-1
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,9 @@ void HtpSharedMemoryAllocator::Free(void* allocation_address) {
181181
// Avoid throwing exceptions as this may be running from a destructor.
182182
try {
183183
// take ownership of shared memory and free at end of scope
184-
auto shared_memory = WrapSharedMemoryWithUniquePtr(allocation_address, rpcmem_lib_->Api());
184+
const size_t allocation_offset = AllocationOffsetFromStartOfHeader();
185+
void* raw_allocation_address = (void*)((std::byte*)allocation_address - allocation_offset);
186+
auto shared_memory = WrapSharedMemoryWithUniquePtr(raw_allocation_address, rpcmem_lib_->Api());
185187

186188
// destroy header
187189
allocation_header.~AllocationHeader();

onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def get_qnn_qdq_config(
5555
stride: int | None = None,
5656
calibration_providers: list[str] | None = None,
5757
op_types_to_quantize: list[str] | None = None,
58+
nodes_to_exclude: list[str] | None = None,
5859
) -> StaticQuantConfig:
5960
"""
6061
Returns a static quantization configuration suitable for running QDQ models on QNN EP.
@@ -122,6 +123,8 @@ def get_qnn_qdq_config(
122123
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
123124
[ "CPUExecutionProvider" ].
124125
op_types_to_quantize: If set to None, all operator types will be quantized except for OP_TYPES_TO_EXCLUDE
126+
nodes_to_exclude: List of nodes names to exclude from quantization. The nodes in this list will be excluded from
127+
quantization when it is not None.
125128
126129
Returns:
127130
A StaticQuantConfig object
@@ -167,10 +170,13 @@ def get_qnn_qdq_config(
167170
)
168171

169172
op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None
173+
nodes_to_exclude_set = set(nodes_to_exclude) if nodes_to_exclude else None
170174

171175
for node in model.graph.node:
172176
if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set:
173177
continue
178+
if nodes_to_exclude_set and node.name in nodes_to_exclude_set:
179+
continue
174180
op_types.add(node.op_type)
175181
qnn_compat.process_node(node)
176182

@@ -198,9 +204,10 @@ def get_qnn_qdq_config(
198204
calibrate_method=calibrate_method,
199205
activation_type=activation_type,
200206
weight_type=weight_type,
201-
op_types_to_quantize=op_types_to_quantize
202-
if op_types_to_quantize
203-
else list(op_types.difference(OP_TYPES_TO_EXCLUDE)),
207+
op_types_to_quantize=(
208+
op_types_to_quantize if op_types_to_quantize else list(op_types.difference(OP_TYPES_TO_EXCLUDE))
209+
),
210+
nodes_to_exclude=nodes_to_exclude,
204211
per_channel=per_channel,
205212
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
206213
calibration_providers=calibration_providers,

onnxruntime/python/tools/quantization/quantize.py

+22-10
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,8 @@ def get_qdq_config(
240240
keep_removable_activations: bool = False,
241241
min_real_range: float | None = None,
242242
tensor_quant_overrides: dict[str, list[dict[str, Any]]] | None = None,
243+
calibration_providers: list[str] | None = None,
244+
op_types_to_quantize: list[str] | None = None,
243245
nodes_to_exclude: list[str] | Callable[[onnx.ModelProto, onnx.NodeProto], bool] | None = None,
244246
extra_options: dict | None = None,
245247
) -> StaticQuantConfig:
@@ -294,6 +296,10 @@ def get_qdq_config(
294296
'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
295297
other nodes get the original type. If not specified,
296298
assume all consumer nodes get the converted type.
299+
calibration_providers: Execution providers to run the session during calibration. Default is None which uses
300+
[ "CPUExecutionProvider" ].
301+
op_types_to_quantize: List of operator types to quantize. If None, all operators other than Cast, DequantizeLinear,
302+
and QuantizeLinear are quantized.
297303
nodes_to_exclude: List of nodes names to exclude from quantization. Alternatively, can provide a function that
298304
accepts an onnx.ModelProto and onnx.NodeProto as arguments and returns true if the give onnx.NodeProto
299305
should be excluded from quantization.
@@ -324,17 +330,20 @@ def get_qdq_config(
324330
if onnx.external_data_helper.uses_external_data(initializer):
325331
model_has_external_data = True
326332

327-
final_nodes_to_exclude = []
328-
if nodes_to_exclude is not None and isinstance(nodes_to_exclude, list):
329-
final_nodes_to_exclude.extend(nodes_to_exclude)
333+
op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None
334+
nodes_to_exclude_set = set(nodes_to_exclude) if isinstance(nodes_to_exclude, list) else set()
330335

331336
# Iterate through nodes to get all operator types in the model and
332337
# call user's function to filter out nodes from quantization.
333338
for node in model.graph.node:
334-
op_types.add(node.op_type)
335-
if nodes_to_exclude is not None and callable(nodes_to_exclude):
336-
if nodes_to_exclude(model, node):
337-
final_nodes_to_exclude.append(node.name)
339+
if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set:
340+
continue
341+
if node.name in nodes_to_exclude_set:
342+
continue
343+
if callable(nodes_to_exclude) and nodes_to_exclude(model, node):
344+
nodes_to_exclude_set.add(node.name)
345+
else:
346+
op_types.add(node.op_type)
338347

339348
final_extra_options = {
340349
"MinimumRealRange": min_real_range,
@@ -378,11 +387,14 @@ def get_qdq_config(
378387
quant_format=QuantFormat.QDQ,
379388
activation_type=activation_type,
380389
weight_type=weight_type,
381-
op_types_to_quantize=list(op_types.difference(op_types_to_exclude)),
382-
nodes_to_exclude=final_nodes_to_exclude,
390+
op_types_to_quantize=(
391+
op_types_to_quantize if op_types_to_quantize else list(op_types.difference(op_types_to_exclude))
392+
),
393+
nodes_to_exclude=list(nodes_to_exclude_set),
383394
per_channel=per_channel,
384395
reduce_range=reduce_range,
385396
use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
397+
calibration_providers=calibration_providers,
386398
extra_options=final_extra_options,
387399
)
388400

@@ -442,7 +454,7 @@ def check_static_quant_arguments(quant_format: QuantFormat, activation_type: Qua
442454
if activation_type != QuantType.QFLOAT8E4M3FN and weight_type == QuantType.QFLOAT8E4M3FN:
443455
raise ValueError(
444456
f"ONNXRuntime quantization doesn't support data format: activation_type={activation_type} "
445-
f"!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN."
457+
"!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN."
446458
)
447459

448460
if activation_type == QuantType.QFLOAT8E4M3FN and weight_type != QuantType.QFLOAT8E4M3FN:

0 commit comments

Comments (0)