Skip to content

Commit a316188

Browse files
egalliashrit-ms
authored and committed
[WebNN EP] Automatically use ml-tensor for outputs (#24282)
### Description If it would improve performance, this patch moves outputs to MLTensor backed Tensors. ### Motivation and Context We are currently performing an extra copy on output tensors located in the CPU when using the WebNN EP (MLTensor -(copy)-> wasm heap -(copy)-> JS). This patch removes this copy by moving the readback to JS instead of wasm. As an extra benefit, we can also start the readbacks and wait for them in parallel. This change is similar to #23073
1 parent 63b8670 commit a316188

File tree

5 files changed

+87
-6
lines changed

5 files changed

+87
-6
lines changed

js/web/lib/wasm/jsep/backend-webnn.ts

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -79,11 +79,20 @@ export class WebNNBackend {
7979
* Maps from session id to list of graph inputs.
8080
*/
8181
private sessionGraphInputs: Map<number, string[]> = new Map();
82+
/**
83+
* Maps from session id to list of graph outputs.
84+
*/
85+
private sessionGraphOutputs: Map<number, string[]> = new Map();
8286
/**
8387
* Temporary graph inputs for the current session.
8488
* These inputs will be registered when the session is created.
8589
*/
8690
private temporaryGraphInputs: string[] = [];
91+
/**
92+
* Temporary graph outputs for the current session.
93+
* These outputs will be registered when the session is created.
94+
*/
95+
private temporaryGraphOutputs: string[] = [];
8796
/**
8897
* Temporary tensors for the current session.
8998
*/
@@ -167,10 +176,15 @@ export class WebNNBackend {
167176
this.sessionGraphInputs.set(sessionId, this.temporaryGraphInputs);
168177
this.temporaryGraphInputs = [];
169178
}
179+
if (this.temporaryGraphOutputs.length > 0) {
180+
this.sessionGraphOutputs.set(sessionId, this.temporaryGraphOutputs);
181+
this.temporaryGraphOutputs = [];
182+
}
170183
}
171184

172185
public onReleaseSession(sessionId: number): void {
173186
this.sessionGraphInputs.delete(sessionId);
187+
this.sessionGraphOutputs.delete(sessionId);
174188
const mlContext = this.mlContextBySessionId.get(sessionId)!;
175189
if (!mlContext) {
176190
// Current session is not a WebNN session.
@@ -363,6 +377,10 @@ export class WebNNBackend {
363377
this.temporaryGraphInputs.push(inputName);
364378
}
365379

380+
public registerGraphOutput(outputName: string): void {
381+
this.temporaryGraphOutputs.push(outputName);
382+
}
383+
366384
public isGraphInput(sessionId: number, inputName: string): boolean {
367385
const inputNames = this.sessionGraphInputs.get(sessionId);
368386
if (!inputNames) {
@@ -371,6 +389,14 @@ export class WebNNBackend {
371389
return inputNames.includes(inputName);
372390
}
373391

392+
public isGraphOutput(sessionId: number, outputName: string): boolean {
393+
const outputNames = this.sessionGraphOutputs.get(sessionId);
394+
if (!outputNames) {
395+
return false;
396+
}
397+
return outputNames.includes(outputName);
398+
}
399+
374400
public isInt64Supported(sessionId: number): boolean {
375401
const context = this.mlContextBySessionId.get(sessionId);
376402
return !!context?.opSupportLimits().input.dataTypes.includes('int64');

js/web/lib/wasm/wasm-core-impl.ts

Lines changed: 44 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -172,7 +172,13 @@ export const initEp = async (env: Env, epName: string): Promise<void> => {
172172
/**
173173
* valid data locations for input/output tensors.
174174
*/
175-
type SupportedTensorDataLocationForInputOutput = 'cpu' | 'cpu-pinned' | 'gpu-buffer' | 'ml-tensor';
175+
type SupportedTensorDataLocationForInputOutput =
176+
| 'cpu'
177+
| 'cpu-pinned'
178+
| 'gpu-buffer'
179+
| 'ml-tensor'
180+
// Use 'ml-tensor' during inference, but output a tensor located on the CPU.
181+
| 'ml-tensor-cpu-output';
176182

177183
type IOBindingState = {
178184
/**
@@ -424,6 +430,11 @@ export const createSession = async (
424430
typeof options?.preferredOutputLocation === 'string'
425431
? options.preferredOutputLocation
426432
: (options?.preferredOutputLocation?.[nameString] ?? 'cpu');
433+
const isGraphOutput = wasm.webnnIsGraphOutput;
434+
if (location === 'cpu' && isGraphOutput && isGraphOutput(sessionHandle, nameString)) {
435+
outputPreferredLocations.push('ml-tensor-cpu-output');
436+
continue;
437+
}
427438
if (location !== 'cpu' && location !== 'cpu-pinned' && location !== 'gpu-buffer' && location !== 'ml-tensor') {
428439
throw new Error(`Not supported preferred output location: ${location}.`);
429440
}
@@ -438,7 +449,10 @@ export const createSession = async (
438449

439450
// use IO binding only when at least one output is preferred to be on GPU.
440451
let bindingState: IOBindingState | null = null;
441-
if (!BUILD_DEFS.DISABLE_JSEP && outputPreferredLocations.some((l) => l === 'gpu-buffer' || l === 'ml-tensor')) {
452+
if (
453+
!BUILD_DEFS.DISABLE_JSEP &&
454+
outputPreferredLocations.some((l) => l === 'gpu-buffer' || l === 'ml-tensor' || l === 'ml-tensor-cpu-output')
455+
) {
442456
ioBindingHandle = wasm._OrtCreateBinding(sessionHandle);
443457
if (ioBindingHandle === 0) {
444458
checkLastError("Can't create IO binding.");
@@ -447,7 +461,10 @@ export const createSession = async (
447461
bindingState = {
448462
handle: ioBindingHandle,
449463
outputPreferredLocations,
450-
outputPreferredLocationsEncoded: outputPreferredLocations.map((l) => dataLocationStringToEnum(l)),
464+
outputPreferredLocationsEncoded: outputPreferredLocations
465+
// 'ml-tensor-cpu-output' is treated as 'ml-tensor' for the purpose of IO binding.
466+
.map((l) => (l === 'ml-tensor-cpu-output' ? 'ml-tensor' : l))
467+
.map((l) => dataLocationStringToEnum(l)),
451468
};
452469
}
453470

@@ -599,10 +616,11 @@ export const prepareInputOutputTensor = async (
599616
}
600617
} else {
601618
const isGraphInput = wasm.webnnIsGraphInput;
602-
if (dataType !== 'string' && isGraphInput) {
619+
const isGraphOutput = wasm.webnnIsGraphOutput;
620+
if (dataType !== 'string' && isGraphInput && isGraphOutput) {
603621
const tensorName = wasm.UTF8ToString(tensorNameUTF8Encoded);
604622
// Promote the tensor to 'ml-tensor' if it is a graph input.
605-
if (isGraphInput(sessionId, tensorName)) {
623+
if (isGraphInput(sessionId, tensorName) || isGraphOutput(sessionId, tensorName)) {
606624
const dataTypeEnum = tensorDataTypeStringToEnum(dataType);
607625
dataByteLength = calculateTensorSizeInBytes(dataTypeEnum, dims)!;
608626
actualLocation = 'ml-tensor';
@@ -810,6 +828,7 @@ export const run = async (
810828
}
811829

812830
const output: TensorMetadata[] = [];
831+
const outputPromises: Array<Promise<[number, Tensor.DataType]>> = [];
813832

814833
for (let i = 0; i < outputCount; i++) {
815834
const tensor = Number(wasm.getValue(outputValuesOffset + i * ptrSize, '*'));
@@ -958,6 +977,20 @@ export const run = async (
958977
},
959978
'ml-tensor',
960979
]);
980+
} else if (preferredLocation === 'ml-tensor-cpu-output' && size > 0) {
981+
const data = wasm.webnnCreateMLTensorDownloader!(dataOffset, type as Tensor.MLTensorDataTypes)();
982+
const index = output.length;
983+
// Delay the data download and releasing the tensor until we can wait for all output tensors to be downloaded.
984+
keepOutputTensor = true;
985+
outputPromises.push(
986+
(async () => {
987+
const result: [number, Tensor.DataType] = [index, await data];
988+
wasm.webnnReleaseTensorId!(dataOffset);
989+
wasm._OrtReleaseTensor(tensor);
990+
return result;
991+
})(),
992+
);
993+
output.push([type, dims, [], 'cpu']);
961994
} else {
962995
const typedArrayConstructor = tensorTypeToTypedArrayConstructor(type);
963996
const data = new typedArrayConstructor(size);
@@ -975,7 +1008,6 @@ export const run = async (
9751008
if (!keepOutputTensor) {
9761009
wasm._OrtReleaseTensor(tensor);
9771010
}
978-
wasm.webnnOnRunEnd?.(sessionHandle);
9791011
}
9801012
}
9811013

@@ -992,8 +1024,14 @@ export const run = async (
9921024
false,
9931025
]);
9941026
}
1027+
// Wait for all output tensor data to be downloaded.
1028+
for (const [index, data] of await Promise.all(outputPromises)) {
1029+
output[index][2] = data;
1030+
}
9951031
return output;
9961032
} finally {
1033+
wasm.webnnOnRunEnd?.(sessionHandle);
1034+
9971035
wasm.stackRestore(beforeRunStack);
9981036

9991037
if (BUILD_DEFS.USE_WEBGPU_EP) {

js/web/lib/wasm/wasm-types.ts

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -287,6 +287,19 @@ export declare namespace JSEP {
287287
* @returns whether the input is a WebNN graph input.
288288
*/
289289
webnnIsGraphInput: (sessionId: number, inputName: string) => boolean;
290+
/**
291+
* [exported from pre-jsep.js] Register a WebNN graph output.
292+
* @param outputName - specify the output name.
293+
*/
294+
webnnRegisterGraphOutput: (outputName: string) => void;
295+
/**
296+
* [exported from pre-jsep.js] Check if a graph output is a WebNN graph output.
297+
* @param sessionId - specify the session ID.
298+
* @param outputName - specify the output name.
299+
* @returns whether the output is a WebNN graph output.
300+
*/
301+
webnnIsGraphOutput: (sessionId: number, outputName: string) => boolean;
302+
290303
/**
291304
* [exported from pre-jsep.js] Create a temporary MLTensor for a session.
292305
* @param sessionId - specify the session ID.

onnxruntime/core/providers/webnn/builders/model_builder.cc

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -300,6 +300,7 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
300300
emscripten::val::module_property("webnnRegisterGraphInput")(name);
301301
input_names_.push_back(name);
302302
} else {
303+
emscripten::val::module_property("webnnRegisterGraphOutput")(name);
303304
output_names_.push_back(name);
304305
}
305306

onnxruntime/wasm/pre-jsep.js

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -151,6 +151,9 @@ Module["jsepInit"] = (name, params) => {
151151
Module["webnnRegisterGraphInput"] =
152152
backend["registerGraphInput"].bind(backend);
153153
Module["webnnIsGraphInput"] = backend["isGraphInput"].bind(backend);
154+
Module["webnnRegisterGraphOutput"] =
155+
backend["registerGraphOutput"].bind(backend);
156+
Module["webnnIsGraphOutput"] = backend["isGraphOutput"].bind(backend);
154157

155158
Module["webnnCreateTemporaryTensor"] =
156159
backend["createTemporaryTensor"].bind(backend);

0 commit comments

Comments (0)