Commit 3269bd6

Merge pull request #654 from robertknight/fix-pytorch-half-pixel
Fix `pytorch_half_pixel` resize mode in Resize op
2 parents: 0286ca5 + 1b8663e

File tree

4 files changed (+175 −41 lines):

rten-examples/src/deeplab_reference.py
rten-examples/src/export-deeplab.py
src/ops/resize.rs
tools/compare-tensors.py

Diff for: rten-examples/src/deeplab_reference.py (new file, +86 lines)

# Reference inference for DeepLab example using ONNX Runtime.
#
# To use this, first export the DeepLab model then run inference:
#
# ```
# python export-deeplab.py
# python deeplab_reference.py deeplab.onnx path/to/test_image.jpeg
# ```
#
# This will produce an `out_reference.png` image containing the segmentation map.
from argparse import ArgumentParser

from PIL import Image
import numpy as np
import onnxruntime

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD_DEV = [0.229, 0.224, 0.225]

# Labels and colors for the different categories of object that DeepLabv3 can
# detect.
#
# For the labels, see https://github.com/NVIDIA/DIGITS/blob/master/examples/semantic-segmentation/pascal-voc-classes.txt.
PASCAL_VOC_LABELS = [
    ("background", (0.0, 0.0, 0.0)),  # Black
    ("aeroplane", (0.0, 1.0, 0.0)),  # Green
    ("bicycle", (0.0, 0.0, 1.0)),  # Blue
    ("bird", (1.0, 1.0, 0.0)),  # Yellow
    ("boat", (1.0, 0.0, 1.0)),  # Magenta
    ("bottle", (0.0, 1.0, 1.0)),  # Cyan
    ("bus", (0.5, 0.0, 0.0)),  # Dark Red
    ("car", (0.0, 0.5, 0.0)),  # Dark Green
    ("cat", (0.0, 0.0, 0.5)),  # Dark Blue
    ("chair", (0.5, 0.5, 0.0)),  # Olive
    ("cow", (0.5, 0.0, 0.5)),  # Purple
    ("diningtable", (0.0, 0.5, 0.5)),  # Teal
    ("dog", (0.75, 0.75, 0.75)),  # Light Gray
    ("horse", (0.5, 0.5, 0.5)),  # Gray
    ("motorbike", (0.25, 0.25, 0.25)),  # Dark Gray
    ("person", (1.0, 0.5, 0.0)),  # Orange
    ("pottedplant", (0.5, 1.0, 0.5)),  # Pastel Green
    ("sheep", (0.5, 0.5, 1.0)),  # Pastel Blue
    ("sofa", (1.0, 0.75, 0.8)),  # Pink
    ("train", (0.64, 0.16, 0.16)),  # Brown
    ("tvmonitor", (1.0, 1.0, 1.0)),  # White
]

parser = ArgumentParser()
parser.add_argument("model", help="Path to DeepLab ONNX model")
parser.add_argument("image", help="Image to segment")
args = parser.parse_args()

session = onnxruntime.InferenceSession(args.model)

# Input image size expected by model
input_width = 693
input_height = 520

# Load image, normalize and convert to NCHW layout
image = Image.open(args.image)
image = image.resize([input_width, input_height])
image = np.asarray(image).astype("float32") / 255.0
image = np.transpose(image, (2, 0, 1))  # HWC => CHW

norm_mean = np.array(IMAGENET_MEAN, dtype="float32").reshape(-1, 1, 1)
norm_std_dev = np.array(IMAGENET_STD_DEV, dtype="float32").reshape(-1, 1, 1)
image = (image - norm_mean) / norm_std_dev
image = np.expand_dims(image, axis=0)  # Insert batch dim

# Segment image, producing an HW tensor containing the class index for each pixel.
seg_classes = session.run(["output"], {"input": image})[0]
seg_classes = np.transpose(seg_classes, (0, 2, 3, 1))  # (N,class,H,W) => (N,H,W,class)
seg_classes = np.argmax(seg_classes[0], axis=-1)

# Produce a segmentation map with pixels colored based on predicted class for
# each pixel.
out_height, out_width = seg_classes.shape
seg_map = np.zeros((out_height, out_width, 3), dtype="float32")
for cls_id, cls_info in enumerate(PASCAL_VOC_LABELS):
    cls_name, cls_color = cls_info
    cls_mask = seg_classes == cls_id
    for chan in range(3):
        seg_map[cls_mask, chan] = cls_color[chan]

out_im = Image.fromarray(np.uint8(seg_map * 255))
out_im.save("out_reference.png")
Diff for: rten-examples/src/export-deeplab.py (+30 −8)

@@ -1,3 +1,5 @@
+from argparse import ArgumentParser
+
 import torch
 from torchvision.models.segmentation import (
     deeplabv3_mobilenet_v3_large,
@@ -16,12 +18,32 @@
 img = torch.rand((3, 480, 640))
 batch = preprocess(img).unsqueeze(0)

-# Export to ONNX
-torch.onnx.export(
-    model,
-    args=(batch),
-    f="deeplab.onnx",
-    verbose=False,
-    input_names=["input"],
-    output_names=["output"],
+parser = ArgumentParser()
+parser.add_argument("-f", "--filename", default="deeplab.onnx")
+parser.add_argument(
+    "--dynamo", action="store_true", help="Use TorchDynamo-based exporter"
 )
+args = parser.parse_args()
+
+if args.dynamo:
+    print("Exporting model using TorchDynamo...")
+    onnx_prog = torch.onnx.export(
+        model,
+        args=(batch),
+        verbose=False,
+        input_names=["input"],
+        output_names=["output"],
+        dynamo=True,
+    )
+    onnx_prog.optimize()
+    onnx_prog.save(args.filename)
+else:
+    print("Exporting model using TorchScript...")
+    torch.onnx.export(
+        model,
+        args=(batch),
+        f=args.filename,
+        verbose=False,
+        input_names=["input"],
+        output_names=["output"],
+    )
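
After exporting with either path, a quick sanity check (a minimal sketch, not part of this PR) is to open the model with ONNX Runtime and confirm the graph exposes the `input`/`output` names that `deeplab_reference.py` relies on; `deeplab.onnx` is the script's default output filename:

# Illustrative check only: verify the exported graph I/O names.
import onnxruntime

session = onnxruntime.InferenceSession("deeplab.onnx")
print([inp.name for inp in session.get_inputs()])   # expected: ["input"]
print([out.name for out in session.get_outputs()])  # expected: ["output"]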

Diff for: src/ops/resize.rs (+37 −31)

@@ -31,7 +31,9 @@ pub enum ResizeTarget<'a> {
 /// - `length_resized` is the size of the axis in the output
 ///
 /// See https://github.com/onnx/onnx/blob/v1.15.0/docs/Operators.md#resize
-/// for the formulae for different transform modes.
+/// for the formulae for different transform modes. Note that `scale` here is
+/// the inverse of the `scale` used in the spec, in order to replace division
+/// with multiplication.
 ///
 /// The default is half pixel, and is is consistent with how OpenCV
 /// (`cv2.resize`) and PyTorch (`torch.nn.functional.interpolate`) work. See
@@ -58,7 +60,7 @@ fn input_coord(
     // PyTorch behavior). This implementation does however match
     // ONNX Runtime (https://github.com/microsoft/onnxruntime/blob/24620e70d9f14956a0dc84bb8a332dcd64c95a94/onnxruntime/core/providers/cpu/tensor/upsamplebase.h#L331)
     if length_resized > 1 {
-        (dest_coord as f32 + 0.5) / scale - 0.5
+        scale * (dest_coord as f32 + 0.5) - 0.5
     } else {
         0.
     }
@@ -692,29 +694,35 @@ mod tests {
                 image,
                 scales: vec![1., 1., 1.5, 1.5],
                 coord_transform_mode: None,
-                expected: Tensor::from_data(
-                    &[1, 1, 3, 3],
-                    vec![
-                        0.2, 0.45, 0.7, // Y=0
-                        0.25, 0.5, 0.75, // Y=1
-                        0.3, 0.55, 0.8, // Y=2
-                    ],
-                ),
+                expected: Tensor::from([[0.2, 0.45, 0.7], [0.25, 0.5, 0.75], [0.3, 0.55, 0.8]])
+                    .into_shape([1, 1, 3, 3].as_slice()),
             },
-            // Scale width and height by 2x
+            // Scale width and height by 2x, using `half_pixel`.
             Case {
                 image,
                 scales: vec![1., 1., 2., 2.],
                 coord_transform_mode: None,
-                expected: Tensor::from_data(
-                    &[1, 1, 4, 4],
-                    vec![
-                        0.2, 0.325, 0.575, 0.7, // Y=0
-                        0.225, 0.35, 0.6, 0.725, // Y=1
-                        0.275, 0.4, 0.65, 0.775, // Y=2
-                        0.3, 0.425, 0.675, 0.8, // Y=3
-                    ],
-                ),
+                expected: Tensor::from([
+                    [0.2, 0.325, 0.575, 0.7],
+                    [0.225, 0.35, 0.6, 0.725],
+                    [0.275, 0.4, 0.65, 0.775],
+                    [0.3, 0.425, 0.675, 0.8],
+                ])
+                .into_shape([1, 1, 4, 4].as_slice()),
+            },
+            // Scale width and height by 2x, using `pytorch_half_pixel`. This
+            // should give the same result as for `half_pixel`.
+            Case {
+                image,
+                scales: vec![1., 1., 2., 2.],
+                coord_transform_mode: Some(CoordTransformMode::PytorchHalfPixel),
+                expected: Tensor::from([
+                    [0.2, 0.325, 0.575, 0.7],
+                    [0.225, 0.35, 0.6, 0.725],
+                    [0.275, 0.4, 0.65, 0.775],
+                    [0.3, 0.425, 0.675, 0.8],
+                ])
+                .into_shape([1, 1, 4, 4].as_slice()),
             },
             // Scale width and height by 2x, align corners.
             Case {
@@ -737,17 +745,15 @@
                 image,
                 scales: vec![1., 1., 3., 3.],
                 coord_transform_mode: None,
-                expected: Tensor::from_data(
-                    &[1, 1, 6, 6],
-                    vec![
-                        0.2000, 0.2000, 0.3667, 0.5333, 0.7000, 0.7000, // Y=0
-                        0.2000, 0.2000, 0.3667, 0.5333, 0.7000, 0.7000, // Y=1
-                        0.2333, 0.2333, 0.4000, 0.5667, 0.7333, 0.7333, // Y=2
-                        0.2667, 0.2667, 0.4333, 0.6000, 0.7667, 0.7667, // Y=3
-                        0.3000, 0.3000, 0.4667, 0.6333, 0.8000, 0.8000, // Y=4
-                        0.3000, 0.3000, 0.4667, 0.6333, 0.8000, 0.8000, // Y=5
-                    ],
-                ),
+                expected: Tensor::from([
+                    [0.2000, 0.2000, 0.3667, 0.5333, 0.7000, 0.7000],
+                    [0.2000, 0.2000, 0.3667, 0.5333, 0.7000, 0.7000],
+                    [0.2333, 0.2333, 0.4000, 0.5667, 0.7333, 0.7333],
+                    [0.2667, 0.2667, 0.4333, 0.6000, 0.7667, 0.7667],
+                    [0.3000, 0.3000, 0.4667, 0.6333, 0.8000, 0.8000],
+                    [0.3000, 0.3000, 0.4667, 0.6333, 0.8000, 0.8000],
+                ])
+                .into_shape([1, 1, 6, 6].as_slice()),
             },
         ];
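
For reference, a minimal Python sketch (not part of the PR; the function names are illustrative) of the two coordinate transforms involved in this fix. Both map output pixel centers back to input coordinates and differ only when the output axis has length 1, which is why the new `PytorchHalfPixel` test case expects the same values as the `half_pixel` case:

# Illustrative sketch of the ONNX Resize coordinate transforms. Here `scale`
# is output_size / input_size as in the ONNX spec; the Rust code stores the
# inverse so it can multiply instead of divide.

def half_pixel(dest_coord: int, scale: float) -> float:
    return (dest_coord + 0.5) / scale - 0.5


def pytorch_half_pixel(dest_coord: int, scale: float, length_resized: int) -> float:
    # Same as half_pixel, except that a length-1 output axis maps to coordinate 0.
    if length_resized > 1:
        return (dest_coord + 0.5) / scale - 0.5
    return 0.0


# 2x upscale of an axis to 4 output pixels: the two modes agree.
print([half_pixel(d, 2.0) for d in range(4)])             # [-0.25, 0.25, 0.75, 1.25]
print([pytorch_half_pixel(d, 2.0, 4) for d in range(4)])  # [-0.25, 0.25, 0.75, 1.25]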

Diff for: tools/compare-tensors.py (+22 −2)

@@ -1,18 +1,38 @@
 from argparse import ArgumentParser
+import json
 import sys

 import numpy as np

 from debug_utils import read_tensor

+def read_json_tensor(path: str):
+    """
+    Load a tensor from a JSON file.
+
+    The JSON data format is `{ "data": [elements...], "shape": [dims...] }`.
+    This matches rten-tensor's serde serialization for the `Tensor` type.
+    """
+    with open(path) as tensor_fp:
+        tensor_json = json.load(tensor_fp)
+    return np.array(tensor_json["data"]).reshape(tensor_json["shape"])
+
+
 def main():
     parser = ArgumentParser(description="Compare two binary tensors")
     parser.add_argument('tensor_a', help="File containing first tensor")
     parser.add_argument('tensor_b', help="File containing second_tensor")
     args = parser.parse_args()

-    x = read_tensor(args.tensor_a)
-    y = read_tensor(args.tensor_b)
+    if args.tensor_a.endswith(".json"):
+        x = read_json_tensor(args.tensor_a)
+    else:
+        x = read_tensor(args.tensor_a)
+
+    if args.tensor_b.endswith(".json"):
+        y = read_json_tensor(args.tensor_b)
+    else:
+        y = read_tensor(args.tensor_b)

     print(f"X shape {x.shape} Y shape {y.shape}")
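
The JSON path makes it easy to diff a tensor dumped from Rust (via rten-tensor's serde output) against a NumPy result. A minimal sketch, using a hypothetical helper name, of producing a file in the layout `read_json_tensor` expects:

# Illustrative helper (not part of this PR): write a NumPy array in the
# `{ "data": [...], "shape": [...] }` layout that read_json_tensor() accepts.
import json

import numpy as np


def write_json_tensor(path: str, tensor: np.ndarray) -> None:
    with open(path, "w") as fp:
        json.dump({"data": tensor.flatten().tolist(), "shape": list(tensor.shape)}, fp)


write_json_tensor("expected.json", np.arange(6, dtype="float32").reshape(2, 3))
# Then, for example: python tools/compare-tensors.py expected.json other_tensor.json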
