-
Notifications
You must be signed in to change notification settings - Fork 7.1k
Rotated bboxes transforms #9084
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
734aed2
9827ab6
87a238c
95ed7cf
a7d07dc
3996daa
3b4100c
e223c6f
36b02dd
57f2452
a15a057
4bde5e5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -560,6 +560,78 @@ def affine_bounding_boxes(bounding_boxes): | |
) | ||
|
||
|
||
def reference_affine_rotated_bounding_boxes_helper(bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True):
    """Reference implementation of an affine transform on *rotated* bounding boxes.

    Applies ``affine_matrix`` (a 2x3 numpy array) to every box in ``bounding_boxes``
    by transforming the four corner points explicitly, and returns a new
    ``tv_tensors.BoundingBoxes`` in the same format as the input.

    Args:
        bounding_boxes: input boxes in any rotated format (or XYXYXYXY).
        affine_matrix: 2x3 affine matrix applied to homogeneous corner points.
        new_canvas_size: optional (H, W) of the output canvas; defaults to the input's.
        clamp: if True, clamp to the canvas and cast back to the input dtype;
            if False, keep full float precision so the caller can post-process.
    """
    format = bounding_boxes.format
    canvas_size = new_canvas_size or bounding_boxes.canvas_size

    def affine_rotated_bounding_boxes(bounding_boxes):
        dtype = bounding_boxes.dtype
        device = bounding_boxes.device

        # Go to float before converting to prevent precision loss in case of CXCYWHR -> XYXYXYXY and W or H is 1
        input_xyxyxyxy = F.convert_bounding_box_format(
            bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True),
            old_format=format,
            new_format=tv_tensors.BoundingBoxFormat.XYXYXYXY,
            inplace=True,
        )
        x1, y1, x2, y2, x3, y3, x4, y4 = input_xyxyxyxy.squeeze(0).tolist()

        # Homogeneous coordinates so the 2x3 affine matrix can translate as well.
        points = np.array(
            [
                [x1, y1, 1.0],
                [x2, y2, 1.0],
                [x3, y3, 1.0],
                [x4, y4, 1.0],
            ]
        )
        transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T)
        # Use torch.tensor with an explicit float64 dtype: the legacy torch.Tensor(...)
        # constructor would silently downcast to float32 and defeat the float64
        # round-trip established above.
        # The corner order is permuted (1, 0, 3, 2) to keep the polygon winding
        # consistent after the transform.
        output = torch.tensor(
            [
                float(transformed_points[1, 0]),
                float(transformed_points[1, 1]),
                float(transformed_points[0, 0]),
                float(transformed_points[0, 1]),
                float(transformed_points[3, 0]),
                float(transformed_points[3, 1]),
                float(transformed_points[2, 0]),
                float(transformed_points[2, 1]),
            ],
            dtype=torch.float64,
        )

        output = F.convert_bounding_box_format(
            output, old_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, new_format=format
        )

        if clamp:
            # It is important to clamp before casting, especially for CXCYWHR format, dtype=int64
            output = F.clamp_bounding_boxes(
                output,
                format=format,
                canvas_size=canvas_size,
            )
        else:
            # We leave the bounding box as float so the caller gets the full precision
            # to perform any additional operation
            dtype = output.dtype

        return output.to(dtype=dtype, device=device)

    return tv_tensors.BoundingBoxes(
        torch.cat(
            [
                affine_rotated_bounding_boxes(b)
                for b in bounding_boxes.reshape(
                    -1, 5 if format != tv_tensors.BoundingBoxFormat.XYXYXYXY else 8
                ).unbind()
            ],
            dim=0,
        ).reshape(bounding_boxes.shape),
        format=format,
        canvas_size=canvas_size,
    )
|
||
|
||
class TestResize: | ||
INPUT_SIZE = (17, 11) | ||
OUTPUT_SIZES = [17, [17], (17,), None, [12, 13], (12, 13)] | ||
|
@@ -1012,7 +1084,7 @@ class TestHorizontalFlip: | |
def test_kernel_image(self, dtype, device): | ||
check_kernel(F.horizontal_flip_image, make_image(dtype=dtype, device=device)) | ||
|
||
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS) | ||
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) | ||
@pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) | ||
@pytest.mark.parametrize("device", cpu_and_cuda()) | ||
def test_kernel_bounding_boxes(self, format, dtype, device): | ||
|
@@ -1071,17 +1143,22 @@ def test_image_correctness(self, fn): | |
|
||
torch.testing.assert_close(actual, expected) | ||
|
||
def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes):
    """Expected result of a horizontal flip, computed via the affine reference helpers."""
    # canvas_size is (H, W); flipping maps x -> W - x while leaving y untouched.
    _, width = bounding_boxes.canvas_size
    flip_matrix = np.array(
        [
            [-1, 0, width],
            [0, 1, 0],
        ],
    )

    # Rotated box formats need the rotated-aware reference implementation.
    if tv_tensors.is_rotated_bounding_format(bounding_boxes.format):
        return reference_affine_rotated_bounding_boxes_helper(bounding_boxes, affine_matrix=flip_matrix)
    return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=flip_matrix)
|
||
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS) | ||
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) | ||
@pytest.mark.parametrize( | ||
"fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] | ||
) | ||
|
@@ -1464,7 +1541,7 @@ class TestVerticalFlip: | |
def test_kernel_image(self, dtype, device): | ||
check_kernel(F.vertical_flip_image, make_image(dtype=dtype, device=device)) | ||
|
||
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS) | ||
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) | ||
@pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) | ||
@pytest.mark.parametrize("device", cpu_and_cuda()) | ||
def test_kernel_bounding_boxes(self, format, dtype, device): | ||
|
@@ -1521,17 +1598,22 @@ def test_image_correctness(self, fn): | |
|
||
torch.testing.assert_close(actual, expected) | ||
|
||
def _reference_vertical_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes):
    """Expected result of a vertical flip, computed via the affine reference helpers."""
    # canvas_size is (H, W); flipping maps y -> H - y while leaving x untouched.
    height, _ = bounding_boxes.canvas_size
    flip_matrix = np.array(
        [
            [1, 0, 0],
            [0, -1, height],
        ],
    )

    # Rotated box formats need the rotated-aware reference implementation.
    if tv_tensors.is_rotated_bounding_format(bounding_boxes.format):
        return reference_affine_rotated_bounding_boxes_helper(bounding_boxes, affine_matrix=flip_matrix)
    return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=flip_matrix)
|
||
@pytest.mark.parametrize("format", SUPPORTED_BOX_FORMATS) | ||
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) | ||
@pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) | ||
def test_bounding_boxes_correctness(self, format, fn): | ||
bounding_boxes = make_bounding_boxes(format=format) | ||
|
Uh oh!
There was an error while loading. Please reload this page.