pytorch · vfdev-5 · Nov 3, 2022 · Nov 1, 2022 · Nov 1, 2022 · Nov 1, 2022
diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py
@@ -25,7 +25,7 @@
 )
 from torch.utils._pytree import tree_map
 from torchvision.prototype import features
-from torchvision.transforms.functional_tensor import _max_value as get_max_value
+from torchvision.transforms.functional_tensor import _max_value as get_max_value, _parse_pad_padding
 
 __all__ = ["KernelInfo", "KERNEL_INFOS"]
 
@@ -1078,6 +1078,38 @@ def sample_inputs_pad_video():
         yield ArgsKwargs(video_loader, padding=[1])
 
 
+def reference_pad_bounding_box(bounding_box, *, format, spatial_size, padding, padding_mode):
+
+    left, right, top, bottom = _parse_pad_padding(padding)
+
+    affine_matrix = np.array(
+        [
+            [1, 0, left],
+            [0, 1, top],
+        ],
+        dtype="float32",
+    )
+
+    height = spatial_size[0] + top + bottom
+    width = spatial_size[1] + left + right
+
+    expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
+    return expected_bboxes, (height, width)
+
+
+def reference_inputs_pad_bounding_box():
+    for bounding_box_loader, padding in itertools.product(
+        make_bounding_box_loaders(extra_dims=((), (4,))), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]]
+    ):
+        yield ArgsKwargs(
+            bounding_box_loader,
+            format=bounding_box_loader.format,
+            spatial_size=bounding_box_loader.spatial_size,
+            padding=padding,
+            padding_mode="constant",
+        )
+
+
 KERNEL_INFOS.extend(
     [
         KernelInfo(
@@ -1097,6 +1129,8 @@ def sample_inputs_pad_video():
         KernelInfo(
             F.pad_bounding_box,
             sample_inputs_fn=sample_inputs_pad_bounding_box,
+            reference_fn=reference_pad_bounding_box,
+            reference_inputs_fn=reference_inputs_pad_bounding_box,
             test_marks=[
                 xfail_jit_python_scalar_arg("padding"),
                 xfail_jit_tuple_instead_of_list("padding"),

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
@@ -753,11 +753,11 @@ def pad_bounding_box(
     bounding_box = bounding_box.clone()
 
     # this works without conversion since padding only affects xy coordinates
-    bounding_box[..., 0] += left
-    bounding_box[..., 1] += top
     if format == features.BoundingBoxFormat.XYXY:
-        bounding_box[..., 2] += left
-        bounding_box[..., 3] += top
+        pad = torch.tensor([left, top, left, top], device=bounding_box.device)
+    else:
+        pad = torch.tensor([left, top, 0, 0], device=bounding_box.device)
+    bounding_box = bounding_box.add_(pad)
 
     height, width = spatial_size
     height += top + bottom
@@ -914,16 +914,16 @@ def perspective_bounding_box(
         (-perspective_coeffs[0] * perspective_coeffs[7] + perspective_coeffs[1] * perspective_coeffs[6]) / denom,
     ]
 
-    theta1 = torch.tensor(
-        [[inv_coeffs[0], inv_coeffs[1], inv_coeffs[2]], [inv_coeffs[3], inv_coeffs[4], inv_coeffs[5]]],
+    theta12_T = torch.tensor(
+        [
+            [inv_coeffs[0], inv_coeffs[3], inv_coeffs[6], inv_coeffs[6]],
+            [inv_coeffs[1], inv_coeffs[4], inv_coeffs[7], inv_coeffs[7]],
+            [inv_coeffs[2], inv_coeffs[5], 1.0, 1.0],
+        ],
         dtype=dtype,
         device=device,
     )
 
-    theta2 = torch.tensor(
-        [[inv_coeffs[6], inv_coeffs[7], 1.0], [inv_coeffs[6], inv_coeffs[7], 1.0]], dtype=dtype, device=device
-    )
-
     # 1) Let's transform bboxes into a tensor of 4 points (top-left, top-right, bottom-left, bottom-right corners).
     # Tensor of points has shape (N * 4, 3), where N is the number of bboxes
     # Single point structure is similar to
@@ -934,15 +934,16 @@ def perspective_bounding_box(
     #   x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
     #   y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1)
 
-    numer_points = torch.matmul(points, theta1.T)
-    denom_points = torch.matmul(points, theta2.T)
-    transformed_points = numer_points / denom_points
+    numer_denom_points = torch.matmul(points, theta12_T)
+    numer_points = numer_denom_points[:, :2]
+    denom_points = numer_denom_points[:, 2:]
+    transformed_points = numer_points.div_(denom_points)
 
     # 3) Reshape transformed points to [N boxes, 4 points, x/y coords]
     # and compute bounding box from 4 transformed points:
     transformed_points = transformed_points.reshape(-1, 4, 2)
-    out_bbox_mins, _ = torch.min(transformed_points, dim=1)
-    out_bbox_maxs, _ = torch.max(transformed_points, dim=1)
+    out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
+
     out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype)
 
     # out_bboxes should be of shape [N boxes, 4]