Skip to content

Commit cbfd2c1

Browse files
Borda and codex committed
feat: store keypoints on detections
Add validated keypoints support to Detections, including slicing, equality, merge, and inner-object merge preservation. Remove the RF-DETR-specific KeyPoints adapter and make VertexEllipseAnnotator rely on caller-selected keypoints instead of an annotator confidence threshold. Co-authored-by: Codex <codex@openai.com>
1 parent 94bb51e commit cbfd2c1

8 files changed

Lines changed: 168 additions & 400 deletions

File tree

src/supervision/detection/core.py

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -135,6 +135,9 @@ class simplifies data manipulation and filtering, providing a uniform API for
135135
mask: An array of shape `(n, H, W)` containing the segmentation masks
136136
(`bool` data type), or `None` when masks are not available, or as
137137
:class:`~supervision.detection.compact_mask.CompactMask`.
138+
keypoints: An array of shape `(n, K, 2)` or `(n, K, 3)` containing
139+
keypoint coordinates for each detection, or `None` when keypoints
140+
are not available.
138141
confidence: An array of shape `(n,)` containing the confidence scores
139142
of the detections, or `None` when confidence values are not available.
140143
class_id: An array of shape `(n,)` containing the class ids of the
@@ -156,6 +159,7 @@ class simplifies data manipulation and filtering, providing a uniform API for
156159
tracker_id: npt.NDArray[np.generic] | None = None
157160
data: dict[str, npt.NDArray[np.generic] | list[Any]] = field(default_factory=dict)
158161
metadata: dict[str, Any] = field(default_factory=dict)
162+
keypoints: npt.NDArray[np.generic] | None = None
159163

160164
def __post_init__(self) -> None:
161165
validate_detections_fields(
@@ -165,6 +169,7 @@ def __post_init__(self) -> None:
165169
class_id=self.class_id,
166170
tracker_id=self.tracker_id,
167171
data=self.data,
172+
keypoints=self.keypoints,
168173
)
169174

170175
def __len__(self) -> int:
@@ -206,6 +211,7 @@ def __eq__(self, other: object) -> bool:
206211
[
207212
np.array_equal(self.xyxy, other.xyxy),
208213
np.array_equal(self.mask, other.mask),
214+
np.array_equal(self.keypoints, other.keypoints),
209215
np.array_equal(self.class_id, other.class_id),
210216
np.array_equal(self.confidence, other.confidence),
211217
np.array_equal(self.tracker_id, other.tracker_id),
@@ -2109,8 +2115,8 @@ def merge(cls, detections_list: list[Detections]) -> Detections:
21092115
Merge a list of Detections objects into a single Detections object.
21102116
21112117
This method takes a list of Detections objects and combines their
2112-
respective fields (`xyxy`, `mask`, `confidence`, `class_id`, and `tracker_id`)
2113-
into a single Detections object.
2118+
respective fields (`xyxy`, `mask`, `keypoints`, `confidence`, `class_id`, and
2119+
`tracker_id`) into a single Detections object.
21142120
21152121
For example, if merging Detections with 3 and 4 detected objects, this method
21162122
will return a Detections with 7 objects (7 entries in `xyxy`, `mask`, etc).
@@ -2171,6 +2177,7 @@ def merge(cls, detections_list: list[Detections]) -> Detections:
21712177
class_id=detections.class_id,
21722178
tracker_id=detections.tracker_id,
21732179
data=detections.data,
2180+
keypoints=detections.keypoints,
21742181
)
21752182

21762183
xyxy = np.vstack([d.xyxy for d in detections_list])
@@ -2188,9 +2195,12 @@ def stack_or_none(
21882195
return CompactMask.merge(masks)
21892196
# Mixed or all-ndarray: __array__ auto-converts any CompactMask.
21902197
return np.vstack([np.asarray(m) for m in masks])
2198+
if name == "keypoints":
2199+
return np.vstack([d.__getattribute__(name) for d in detections_list])
21912200
return np.hstack([d.__getattribute__(name) for d in detections_list])
21922201

21932202
mask = stack_or_none("mask")
2203+
keypoints = stack_or_none("keypoints")
21942204
confidence = stack_or_none("confidence")
21952205
class_id = stack_or_none("class_id")
21962206
tracker_id = stack_or_none("tracker_id")
@@ -2208,6 +2218,7 @@ def stack_or_none(
22082218
tracker_id=tracker_id,
22092219
data=data,
22102220
metadata=metadata,
2221+
keypoints=keypoints,
22112222
)
22122223

22132224
def get_anchors_coordinates(self, anchor: Position) -> npt.NDArray[np.generic]:
@@ -2322,6 +2333,7 @@ def __getitem__(
23222333
tracker_id=self.tracker_id[index] if self.tracker_id is not None else None,
23232334
data=get_data_item(self.data, index),
23242335
metadata=self.metadata,
2336+
keypoints=self.keypoints[index] if self.keypoints is not None else None,
23252337
)
23262338

23272339
def __setitem__(self, key: str, value: npt.NDArray[np.generic] | list[Any]) -> None:
@@ -2582,7 +2594,8 @@ def merge_inner_detection_object_pair(
25822594
The resulting `confidence` of the merged object is calculated by the weighted
25832595
contribution of each detection to the merged object.
25842596
The bounding boxes and masks of the two input detections are merged into a
2585-
single bounding box and mask, respectively.
2597+
single bounding box and mask, respectively. If keypoints are present, keypoints
2598+
from the winning detection are preserved.
25862599
25872600
Args:
25882601
detections_1: The first Detections object.
@@ -2657,6 +2670,7 @@ def merge_inner_detection_object_pair(
26572670
tracker_id=winning_detection.tracker_id,
26582671
data=winning_detection.data,
26592672
metadata=metadata,
2673+
keypoints=winning_detection.keypoints,
26602674
)
26612675

26622676

src/supervision/key_points/annotators.py

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -208,7 +208,6 @@ def __init__(
208208
thickness: int = 2,
209209
sigma: float = 2.0,
210210
covariance_data_key: str = "covariance",
211-
confidence_threshold: float = 0.0,
212211
max_axis_length: float | None = None,
213212
line_style: Literal["solid", "dashed"] = "solid",
214213
dash_length: int = 16,
@@ -220,8 +219,6 @@ def __init__(
220219
sigma: Number of standard deviations represented by the ellipse axes.
221220
covariance_data_key: Key in ``key_points.data`` containing covariance
222221
matrices with shape ``(N, K, 2, 2)``.
223-
confidence_threshold: Minimum keypoint confidence required for drawing.
224-
Ignored when ``key_points.confidence`` is ``None``.
225222
max_axis_length: Optional cap for ellipse semi-axis lengths in pixels.
226223
When ``None`` (default), near-singular precision matrices can produce
227224
extremely large eigenvalues and frame-spanning ellipses. Set this to
@@ -247,7 +244,6 @@ def __init__(
247244
self.thickness = thickness
248245
self.sigma = sigma
249246
self.covariance_data_key = covariance_data_key
250-
self.confidence_threshold = confidence_threshold
251247
self.max_axis_length = max_axis_length
252248
self.line_style = line_style
253249
self.dash_length = dash_length
@@ -300,8 +296,6 @@ def annotate(self, scene: ImageType, key_points: KeyPoints) -> ImageType:
300296
confidence = key_points.confidence[detection_index, point_index]
301297
if not np.isfinite(confidence):
302298
continue
303-
if confidence < self.confidence_threshold:
304-
continue
305299
ellipse = self._covariance_to_ellipse(
306300
covariance=covariances[detection_index, point_index]
307301
)

src/supervision/key_points/core.py

Lines changed: 0 additions & 203 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import logging
43
from collections.abc import Iterable, Iterator
54
from dataclasses import dataclass, field
65
from typing import Any, Union, cast
@@ -13,8 +12,6 @@
1312
from supervision.detection.utils.internal import get_data_item, is_data_equal
1413
from supervision.validators import validate_key_points_fields
1514

16-
logger = logging.getLogger(__name__)
17-
1815
Index1D = Union[
1916
int,
2017
slice,
@@ -26,94 +23,6 @@
2623
Index2D = tuple[Index1D, Index1D]
2724

2825

29-
def _rfdetr_source_shape(
30-
rfdetr_detections: Detections,
31-
detections_count: int,
32-
) -> npt.NDArray[np.float32]:
33-
source_shape = rfdetr_detections.data.get("source_shape")
34-
if source_shape is None:
35-
raise ValueError(
36-
"RF-DETR detections with keypoint precision data must contain "
37-
"data['source_shape'] with shape (N, 2) where each row is "
38-
"(height, width) in pixels."
39-
)
40-
41-
source_shape_array = np.asarray(source_shape, dtype=np.float32)
42-
expected_shape = (detections_count, 2)
43-
if source_shape_array.shape != expected_shape:
44-
raise ValueError(
45-
"Expected RF-DETR source_shape shape "
46-
f"{expected_shape}, got {source_shape_array.shape}."
47-
)
48-
return source_shape_array
49-
50-
51-
def _rfdetr_precision_cholesky_to_pixel_covariance(
52-
precision_cholesky: npt.NDArray[np.float32],
53-
source_shape: npt.NDArray[np.float32],
54-
) -> npt.NDArray[np.float32]:
55-
if precision_cholesky.ndim != 3 or precision_cholesky.shape[2] != 3:
56-
raise ValueError(
57-
"Expected RF-DETR keypoint precision shape (N, K, 3), "
58-
f"got {precision_cholesky.shape}."
59-
)
60-
if precision_cholesky.shape[0] != source_shape.shape[0]:
61-
raise ValueError(
62-
"RF-DETR keypoint precision and source_shape must contain the same "
63-
"number of detections, got "
64-
f"{precision_cholesky.shape[0]} and {source_shape.shape[0]}."
65-
)
66-
67-
n_total = precision_cholesky.shape[0] * precision_cholesky.shape[1]
68-
n_non_finite = 0
69-
n_singular = 0
70-
n_overflow = 0
71-
72-
covariances = np.full(
73-
(*precision_cholesky.shape[:2], 2, 2), np.nan, dtype=np.float32
74-
)
75-
for detection_index, detection_precision in enumerate(precision_cholesky):
76-
height, width = source_shape[detection_index]
77-
scale = np.diag([width, height]).astype(np.float64)
78-
for keypoint_index, params in enumerate(detection_precision):
79-
if not np.isfinite(params).all():
80-
n_non_finite += 1
81-
continue
82-
log_l11 = float(np.clip(params[0], -20.0, 20.0))
83-
l21 = float(np.clip(params[1], -1.0e4, 1.0e4))
84-
log_l22 = float(np.clip(params[2], -20.0, 20.0))
85-
l11 = float(np.exp(log_l11))
86-
l22 = float(np.exp(log_l22))
87-
precision = np.array(
88-
[[l11 * l11, l11 * l21], [l11 * l21, l21 * l21 + l22 * l22]],
89-
dtype=np.float64,
90-
)
91-
try:
92-
covariance = np.linalg.inv(precision)
93-
except np.linalg.LinAlgError:
94-
n_singular += 1
95-
continue
96-
97-
pixel_covariance = scale @ covariance @ scale
98-
if np.isfinite(pixel_covariance).all():
99-
covariances[detection_index, keypoint_index] = pixel_covariance
100-
else:
101-
n_overflow += 1
102-
103-
n_failed = n_non_finite + n_singular + n_overflow
104-
if n_failed > 0:
105-
logger.warning(
106-
"%d of %d precision matrices failed: "
107-
"non_finite=%d, singular=%d, overflow=%d",
108-
n_failed,
109-
n_total,
110-
n_non_finite,
111-
n_singular,
112-
n_overflow,
113-
)
114-
return covariances
115-
116-
11726
def _optional_array_equal(
11827
first: npt.NDArray[np.generic] | None,
11928
second: npt.NDArray[np.generic] | None,
@@ -250,13 +159,6 @@ class simplifies data manipulation and filtering, providing a uniform API for
250159
key_point = sv.KeyPoints.from_transformers(results[0])
251160
```
252161
253-
Note:
254-
[`sv.KeyPoints.from_rfdetr`][supervision.key_points.core.KeyPoints.from_rfdetr]
255-
accepts ``sv.Detections`` (not native RF-DETR output) because RF-DETR keypoints
256-
are attached as extra fields inside a ``sv.Detections`` object returned by
257-
``model.predict()``. Run that conversion first, then pass the result to
258-
``from_rfdetr``.
259-
260162
Attributes:
261163
xy: An array of shape `(n, m, 2)` containing
262164
`n` detected objects, each composed of `m` equally-sized
@@ -338,111 +240,6 @@ def __eq__(self, other: object) -> bool:
338240
]
339241
)
340242

341-
@classmethod
342-
def from_rfdetr(cls, rfdetr_detections: Detections) -> KeyPoints:
343-
"""
344-
Create a `sv.KeyPoints` object from RF-DETR `sv.Detections` output.
345-
346-
RF-DETR attaches keypoint coordinates to ``detections.data["keypoints"]``
347-
with shape ``(N, K, 3)`` where the last dimension stores ``[x, y,
348-
confidence]`` in pixel coordinates. When RF-DETR also provides
349-
``detections.data["keypoint_precision_cholesky"]``, this method converts
350-
those per-keypoint precision parameters into pixel-space covariance matrices
351-
and stores them in ``key_points.data["covariance"]`` for use with
352-
`sv.VertexEllipseAnnotator`.
353-
354-
Note:
355-
``detections.data["source_shape"]`` must have shape ``(N, 2)`` where each
356-
row is ``(height, width)`` in pixels — note this is HW order, not the WH
357-
order used by ``resolution_wh`` elsewhere in supervision.
358-
359-
Keypoint confidence values are stored as-is from RF-DETR output and are
360-
expected to be probabilities in the range ``[0, 1]``. If RF-DETR returns
361-
logits instead, user-supplied ``confidence_threshold`` values in
362-
`sv.VertexEllipseAnnotator` should be adjusted accordingly.
363-
364-
Args:
365-
rfdetr_detections: RF-DETR prediction returned by ``model.predict()``.
366-
367-
Returns:
368-
A `sv.KeyPoints` object containing RF-DETR keypoints and optional
369-
covariance matrices.
370-
371-
Raises:
372-
ValueError: If the RF-DETR detections do not contain valid keypoints,
373-
or if precision parameters are present without source shape data.
374-
375-
Examples:
376-
Basic usage — keypoints only:
377-
378-
>>> import numpy as np
379-
>>> import supervision as sv
380-
>>> kp_arr = np.array([[[50, 80, 0.9], [60, 90, 0.8]]], dtype=np.float32)
381-
>>> detections = sv.Detections(
382-
... xyxy=np.array([[10, 20, 100, 200]], dtype=np.float32),
383-
... data={"keypoints": kp_arr},
384-
... )
385-
>>> key_points = sv.KeyPoints.from_rfdetr(detections)
386-
>>> key_points.xy.shape
387-
(1, 2, 2)
388-
389-
With precision Cholesky parameters (produces covariance data):
390-
391-
>>> kp_arr2 = np.array([[[50, 80, 0.9], [60, 90, 0.8]]], dtype=np.float32)
392-
>>> chol = np.zeros((1, 2, 3), dtype=np.float32)
393-
>>> src = np.array([[480, 640]], dtype=np.float32)
394-
>>> detections_with_cov = sv.Detections(
395-
... xyxy=np.array([[10, 20, 100, 200]], dtype=np.float32),
396-
... data={
397-
... "keypoints": kp_arr2,
398-
... "keypoint_precision_cholesky": chol,
399-
... "source_shape": src,
400-
... },
401-
... )
402-
>>> kp = sv.KeyPoints.from_rfdetr(detections_with_cov)
403-
>>> "covariance" in kp.data
404-
True
405-
"""
406-
rfdetr_keypoints = rfdetr_detections.data.get("keypoints")
407-
if rfdetr_keypoints is None:
408-
raise ValueError("RF-DETR detections must contain data['keypoints'].")
409-
410-
keypoints = np.asarray(rfdetr_keypoints, dtype=np.float32)
411-
if keypoints.ndim != 3 or keypoints.shape[2] != 3:
412-
raise ValueError(
413-
f"Expected RF-DETR keypoints shape (N, K, 3), got {keypoints.shape}."
414-
)
415-
if keypoints.shape[0] == 0:
416-
return cls.empty()
417-
418-
data: dict[str, npt.NDArray[np.generic] | list[Any]] = {}
419-
precision_cholesky = rfdetr_detections.data.get("keypoint_precision_cholesky")
420-
if precision_cholesky is not None:
421-
precision_cholesky_array = np.asarray(precision_cholesky, dtype=np.float32)
422-
if precision_cholesky_array.shape[:2] != keypoints.shape[:2]:
423-
raise ValueError(
424-
"keypoint_precision_cholesky shape "
425-
f"{precision_cholesky_array.shape[:2]} does not match "
426-
f"keypoints shape {keypoints.shape[:2]}."
427-
)
428-
source_shape = _rfdetr_source_shape(
429-
rfdetr_detections, detections_count=keypoints.shape[0]
430-
)
431-
data["covariance"] = _rfdetr_precision_cholesky_to_pixel_covariance(
432-
precision_cholesky=precision_cholesky_array,
433-
source_shape=source_shape,
434-
)
435-
class_id: npt.NDArray[np.int_] | None = None
436-
if rfdetr_detections.class_id is not None:
437-
class_id = rfdetr_detections.class_id.astype(np.int_)
438-
439-
return cls(
440-
xy=keypoints[:, :, :2].astype(np.float32),
441-
confidence=keypoints[:, :, 2].astype(np.float32),
442-
class_id=class_id,
443-
data=data,
444-
)
445-
446243
@classmethod
447244
def from_inference(cls, inference_result: Any) -> KeyPoints:
448245
"""

0 commit comments

Comments
 (0)