Skip to content

create class #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/super_gradients/common/object_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class Transforms:
DetectionPaddedRescale = "DetectionPaddedRescale"
DetectionTargetsFormatTransform = "DetectionTargetsFormatTransform"
DetectionNormalize = "DetectionNormalize"
DetectionRandomSideCrop = "DetectionRandomSideCrop"
#
RandomResizedCropAndInterpolation = "RandomResizedCropAndInterpolation"
RandAugmentTransform = "RandAugmentTransform"
Expand Down Expand Up @@ -333,6 +334,8 @@ class Dataloaders:
COCO2017_VAL_YOLOX = "coco2017_val_yolox"
COCO2017_TRAIN_YOLO_NAS = "coco2017_train_yolo_nas"
COCO2017_VAL_YOLO_NAS = "coco2017_val_yolo_nas"
COCO_DETECTION_YOLO_FORMAT_TRAIN_CUSTOM = "coco_detection_yolo_format_train_custom"
COCO_DETECTION_YOLO_FORMAT_VAL_CUSTOM = "coco_detection_yolo_format_val_custom"
COCO2017_TRAIN_PPYOLOE = "coco2017_train_ppyoloe"
COCO2017_VAL_PPYOLOE = "coco2017_val_ppyoloe"
COCO2017_TRAIN_SSD_LITE_MOBILENET_V2 = "coco2017_train_ssd_lite_mobilenet_v2"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@

train_dataset_params:
data_dir: /data/coco # TO FILL: Where the data is stored.
images_dir: images/train2017 # TO FILL: Local path to directory that includes all the images. Path relative to `data_dir`. Can be the same as `labels_dir`.
labels_dir: labels/train2017 # TO FILL: Local path to directory that includes all the labels. Path relative to `data_dir`. Can be the same as `images_dir`.
classes: [ person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop sign,
parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag,
tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard,
tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot,
hot dog, pizza, donut, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote,
keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear,
hair drier, toothbrush] # TO FILL: List of classes used in your dataset.
input_dim: [640, 640]
cache_dir:
cache: False
transforms:
- DetectionRandomSideCrop:
min_rel_width : 0.3
max_rel_width : 0.6
p_side_right: 0.5
prob: 0.25
# - DetectionMosaic:
# input_dim: ${dataset_params.train_dataset_params.input_dim}
# prob: 1.
- DetectionRandomAffine:
degrees: 10. # rotation degrees, randomly sampled from [-degrees, degrees]
translate: 0.1 # image translation fraction
scales: [ 0.1, 2 ] # random rescale range (keeps size by padding/cropping) after mosaic transform.
shear: 2.0 # shear degrees, randomly sampled from [-degrees, degrees]
target_size: ${dataset_params.train_dataset_params.input_dim}
filter_box_candidates: True # whether to filter out transformed bboxes by edge size, area ratio, and aspect ratio.
wh_thr: 2 # edge size threshold when filter_box_candidates = True (pixels)
      area_thr: 0.1 # threshold for area ratio between original image and the transformed one, when filter_box_candidates = True
ar_thr: 20 # aspect ratio threshold when filter_box_candidates = True
- DetectionMixup:
input_dim: ${dataset_params.train_dataset_params.input_dim}
mixup_scale: [ 0.5, 1.5 ] # random rescale range for the additional sample in mixup
prob: 1.0 # probability to apply per-sample mixup
flip_prob: 0.5 # probability to apply horizontal flip
- DetectionHSV:
prob: 1.0 # probability to apply HSV transform
hgain: 5 # HSV transform hue gain (randomly sampled from [-hgain, hgain])
sgain: 30 # HSV transform saturation gain (randomly sampled from [-sgain, sgain])
vgain: 30 # HSV transform value gain (randomly sampled from [-vgain, vgain])
- DetectionHorizontalFlip:
prob: 0.5 # probability to apply horizontal flip
- DetectionPaddedRescale:
input_dim: ${dataset_params.train_dataset_params.input_dim}
max_targets: 120
- DetectionTargetsFormatTransform:
input_dim: ${dataset_params.train_dataset_params.input_dim}
output_format: LABEL_CXCYWH
class_inclusion_list:
max_num_samples:

train_dataloader_params:
batch_size: 25
num_workers: 8
shuffle: True
drop_last: True
pin_memory: True
collate_fn:
_target_: super_gradients.training.utils.detection_utils.DetectionCollateFN

val_dataset_params:
data_dir: /data/coco # TO FILL: Where the data is stored.
images_dir: images/val2017 # TO FILL: Local path to directory that includes all the images. Path relative to `data_dir`. Can be the same as `labels_dir`.
labels_dir: labels/val2017 # TO FILL: Local path to directory that includes all the labels. Path relative to `data_dir`. Can be the same as `images_dir`.
classes: [ person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop sign,
parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag,
tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard,
tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot,
hot dog, pizza, donut, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote,
keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear,
hair drier, toothbrush] # TO FILL: List of classes used in your dataset.
input_dim: [640, 640]
cache_dir:
cache: False
transforms:
- DetectionPaddedRescale:
input_dim: ${dataset_params.val_dataset_params.input_dim}
- DetectionTargetsFormatTransform:
max_targets: 50
input_dim: ${dataset_params.val_dataset_params.input_dim}
output_format: LABEL_CXCYWH
class_inclusion_list:
max_num_samples:

val_dataloader_params:
batch_size: 25
num_workers: 8
drop_last: False
pin_memory: True
collate_fn:
_target_: super_gradients.training.utils.detection_utils.DetectionCollateFN

_convert_: all
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
train_dataset_params:
data_dir: /data/coco # root path to coco data
subdir: images/train2017 # sub directory path of data_dir containing the train data.
json_file: instances_train2017.json # path to coco train json file, data_dir/annotations/train_json_file.
input_dim: [640, 640]
cache_dir:
cache: False
transforms:
- DetectionRandomSideCrop:
min_rel_width : 0.3
max_rel_width : 0.6
p_side_right: 0.5
prob: 0.25
- DetectionRandomAffine:
degrees: 0 # rotation degrees, randomly sampled from [-degrees, degrees]
translate: 0.25 # image translation fraction
scales: [ 0.5, 1.5 ] # random rescale range (keeps size by padding/cropping) after mosaic transform.
shear: 0.0 # shear degrees, randomly sampled from [-degrees, degrees]
target_size:
filter_box_candidates: True # whether to filter out transformed bboxes by edge size, area ratio, and aspect ratio.
wh_thr: 2 # edge size threshold when filter_box_candidates = True (pixels)
      area_thr: 0.1 # threshold for area ratio between original image and the transformed one, when filter_box_candidates = True
ar_thr: 20 # aspect ratio threshold when filter_box_candidates = True
- DetectionRGB2BGR:
prob: 0.5
- DetectionHSV:
prob: 0.5 # probability to apply HSV transform
hgain: 18 # HSV transform hue gain (randomly sampled from [-hgain, hgain])
sgain: 30 # HSV transform saturation gain (randomly sampled from [-sgain, sgain])
vgain: 30 # HSV transform value gain (randomly sampled from [-vgain, vgain])
- DetectionHorizontalFlip:
prob: 0.5 # probability to apply horizontal flip
- DetectionMixup:
input_dim:
mixup_scale: [ 0.5, 1.5 ] # random rescale range for the additional sample in mixup
prob: 0.5 # probability to apply per-sample mixup
flip_prob: 0.5 # probability to apply horizontal flip
- DetectionPaddedRescale:
input_dim: [640, 640]
max_targets: 120
pad_value: 114
- DetectionStandardize:
max_value: 255.
- DetectionTargetsFormatTransform:
max_targets: 256
output_format: LABEL_CXCYWH

tight_box_rotation: False
class_inclusion_list:
max_num_samples:
with_crowd: False

train_dataloader_params:
batch_size: 25
num_workers: 8
shuffle: True
drop_last: True
pin_memory: True
collate_fn:
_target_: super_gradients.training.utils.detection_utils.DetectionCollateFN

val_dataset_params:
data_dir: /data/coco # root path to coco data
  subdir: images/val2017 # sub directory path of data_dir containing the validation data.
  json_file: instances_val2017.json # path to coco validation json file, data_dir/annotations/json_file.
input_dim: [636, 636]
cache_dir:
cache: False
transforms:
- DetectionRGB2BGR:
prob: 1
- DetectionPadToSize:
output_size: [640, 640]
pad_value: 114
- DetectionStandardize:
max_value: 255.
- DetectionImagePermute
- DetectionTargetsFormatTransform:
max_targets: 50
input_dim: [640, 640]
output_format: LABEL_CXCYWH
tight_box_rotation: False
class_inclusion_list:
max_num_samples:
with_crowd: True

val_dataloader_params:
batch_size: 25
num_workers: 8
drop_last: False
shuffle: False
pin_memory: True
collate_fn:
_target_: super_gradients.training.utils.detection_utils.CrowdDetectionCollateFN

_convert_: all
21 changes: 21 additions & 0 deletions src/super_gradients/training/dataloaders/dataloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,27 @@ def coco2017_val_yolo_nas(dataset_params: Dict = None, dataloader_params: Dict =
dataloader_params=dataloader_params,
)

@register_dataloader(Dataloaders.COCO_DETECTION_YOLO_FORMAT_TRAIN_CUSTOM)
def coco_detection_yolo_format_train_custom(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
    """Build the training dataloader for a custom YOLO-(Darknet)-format detection dataset.

    :param dataset_params:    Overrides for the dataset recipe params; None keeps the recipe defaults.
    :param dataloader_params: Overrides for the dataloader recipe params; None keeps the recipe defaults.
    :return: A configured torch DataLoader over the training split.
    """
    return get_data_loader(
        dataset_cls=YoloDarknetFormatDetectionDataset,
        config_name="coco_detection_yolo_format_base_dataset_params_custom",
        train=True,
        dataloader_params=dataloader_params,
        dataset_params=dataset_params,
    )


@register_dataloader(Dataloaders.COCO_DETECTION_YOLO_FORMAT_VAL_CUSTOM)
def coco_detection_yolo_format_val_custom(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
    """Build the validation dataloader for a custom YOLO-(Darknet)-format detection dataset.

    :param dataset_params:    Overrides for the dataset recipe params; None keeps the recipe defaults.
    :param dataloader_params: Overrides for the dataloader recipe params; None keeps the recipe defaults.
    :return: A configured torch DataLoader over the validation split.
    """
    return get_data_loader(
        dataset_cls=YoloDarknetFormatDetectionDataset,
        config_name="coco_detection_yolo_format_base_dataset_params_custom",
        train=False,
        dataloader_params=dataloader_params,
        dataset_params=dataset_params,
    )


@register_dataloader(Dataloaders.COCO2017_TRAIN_PPYOLOE)
def coco2017_train_ppyoloe(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
Expand Down
102 changes: 102 additions & 0 deletions src/super_gradients/training/transforms/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1079,6 +1079,108 @@ def get_equivalent_preprocessing(self) -> List:
return []


@register_transform(Transforms.DetectionRandomSideCrop)
class DetectionRandomSideCrop(DetectionTransform):
    """Preprocessing transform that crops an image (and its bboxes) along the width, keeping one side.

    The output image keeps either the left or the right side of the input, with a width between
    `min_rel_width` and `max_rel_width` of the original image width.

    Note: It assumes the targets are in (X, Y, X, Y, label) format.
    """

    def __init__(self, min_rel_width: float = 0.3, max_rel_width: float = 0.6, p_side_right: float = 0.5, prob: float = 1.0):
        """
        :param min_rel_width: Minimum relative width of the resulting crop, defaults to 0.3
        :param max_rel_width: Maximum relative width of the resulting crop, defaults to 0.6
        :param p_side_right: Probability of keeping the right side when cropping, defaults to 0.5
        :param prob: Probability of applying the transformation, defaults to 1.0
        :raises AssertionError: If input parameters are not in the correct range
        """
        assert 0 < min_rel_width <= 1, f"`min_rel_width` value must be between 0 (not included) and 1, found {min_rel_width}"
        assert 0 <= max_rel_width <= 1, f"`max_rel_width` value must be between 0 and 1, found {max_rel_width}"
        assert min_rel_width <= max_rel_width, f"`min_rel_width` ({min_rel_width}) must not exceed `max_rel_width` ({max_rel_width})"
        assert 0 <= prob <= 1, f"Probability value must be between 0 and 1, found {prob}"
        assert 0 <= p_side_right <= 1, f"Probability of side value must be between 0 and 1, found {p_side_right}"
        super(DetectionRandomSideCrop, self).__init__()
        self.max_rel_width = max_rel_width
        self.min_rel_width = min_rel_width
        self.p_side_right = p_side_right
        self.p = prob

    def __call__(self, sample: dict) -> dict:
        # Apply the crop with probability `self.p`.
        if random.random() > self.p:
            return sample

        # Keep the right side with probability `p_side_right` (was inverted: `>` kept it with 1 - p_side_right).
        side = "right" if random.random() < self.p_side_right else "left"
        # Relative width of the crop that is KEPT, as documented on the class.
        rel_width = random.uniform(self.min_rel_width, self.max_rel_width)

        image, targets = sample["image"], sample["target"]

        img_width = image.shape[1]
        # Absolute kept width, clamped to [1, img_width - 1] so both the crop and the discarded
        # side are non-empty. (The previous code kept `1 - rel_width` of the image, contradicting
        # the documented `min_rel_width`/`max_rel_width` semantics.)
        kept_width = min(max(int(rel_width * img_width), 1), img_width - 1)
        # x coordinate of the cut: keep columns [abs_x:] for "right", [:abs_x] for "left".
        abs_x = img_width - kept_width if side == "right" else kept_width

        sample["image"] = self._crop_image(image, abs_x, side)

        boxes, kept_indices = self._crop_bboxes(targets[:, :4], abs_x, side)
        targets = targets[kept_indices]
        targets[:, :4] = boxes
        sample["target"] = targets

        # Crop crowd targets with the same cut. (The previous code cropped the stale non-crowd
        # `bboxes` and wrote the non-crowd `targets` back into "crowd_target".)
        if "crowd_target" in sample:
            crowd_targets = sample["crowd_target"]
            crowd_boxes, crowd_kept_indices = self._crop_bboxes(crowd_targets[:, :4], abs_x, side)
            crowd_targets = crowd_targets[crowd_kept_indices]
            crowd_targets[:, :4] = crowd_boxes
            sample["crowd_target"] = crowd_targets

        return sample

    def _crop_image(self, img: np.ndarray, abs_x: int, side: str) -> np.ndarray:
        """Return the cropped image.

        :param img: Numpy array of image
        :param abs_x: Absolute value of the x coordinate to crop
        :param side: Side of the resulting crop. Either "right" or "left"
        :return: Numpy array of cropped image
        """
        return img[:, abs_x:] if side == "right" else img[:, :abs_x]

    def _crop_bboxes(self, bboxes: np.ndarray, abs_x: int, side: str) -> tuple:
        """Return the bboxes that are inside the crop. In the case of intersection, the bbox is cropped.

        :param bboxes: Numpy array of bounding boxes in (X,Y,X,Y) format. Shape (N,4)
        :param abs_x: Absolute value of the x coordinate to crop
        :param side: Side of the resulting crop. Either "right" or "left"
        :return: Tuple of (cropped bounding boxes in (X,Y,X,Y) format, shape (N',4);
                 indices of the kept rows in the input array)
        """
        if side == "right":
            # Keep boxes whose right edge lies inside the kept (right) region.
            keep_mask = bboxes[:, 2] > abs_x
            fixed_bboxes = bboxes[keep_mask].copy()
            # Shift into the new coordinate frame, clipping the left edge at the cut.
            fixed_bboxes[:, 0] = np.maximum(fixed_bboxes[:, 0], abs_x) - abs_x
            fixed_bboxes[:, 2] = fixed_bboxes[:, 2] - abs_x
        else:
            # Keep boxes whose left edge lies inside the kept (left) region.
            keep_mask = bboxes[:, 0] < abs_x
            fixed_bboxes = bboxes[keep_mask].copy()
            # Clip the right edge at the cut; coordinates are already in the new frame.
            fixed_bboxes[:, 2] = np.minimum(fixed_bboxes[:, 2], abs_x)

        kept_indices = np.nonzero(keep_mask)[0]
        return fixed_bboxes.reshape((-1, 4)), kept_indices



def get_aug_params(value: Union[tuple, float], center: float = 0) -> float:
"""
Generates a random value for augmentations as described below
Expand Down