Skip to content

Feat/simple augmentation #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions src/super_gradients/common/object_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class Transforms:
DetectionPaddedRescale = "DetectionPaddedRescale"
DetectionTargetsFormatTransform = "DetectionTargetsFormatTransform"
DetectionNormalize = "DetectionNormalize"
DetectionRandomSideCrop = "DetectionRandomSideCrop"
#
RandomResizedCropAndInterpolation = "RandomResizedCropAndInterpolation"
RandAugmentTransform = "RandAugmentTransform"
Expand Down Expand Up @@ -333,6 +334,8 @@ class Dataloaders:
COCO2017_VAL_YOLOX = "coco2017_val_yolox"
COCO2017_TRAIN_YOLO_NAS = "coco2017_train_yolo_nas"
COCO2017_VAL_YOLO_NAS = "coco2017_val_yolo_nas"
COCO_DETECTION_YOLO_FORMAT_TRAIN_CUSTOM = "coco_detection_yolo_format_train_custom"
COCO_DETECTION_YOLO_FORMAT_VAL_CUSTOM = "coco_detection_yolo_format_val_custom"
COCO2017_TRAIN_PPYOLOE = "coco2017_train_ppyoloe"
COCO2017_VAL_PPYOLOE = "coco2017_val_ppyoloe"
COCO2017_TRAIN_SSD_LITE_MOBILENET_V2 = "coco2017_train_ssd_lite_mobilenet_v2"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
train_dataset_params:
data_dir: /data/coco # TO FILL: Where the data is stored.
images_dir: images/train2017 # TO FILL: Local path to directory that includes all the images. Path relative to `data_dir`. Can be the same as `labels_dir`.
labels_dir: labels/train2017 # TO FILL: Local path to directory that includes all the labels. Path relative to `data_dir`. Can be the same as `images_dir`.
classes: [ person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop sign,
parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag,
tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard,
tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot,
hot dog, pizza, donut, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote,
keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear,
hair drier, toothbrush] # TO FILL: List of classes used in your dataset.
input_dim: [1024, 1024]
cache_dir:
cache: False
transforms:
- DetectionRandomSideCrop:
min_rel_width : 1e-16
max_rel_width : 0.5
p_side_right: 0.5
prob: 1.0
# - DetectionMosaic:
# input_dim: ${dataset_params.train_dataset_params.input_dim}
# prob: 1.
# - DetectionRandomAffine:
# degrees: 3. # rotation degrees, randomly sampled from [-degrees, degrees]
# translate: 0.05 # image translation fraction
# shear: 0. # shear degrees, randomly sampled from [-degrees, degrees]
# target_size: ${dataset_params.train_dataset_params.input_dim}
# filter_box_candidates: True # whether to filter out transformed bboxes by edge size, area ratio, and aspect ratio.
# wh_thr: 2 # edge size threshold when filter_box_candidates = True (pixels)
# area_thr: 0.1 # threshold for area ratio between original image and the transformed one, when filter_box_candidates = True
# ar_thr: 20 # aspect ratio threshold when filter_box_candidates = True
# - DetectionMixup:
# input_dim: ${dataset_params.train_dataset_params.input_dim}
# mixup_scale: [ 0.5, 1.5 ] # random rescale range for the additional sample in mixup
# prob: 1.0 # probability to apply per-sample mixup
# flip_prob: 0.5 # probability to apply horizontal flip
- DetectionHSV:
prob: 1.0 # probability to apply HSV transform
hgain: 18 # HSV transform hue gain (randomly sampled from [-hgain, hgain])
sgain: 30 # HSV transform saturation gain (randomly sampled from [-sgain, sgain])
vgain: 30 # HSV transform value gain (randomly sampled from [-vgain, vgain])
- DetectionHorizontalFlip:
prob: 0.3 # probability to apply horizontal flip
- DetectionRescale:
output_shape: ${dataset_params.train_dataset_params.input_dim}
swap: [2,0,1]
- DetectionStandardize:
max_value: 255.
- DetectionTargetsFormatTransform:
input_dim: ${dataset_params.train_dataset_params.input_dim}
output_format: LABEL_CXCYWH

class_inclusion_list:
max_num_samples:

train_dataloader_params:
batch_size: 25
num_workers: 8
shuffle: True
drop_last: True
pin_memory: True
collate_fn:
_target_: super_gradients.training.utils.detection_utils.DetectionCollateFN

val_dataset_params:
data_dir: /data/coco # TO FILL: Where the data is stored.
images_dir: images/val2017 # TO FILL: Local path to directory that includes all the images. Path relative to `data_dir`. Can be the same as `labels_dir`.
labels_dir: labels/val2017 # TO FILL: Local path to directory that includes all the labels. Path relative to `data_dir`. Can be the same as `images_dir`.
classes: [ person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop sign,
parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag,
tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard,
tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot,
hot dog, pizza, donut, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote,
keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear,
hair drier, toothbrush] # TO FILL: List of classes used in your dataset.
input_dim: [1024, 1024]
cache_dir:
cache: False
transforms:
- DetectionRescale:
output_shape: ${dataset_params.val_dataset_params.input_dim}
swap: [2,0,1]
- DetectionStandardize:
max_value: 255.
- DetectionTargetsFormatTransform:
input_dim: ${dataset_params.val_dataset_params.input_dim}
output_format: LABEL_CXCYWH
class_inclusion_list:
max_num_samples:

val_dataloader_params:
batch_size: 25
num_workers: 8
drop_last: False
pin_memory: True
collate_fn:
_target_: super_gradients.training.utils.detection_utils.DetectionCollateFN

_convert_: all
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
train_dataset_params:
data_dir: /data/coco # root path to coco data
subdir: images/train2017 # sub directory path of data_dir containing the train data.
json_file: instances_train2017.json # path to coco train json file, data_dir/annotations/train_json_file.
input_dim: [640, 640]
cache_dir:
cache: False
transforms:
# - DetectionRandomSideCrop:
# min_rel_width : 0.3
# max_rel_width : 0.6
# p_side_right: 0.5
# prob: 0.25
- DetectionRandomAffine:
degrees: 0 # rotation degrees, randomly sampled from [-degrees, degrees]
translate: 0.25 # image translation fraction
scales: [ 0.5, 1.5 ] # random rescale range (keeps size by padding/cropping) after mosaic transform.
shear: 0.0 # shear degrees, randomly sampled from [-degrees, degrees]
target_size:
filter_box_candidates: True # whether to filter out transformed bboxes by edge size, area ratio, and aspect ratio.
wh_thr: 2 # edge size threshold when filter_box_candidates = True (pixels)
area_thr: 0.1 # threshold for area ratio between original image and the transformed one, when filter_box_candidates = True
ar_thr: 20 # aspect ratio threshold when filter_box_candidates = True
- DetectionRGB2BGR:
prob: 0.5
- DetectionHSV:
prob: 0.5 # probability to apply HSV transform
hgain: 18 # HSV transform hue gain (randomly sampled from [-hgain, hgain])
sgain: 30 # HSV transform saturation gain (randomly sampled from [-sgain, sgain])
vgain: 30 # HSV transform value gain (randomly sampled from [-vgain, vgain])
- DetectionHorizontalFlip:
prob: 0.5 # probability to apply horizontal flip
- DetectionMixup:
input_dim:
mixup_scale: [ 0.5, 1.5 ] # random rescale range for the additional sample in mixup
prob: 0.5 # probability to apply per-sample mixup
flip_prob: 0.5 # probability to apply horizontal flip
- DetectionPaddedRescale:
input_dim: [640, 640]
max_targets: 120
pad_value: 114
- DetectionStandardize:
max_value: 255.
- DetectionTargetsFormatTransform:
max_targets: 256
output_format: LABEL_CXCYWH

tight_box_rotation: False
class_inclusion_list:
max_num_samples:
with_crowd: False

train_dataloader_params:
batch_size: 25
num_workers: 8
shuffle: True
drop_last: True
pin_memory: True
collate_fn:
_target_: super_gradients.training.utils.detection_utils.DetectionCollateFN

val_dataset_params:
data_dir: /data/coco # root path to coco data
subdir: images/val2017 # sub directory path of data_dir containing the validation data.
json_file: instances_val2017.json # path to coco validation json file, data_dir/annotations/val_json_file.
input_dim: [636, 636]
cache_dir:
cache: False
transforms:
- DetectionRGB2BGR:
prob: 1
- DetectionPadToSize:
output_size: [640, 640]
pad_value: 114
- DetectionStandardize:
max_value: 255.
- DetectionImagePermute
- DetectionTargetsFormatTransform:
max_targets: 50
input_dim: [640, 640]
output_format: LABEL_CXCYWH
tight_box_rotation: False
class_inclusion_list:
max_num_samples:
with_crowd: True

val_dataloader_params:
batch_size: 25
num_workers: 8
drop_last: False
shuffle: False
pin_memory: True
collate_fn:
_target_: super_gradients.training.utils.detection_utils.CrowdDetectionCollateFN

_convert_: all
21 changes: 21 additions & 0 deletions src/super_gradients/training/dataloaders/dataloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,27 @@ def coco2017_val_yolo_nas(dataset_params: Dict = None, dataloader_params: Dict =
dataloader_params=dataloader_params,
)

@register_dataloader(Dataloaders.COCO_DETECTION_YOLO_FORMAT_TRAIN_CUSTOM)
def coco_detection_yolo_format_train_custom(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
    """Build the training dataloader for the custom YOLO-format COCO recipe.

    :param dataset_params:    Optional overrides for the dataset section of the recipe config.
    :param dataloader_params: Optional overrides for the dataloader section of the recipe config.
    :return:                  A configured :class:`DataLoader` over the training split.
    """
    loader_kwargs = dict(
        config_name="coco_detection_yolo_format_base_dataset_params_custom",
        dataset_cls=YoloDarknetFormatDetectionDataset,
        train=True,
        dataset_params=dataset_params,
        dataloader_params=dataloader_params,
    )
    return get_data_loader(**loader_kwargs)


@register_dataloader(Dataloaders.COCO_DETECTION_YOLO_FORMAT_VAL_CUSTOM)
def coco_detection_yolo_format_val_custom(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
    """Build the validation dataloader for the custom YOLO-format COCO recipe.

    :param dataset_params:    Optional overrides for the dataset section of the recipe config.
    :param dataloader_params: Optional overrides for the dataloader section of the recipe config.
    :return:                  A configured :class:`DataLoader` over the validation split.
    """
    loader_kwargs = dict(
        config_name="coco_detection_yolo_format_base_dataset_params_custom",
        dataset_cls=YoloDarknetFormatDetectionDataset,
        train=False,
        dataset_params=dataset_params,
        dataloader_params=dataloader_params,
    )
    return get_data_loader(**loader_kwargs)


@register_dataloader(Dataloaders.COCO2017_TRAIN_PPYOLOE)
def coco2017_train_ppyoloe(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ def set_dataset_processing_params(
iou: Optional[float] = None,
conf: Optional[float] = None,
) -> None:

"""Set the processing parameters for the dataset.

:param class_names: (Optional) Names of the dataset the model was trained on.
Expand Down Expand Up @@ -173,6 +174,7 @@ def predict(self, images: ImageSource, iou: Optional[float] = None, conf: Option
:param fuse_model: If True, create a copy of the model, and fuse some of its layers to increase performance. This increases memory usage.
"""
pipeline = self._get_pipeline(iou=iou, conf=conf, fuse_model=fuse_model)

return pipeline(images) # type: ignore

def predict_webcam(self, iou: Optional[float] = None, conf: Optional[float] = None, fuse_model: bool = True):
Expand Down
1 change: 1 addition & 0 deletions src/super_gradients/training/pipelines/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def _generate_prediction_result_single_batch(self, images: Iterable[np.ndarray])

# Preprocess
preprocessed_images, processing_metadatas = [], []

for image in images:
preprocessed_image, processing_metadata = self.image_processor.preprocess_image(image=image.copy())
preprocessed_images.append(preprocessed_image)
Expand Down
19 changes: 19 additions & 0 deletions src/super_gradients/training/processing/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,23 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada

return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w)

class _RescaleWithSwap(Processing, ABC):
    """Resize image to given image dimensions WITHOUT preserving aspect ratio.

    :param output_shape: (H, W) target shape.
    :param swap:         Axis permutation (e.g. [2, 0, 1]) — presumably consumed by the
                         rescale helper or subclasses to reorder image axes.
                         NOTE(review): confirm against `_rescale_image_with_swap`,
                         which is not passed `swap` here.
    """

    def __init__(self, output_shape: Tuple[int, int], swap):
        self.output_shape = output_shape
        # Bug fix: `swap` was accepted but never stored, so the requested axis order
        # was silently dropped. Keep it on the instance so it can actually be used.
        self.swap = swap

    def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetadata]:
        """Rescale `image` to `self.output_shape`, returning the image and rescale metadata."""
        # Per-axis scale factors (target / original); recorded so predictions can be
        # mapped back to the original image scale in postprocessing.
        scale_factor_h, scale_factor_w = self.output_shape[0] / image.shape[0], self.output_shape[1] / image.shape[1]
        rescaled_image = _rescale_image_with_swap(image, target_shape=self.output_shape)

        return rescaled_image, RescaleMetadata(original_shape=image.shape[:2], scale_factor_h=scale_factor_h, scale_factor_w=scale_factor_w)



class _LongestMaxSizeRescale(Processing, ABC):
"""Resize image to given image dimensions WITH preserving aspect ratio.
Expand All @@ -249,9 +266,11 @@ def preprocess_image(self, image: np.ndarray) -> Tuple[np.ndarray, RescaleMetada
class DetectionRescale(_Rescale):
    def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction:
        """Map predicted boxes back to the original image scale recorded in `metadata`."""
        # Invert the preprocessing rescale by dividing out the factors applied on the way in.
        inverse_factors = (1 / metadata.scale_factor_h, 1 / metadata.scale_factor_w)
        predictions.bboxes_xyxy = _rescale_bboxes(targets=predictions.bboxes_xyxy, scale_factors=inverse_factors)
        return predictions



@register_processing(Processings.DetectionLongestMaxSizeRescale)
class DetectionLongestMaxSizeRescale(_LongestMaxSizeRescale):
def postprocess_predictions(self, predictions: DetectionPrediction, metadata: RescaleMetadata) -> DetectionPrediction:
Expand Down
Loading