Drop intermediate conv outputs and VOC FeatureExtractor.

kdexd · kdexd · commit 11c579376d9c · 2021-04-04T06:39:20.000-04:00
diff --git a/scripts/clf_voc07.py b/scripts/clf_voc07.py
@@ -15,7 +15,6 @@
 
 from virtex.config import Config
 from virtex.factories import PretrainingModelFactory, DownstreamDatasetFactory
-from virtex.models.downstream import FeatureExtractor
 from virtex.utils.checkpointing import CheckpointManager
 from virtex.utils.common import common_parser, common_setup
 
@@ -36,10 +35,6 @@
 
 # fmt: off
 parser.add_argument_group("Checkpointing")
-group.add_argument(
-    "--layer", choices=["layer1", "layer2", "layer3", "layer4", "avgpool"],
-    default="avgpool", help="Evaluate features extracted from this layer."
-)
 parser.add_argument(
     "--weight-init", choices=["random", "imagenet", "torchvision", "virtex"],
     default="virtex", help="""How to initialize weights:
@@ -161,9 +156,12 @@ def main(_A: argparse.Namespace):
             torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
             strict=False,
         )
+        # Set ``ITERATION`` to a dummy value.
+        ITERATION = 0
 
-    model = FeatureExtractor(model, layer_name=_A.layer, flatten_and_normalize=True)
-    model = model.to(device).eval()
+    # Keep the trunk up to ``layer4``: torchvision's forward flattens after it.
+    trunk = list(model.visual.cnn.children())[:-2]
+    model = torch.nn.Sequential(*trunk).to(device).eval()
 
     # -------------------------------------------------------------------------
     #   EXTRACT FEATURES FOR TRAINING SVMs
@@ -180,13 +178,33 @@ def main(_A: argparse.Namespace):
         for batch in tqdm(train_dataloader, desc="Extracting train features:"):
             features = model(batch["image"].to(device))
 
+            # Global average pool features. Assume the tensor is in NCHW format.
+            if features.dim() > 2:
+                # shape: (batch_size, visual_feature_size)
+                features = features.flatten(start_dim=2).mean(dim=-1)
+
+            # shape: (batch_size, visual_feature_size)
+            features = features.reshape(features.size(0), -1)
+
+            # L2-normalize the global average pooled features. ``normalize``
+            # clamps the norm by a small epsilon, so an all-zero feature row
+            # stays zero instead of turning into NaN from a 0 / 0 division.
+            features = torch.nn.functional.normalize(features, p=2, dim=-1)
+
             features_train.append(features.cpu())
             targets_train.append(batch["label"])
 
         # Similarly extract test features.
         for batch in tqdm(test_dataloader, desc="Extracting test features:"):
             features = model(batch["image"].to(device))
 
+            # Same pooling and NaN-safe L2-normalization as for train features.
+            if features.dim() > 2:
+                features = features.flatten(start_dim=2).mean(dim=-1)
+
+            features = features.reshape(features.size(0), -1)
+            features = torch.nn.functional.normalize(features, p=2, dim=-1)
+
             features_test.append(features.cpu())
             targets_test.append(batch["label"])
 
@@ -226,13 +244,11 @@ def main(_A: argparse.Namespace):
 
     # Test set mAP for each class, for features from every layer.
     test_map = torch.tensor(pool_output).mean()
-    logger.info(f"mAP: {test_map}")
-
-    # Tensorboard logging only when _A.weight_init == "virtex"
-    if _A.weight_init == "virtex":
-        tensorboard_writer.add_scalars(
-            "metrics/voc07_clf", {f"{_A.layer}_mAP": test_map}, ITERATION
-        )
+    logger.info(f"Iteration: {ITERATION}, mAP: {test_map}")
+    # Fixed tag: the ``--layer`` option is gone, and ``avgpool`` was its default.
+    tensorboard_writer.add_scalars(
+        "metrics/voc07_clf", {"avgpool_mAP": test_map}, ITERATION
+    )
 
 
 if __name__ == "__main__":
diff --git a/virtex/models/downstream.py b/virtex/models/downstream.py
diff --git a/virtex/modules/visual_backbones.py b/virtex/modules/visual_backbones.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Union
+from typing import Any, Dict
 
 import torch
 from torch import nn
@@ -17,50 +17,6 @@ def __init__(self, visual_feature_size: int):
         self.visual_feature_size = visual_feature_size
 
 
-class BlindVisualBackbone(VisualBackbone):
-    r"""
-    A visual backbone which cannot see the image. It always outputs a tensor
-    filled with constant value.
-
-    Parameters
-    ----------
-    visual_feature_size: int, optional (default = 2048)
-        Size of the last dimension (channels) of output from forward pass.
-    bias_value: float, optional (default = 1.0)
-        Constant value to fill in the output tensor.
-    """
-
-    def __init__(self, visual_feature_size: int = 2048, bias_value: float = 1.0):
-        super().__init__(visual_feature_size)
-
-        # We never update the bias because a blind model cannot learn anything
-        # about the image. Add an axis for proper broadcasting.
-        self._bias = nn.Parameter(
-            torch.full((1, self.visual_feature_size), fill_value=bias_value),
-            requires_grad=False,
-        )
-
-    def forward(self, image: torch.Tensor) -> torch.Tensor:
-        r"""
-        Compute visual features for a batch of input images. Since this model
-        is *blind*, output will always be constant.
-
-        Parameters
-        ----------
-        image: torch.Tensor
-            Batch of input images. A tensor of shape
-            ``(batch_size, 3, height, width)``.
-
-        Returns
-        -------
-        torch.Tensor
-            Output visual features, filled with :attr:`bias_value`. A tensor of
-            shape ``(batch_size, visual_feature_size)``.
-        """
-        batch_size = image.size(0)
-        return self._bias.repeat(batch_size, 1)
-
-
 class TorchvisionVisualBackbone(VisualBackbone):
     r"""
     A visual backbone from `Torchvision model zoo
@@ -91,7 +47,8 @@ def __init__(
         self.cnn = getattr(torchvision.models, name)(
             pretrained, zero_init_residual=True
         )
-        # Do nothing after the final residual stage.
+        # Remove global average pooling and fc layer.
+        self.cnn.avgpool = nn.Identity()
         self.cnn.fc = nn.Identity()
 
         # Freeze all weights if specified.
@@ -100,12 +57,7 @@ def __init__(
                 param.requires_grad = False
             self.cnn.eval()
 
-        # Keep a list of intermediate layer names.
-        self._stage_names = [f"layer{i}" for i in range(1, 5)]
-
-    def forward(
-        self, image: torch.Tensor, return_intermediate_outputs: bool = False
-    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+    def forward(self, image: torch.Tensor) -> torch.Tensor:
         r"""
         Compute visual features for a batch of input images.
 
@@ -114,41 +66,27 @@ def forward(
         image: torch.Tensor
             Batch of input images. A tensor of shape
             ``(batch_size, 3, height, width)``.
-        return_intermediate_outputs: bool, optional (default = False)
-            Whether to return feaures extracted from all intermediate stages or
-            just the last one. This can only be set ``True`` when using a
-            ResNet-like model.
 
         Returns
         -------
-        Union[torch.Tensor, Dict[str, torch.Tensor]]
-            - If ``return_intermediate_outputs = False``, this will be a tensor
-              of shape ``(batch_size, channels, height, width)``, for example
-              it will be ``(batch_size, 2048, 7, 7)`` for ResNet-50 (``layer4``).
-
-            - If ``return_intermediate_outputs = True``, this will be a dict
-              with keys ``{"layer1", "layer2", "layer3", "layer4", "avgpool"}``
-              containing features from all intermediate layers and global
-              average pooling layer.
+        torch.Tensor
+            A tensor of shape ``(batch_size, channels, height, width)``, for
+            example it will be ``(batch_size, 2048, 7, 7)`` for ResNet-50.
         """
 
-        # Iterate through the modules in sequence and collect feature
-        # vectors for last layers in each stage.
-        intermediate_outputs: Dict[str, torch.Tensor] = {}
+        # Run the child modules in sequence and stop right after ``layer4``.
+        # Calling ``self.cnn(image)`` would not do: torchvision's forward
+        # flattens the feature map before ``fc``, losing the spatial axes.
         for idx, (name, layer) in enumerate(self.cnn.named_children()):
             out = layer(image) if idx == 0 else layer(out)
-            if name in self._stage_names:
-                intermediate_outputs[name] = out
-
-        # Add pooled spatial features.
-        intermediate_outputs["avgpool"] = torch.mean(
-            intermediate_outputs["layer4"], dim=[2, 3]
-        )
-        if return_intermediate_outputs:
-            return intermediate_outputs
-        else:
-            # shape: (batch_size, channels, height, width)
-            return intermediate_outputs["layer4"]
+            if name == "layer4":
+                # shape: (batch_size, channels, height, width)
+                # [ResNet-50: (b, 2048, 7, 7)]
+                return out
+
+        raise ValueError(
+            "`self.cnn` has no `layer4` child; expected a ResNet-like model."
+        )
 
     def detectron2_backbone_state_dict(self) -> Dict[str, Any]:
         r"""