
Commit bbb9d27 ("wip")
1 parent: 71ee464

File tree: 4 files changed (+118, -103 lines)


examples/nnunet_example/client.py

Lines changed: 69 additions & 67 deletions
@@ -22,11 +22,12 @@
 from torchmetrics.segmentation import GeneralizedDiceScore
 
 from fl4health.clients.nnunet_client import NnunetClient
+from fl4health.mixins.personalized import make_it_personal
 from fl4health.utils.load_data import load_msd_dataset
 from fl4health.utils.metrics import TorchMetric, TransformsMetric
 from fl4health.utils.msd_dataset_sources import get_msd_dataset_enum, msd_num_labels
 from fl4health.utils.nnunet_utils import get_segs_from_probs, set_nnunet_env
-from fl4health.mixins.personalized import make_it_personal
+
 
 personalized_client_classes = {"ditto": make_it_personal(NnunetClient, "ditto")}
 
@@ -43,74 +44,75 @@ def main(
     client_name: str | None = None,
     personalized_strategy: Literal["ditto"] | None = None,
 ) -> None:
-    # Log device and server address
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    log(INFO, f"Using device: {device}")
-    log(INFO, f"Using server address: {server_address}")
-
-    # Load the dataset if necessary
-    msd_dataset_enum = get_msd_dataset_enum(msd_dataset_name)
-    nnUNet_raw = join(dataset_path, "nnunet_raw")
-    if not exists(join(nnUNet_raw, msd_dataset_enum.value)):
-        log(INFO, f"Downloading and extracting {msd_dataset_enum.value} dataset")
-        load_msd_dataset(nnUNet_raw, msd_dataset_name)
-
-    # The dataset ID will be the same as the MSD Task number
-    dataset_id = int(msd_dataset_enum.value[4:6])
-    nnunet_dataset_name = f"Dataset{dataset_id:03d}_{msd_dataset_enum.value.split('_')[1]}"
-
-    # Convert the msd dataset if necessary
-    if not exists(join(nnUNet_raw, nnunet_dataset_name)):
-        log(INFO, f"Converting {msd_dataset_enum.value} into nnunet dataset")
-        convert_msd_dataset(source_folder=join(nnUNet_raw, msd_dataset_enum.value))
-
-    # Create a metric
-    dice = TransformsMetric(
-        metric=TorchMetric(
-            name="Pseudo DICE",
-            metric=GeneralizedDiceScore(
-                num_classes=msd_num_labels[msd_dataset_enum], weight_type="square", include_background=False
-            ).to(device),
-        ),
-        pred_transforms=[torch.sigmoid, get_segs_from_probs],
-    )
+    with torch.autograd.set_detect_anomaly(True):
+        # Log device and server address
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        log(INFO, f"Using device: {device}")
+        log(INFO, f"Using server address: {server_address}")
+
+        # Load the dataset if necessary
+        msd_dataset_enum = get_msd_dataset_enum(msd_dataset_name)
+        nnUNet_raw = join(dataset_path, "nnunet_raw")
+        if not exists(join(nnUNet_raw, msd_dataset_enum.value)):
+            log(INFO, f"Downloading and extracting {msd_dataset_enum.value} dataset")
+            load_msd_dataset(nnUNet_raw, msd_dataset_name)
+
+        # The dataset ID will be the same as the MSD Task number
+        dataset_id = int(msd_dataset_enum.value[4:6])
+        nnunet_dataset_name = f"Dataset{dataset_id:03d}_{msd_dataset_enum.value.split('_')[1]}"
+
+        # Convert the msd dataset if necessary
+        if not exists(join(nnUNet_raw, nnunet_dataset_name)):
+            log(INFO, f"Converting {msd_dataset_enum.value} into nnunet dataset")
+            convert_msd_dataset(source_folder=join(nnUNet_raw, msd_dataset_enum.value))
+
+        # Create a metric
+        dice = TransformsMetric(
+            metric=TorchMetric(
+                name="Pseudo DICE",
+                metric=GeneralizedDiceScore(
+                    num_classes=msd_num_labels[msd_dataset_enum], weight_type="square", include_background=False
+                ).to(device),
+            ),
+            pred_transforms=[torch.sigmoid, get_segs_from_probs],
+        )
 
-    if intermediate_client_state_dir is not None:
-        checkpoint_and_state_module = ClientCheckpointAndStateModule(
-            state_checkpointer=PerRoundStateCheckpointer(Path(intermediate_client_state_dir))
+        if intermediate_client_state_dir is not None:
+            checkpoint_and_state_module = ClientCheckpointAndStateModule(
+                state_checkpointer=PerRoundStateCheckpointer(Path(intermediate_client_state_dir))
+            )
+        else:
+            checkpoint_and_state_module = None
+
+        # Create client
+        client_kwargs = {}
+        client_kwargs.update(
+            # Args specific to nnUNetClient
+            dataset_id=dataset_id,
+            fold=fold,
+            always_preprocess=always_preprocess,
+            verbose=verbose,
+            compile=compile,
+            # BaseClient Args
+            device=device,
+            metrics=[dice],
+            progress_bar=verbose,
+            checkpoint_and_state_module=checkpoint_and_state_module,
+            client_name=client_name,
         )
-    else:
-        checkpoint_and_state_module = None
-
-    # Create client
-    client_kwargs = {}
-    client_kwargs.update(
-        # Args specific to nnUNetClient
-        dataset_id=dataset_id,
-        fold=fold,
-        always_preprocess=always_preprocess,
-        verbose=verbose,
-        compile=compile,
-        # BaseClient Args
-        device=device,
-        metrics=[dice],
-        progress_bar=verbose,
-        checkpoint_and_state_module=checkpoint_and_state_module,
-        client_name=client_name,
-    )
-    if personalized_strategy:
-        log(INFO, f"Setting up client for personalized strategy: {personalized_strategy}")
-        client = personalized_client_classes[personalized_strategy](**client_kwargs)
-    else:
-        log(INFO, f"Setting up client without personalization")
-        client = NnunetClient(**client_kwargs)
-    log(INFO, f"Using client: {type(client).__name__}")
-    log(INFO, f"Parameter exchanger: {type(client.parameter_exchanger).__name__}")
-
-    start_client(server_address=server_address, client=client.to_client())
-
-    # Shutdown the client
-    client.shutdown()
+        if personalized_strategy:
+            log(INFO, f"Setting up client for personalized strategy: {personalized_strategy}")
+            client = personalized_client_classes[personalized_strategy](**client_kwargs)
+        else:
+            log(INFO, f"Setting up client without personalization")
+            client = NnunetClient(**client_kwargs)
+        log(INFO, f"Using client: {type(client).__name__}")
+        log(INFO, f"Parameter exchanger: {type(client.parameter_exchanger).__name__}")
+
+        start_client(server_address=server_address, client=client.to_client())
+
+        # Shutdown the client
+        client.shutdown()
 
 
 if __name__ == "__main__":
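Two things worth noting about the new client flow: `torch.autograd.set_detect_anomaly(True)` is the standard PyTorch context manager that makes the backward pass fail at the exact op producing a NaN/Inf (useful for a wip debugging commit), and client construction dispatches on `personalized_strategy` through the `personalized_client_classes` dict built from `make_it_personal(NnunetClient, "ditto")`. A minimal, self-contained sketch of that dispatch pattern, with a hypothetical `DummyClient` and a stand-in factory rather than the fl4health implementation:

```python
from typing import Literal

import torch


class DummyClient:
    """Stand-in for NnunetClient; only stores its kwargs."""

    def __init__(self, **kwargs: object) -> None:
        self.kwargs = kwargs


def make_it_personal_sketch(base_cls: type, strategy: str) -> type:
    # Assumed behaviour: return a subclass of the base client tagged with the strategy name.
    return type(f"{strategy.capitalize()}{base_cls.__name__}", (base_cls,), {"strategy": strategy})


personalized_client_classes = {"ditto": make_it_personal_sketch(DummyClient, "ditto")}


def build_client(personalized_strategy: Literal["ditto"] | None, **client_kwargs: object) -> DummyClient:
    if personalized_strategy:
        return personalized_client_classes[personalized_strategy](**client_kwargs)
    return DummyClient(**client_kwargs)


if __name__ == "__main__":
    # Anomaly detection wraps the whole run, as in the diff above, so autograd reports
    # the operation that produced a NaN/Inf instead of failing later in backward().
    with torch.autograd.set_detect_anomaly(True):
        print(type(build_client("ditto", device="cpu")).__name__)  # DittoDummyClient
        print(type(build_client(None, device="cpu")).__name__)     # DummyClient
```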

examples/nnunet_example/server.py

Lines changed: 5 additions & 5 deletions
@@ -11,19 +11,19 @@
 from flwr.common.typing import Config
 from flwr.server.client_manager import SimpleClientManager
 from flwr.server.strategy import FedAvg
-from fl4health.strategies.fedavg_with_adaptive_constraint import FedAvgWithAdaptiveConstraint
 
 from fl4health.checkpointing.checkpointer import PerRoundStateCheckpointer
 from fl4health.checkpointing.server_module import NnUnetServerCheckpointAndStateModule
-from fl4health.parameter_exchange.packing_exchanger import FullParameterExchangerWithPacking
 from fl4health.parameter_exchange.full_exchanger import FullParameterExchanger
+from fl4health.parameter_exchange.packing_exchanger import FullParameterExchangerWithPacking
+from fl4health.parameter_exchange.parameter_packer import (
+    ParameterPackerAdaptiveConstraint,
+)
 from fl4health.servers.nnunet_server import NnunetServer
+from fl4health.strategies.fedavg_with_adaptive_constraint import FedAvgWithAdaptiveConstraint
 from fl4health.utils.config import make_dict_with_epochs_or_steps
 from fl4health.utils.metric_aggregation import evaluate_metrics_aggregation_fn, fit_metrics_aggregation_fn
 from fl4health.utils.parameter_extraction import get_all_model_parameters
-from fl4health.parameter_exchange.parameter_packer import (
-    ParameterPackerAdaptiveConstraint,
-)
 
 
 def get_config(
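The reordered imports above pair `FedAvgWithAdaptiveConstraint` with `FullParameterExchangerWithPacking` and `ParameterPackerAdaptiveConstraint`, presumably because an adaptive-constraint strategy needs to ship an extra scalar (the constraint weight) alongside the model weights, so the exchanged parameter list has to be packed and unpacked. Below is a conceptual NumPy sketch of that packing idea only; it is not the fl4health API and the function names are made up for illustration.

```python
import numpy as np


def pack(model_arrays: list[np.ndarray], constraint_weight: float) -> list[np.ndarray]:
    # Append the extra scalar as one more ndarray so everything travels as a single parameter list.
    return model_arrays + [np.array([constraint_weight])]


def unpack(packed: list[np.ndarray]) -> tuple[list[np.ndarray], float]:
    # Split the scalar back off the end and return the original weight list.
    return packed[:-1], float(packed[-1][0])


weights = [np.ones((2, 2)), np.zeros(3)]
packed = pack(weights, constraint_weight=0.1)
restored, lam = unpack(packed)
assert lam == 0.1 and len(restored) == 2
```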

fl4health/clients/nnunet_client.py

Lines changed: 28 additions & 23 deletions
@@ -605,7 +605,10 @@ def setup_client(self, config: Config) -> None:
         # We have to call parent method after setting up nnunet trainer
         super().setup_client(config)
 
-    def _special_predict(self, model, input) -> tuple[TorchPredType, dict[str, torch.Tensor]]:
+    def _special_predict(
+        self, model: torch.nn.Module, input: torch.Tensor
+    ) -> tuple[TorchPredType, dict[str, torch.Tensor]]:
+        model.train()
         if isinstance(input, torch.Tensor):
             # If device type is cuda, nnUNet defaults to mixed precision forward pass
             if self.device.type == "cuda":
@@ -643,29 +646,8 @@ def predict(self, input: TorchInputType) -> tuple[TorchPredType, dict[str, torch
                 name. The second element is unused by this subclass and therefore is always an empty dict
         """
         return self._special_predict(self.model, input)
-        # if isinstance(input, torch.Tensor):
-        #     # If device type is cuda, nnUNet defaults to mixed precision forward pass
-        #     if self.device.type == "cuda":
-        #         with torch.autocast(self.device.type, enabled=True):
-        #             output = self.model(input)
-        #     else:
-        #         output = self.model(input)
-        # else:
-        #     raise TypeError('"input" must be of type torch.Tensor for nnUNetClient')
-
-        # if isinstance(output, torch.Tensor):
-        #     return {"prediction": output}, {}
-        # # If output is a list or tuple then deep supervision is on and we need to convert preds into a dict
-        # elif isinstance(output, (list, tuple)):
-        #     num_spatial_dims = NNUNET_N_SPATIAL_DIMS[self.nnunet_config]
-        #     preds = convert_deep_supervision_list_to_dict(output, num_spatial_dims)
-        #     return preds, {}
-        # else:
-        #     raise TypeError(
-        #         "Was expecting nnunet model output to be either a torch.Tensor or a list/tuple of torch.Tensors"
-        #     )
 
-    def compute_loss_and_additional_losses(
+    def _special_compute_loss_and_additional_losses(
         self,
         preds: TorchPredType,
         features: dict[str, torch.Tensor],
@@ -688,6 +670,7 @@ def compute_loss_and_additional_losses(
         # If deep supervision is turned on we must convert loss and target dicts into lists
         loss_preds = prepare_loss_arg(preds)
         loss_targets = prepare_loss_arg(target)
+        log(DEBUG, f"Prepared loss_preds: {type(loss_preds)}, loss_targets: {type(loss_targets)}")
 
         # Ensure we have the same number of predictions and targets
         assert isinstance(
@@ -709,6 +692,28 @@
 
         return loss
 
+    def compute_loss_and_additional_losses(
+        self,
+        preds: TorchPredType,
+        features: dict[str, torch.Tensor],
+        target: TorchTargetType,
+    ) -> tuple[torch.Tensor, dict[str, torch.Tensor] | None]:
+        """
+        Checks the pred and target types and computes the loss. If device type is cuda, loss computed in mixed
+        precision.
+
+        Args:
+            preds (TorchPredType): Dictionary of model output tensors indexed by name
+            features (dict[str, torch.Tensor]): Not used by this subclass
+            target (TorchTargetType): The targets to evaluate the predictions with. If multiple prediction tensors
+                are given, target must be a dictionary with the same number of tensors
+
+        Returns:
+            tuple[torch.Tensor, dict[str, torch.Tensor] | None]: A tuple where the first element is the loss and the
+                second element is an optional additional loss
+        """
+        return self._special_compute_loss_and_additional_losses(preds, features, target)
+
     def mask_data(self, pred: torch.Tensor, target: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Masks the pred and target tensors according to nnunet ``ignore_label``. The number of classes in the input
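For orientation, the commented-out code removed from `predict` above describes what `_special_predict` is now responsible for: nnU-Net models return either a single tensor or, when deep supervision is enabled, a list/tuple of tensors that must be turned into a named prediction dict. A rough standalone illustration of that conversion follows; the helper and key names are hypothetical stand-ins, not fl4health's `convert_deep_supervision_list_to_dict`.

```python
import torch


def to_pred_dict(output: torch.Tensor | list[torch.Tensor] | tuple[torch.Tensor, ...]) -> dict[str, torch.Tensor]:
    # Single tensor: ordinary prediction. List/tuple: one head per deep-supervision resolution.
    if isinstance(output, torch.Tensor):
        return {"prediction": output}
    if isinstance(output, (list, tuple)):
        return {f"prediction-ds{i}": head for i, head in enumerate(output)}
    raise TypeError("Expected a torch.Tensor or a list/tuple of torch.Tensors")


# Fake two-head deep-supervision output at decreasing resolutions
heads = [torch.rand(1, 3, 16, 16), torch.rand(1, 3, 8, 8)]
print(list(to_pred_dict(heads).keys()))  # ['prediction-ds0', 'prediction-ds1']
```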

fl4health/mixins/personalized/ditto.py

Lines changed: 16 additions & 8 deletions
@@ -1,7 +1,7 @@
 """Ditto Personalized Mixin"""
 
 from abc import ABC, abstractmethod
-from logging import INFO
+from logging import INFO, DEBUG
 from typing import cast
 
 import torch
@@ -297,8 +297,9 @@ def predict(
 
         if hasattr(self, "_special_predict"):
             log(INFO, "Using '_special_predict' to make predictions")
-            global_preds = self._special_predict(self.global_model, input)
-            local_preds = self._special_predict(self.model, input)
+            global_preds, _ = self._special_predict(self.global_model, input)
+            local_preds, _ = self._special_predict(self.model, input)
+            log(INFO, f"Successfully predicted for global and local models")
         else:
             if isinstance(input, torch.Tensor):
                 global_preds = self.global_model(input)
@@ -338,12 +339,19 @@ def compute_loss_and_additional_losses(
         """
 
         # Compute global model vanilla loss
-        assert "global" in preds
-        global_loss = self.criterion(preds["global"], target)
 
-        # Compute local model loss + ditto constraint term
-        assert "local" in preds
-        local_loss = self.criterion(preds["local"], target)
+        if hasattr(self, "_special_compute_loss_and_additional_losses"):
+            log(INFO, "Using '_special_compute_loss_and_additional_losses' to compute loss")
+            global_loss, _ = self._special_compute_loss_and_additional_losses(preds["global"], features, target)
+
+            # Compute local model loss + ditto constraint term
+            local_loss, _ = self._special_compute_loss_and_additional_losses(preds["local"], features, target)
+
+        else:
+            global_loss = self.criterion(preds["global"], target)
+
+            # Compute local model loss + ditto constraint term
+            local_loss = self.criterion(preds["local"], target)
 
         additional_losses = {"local_loss": local_loss.clone(), "global_loss": global_loss}
 
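The mixin change above is a duck-typed hook: when the concrete client defines `_special_predict` / `_special_compute_loss_and_additional_losses` (as `NnunetClient` now does), the Ditto mixin delegates to them and unpacks the `(value, extras)` tuple they return; otherwise it falls back to calling `self.criterion` directly. A small generic sketch of that `hasattr` delegation, with hypothetical classes rather than the fl4health mixin:

```python
import torch
from torch import nn


class LossMixinSketch:
    """Falls back to self.criterion unless the subclass provides a special hook."""

    criterion = nn.MSELoss()

    def compute_loss(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        if hasattr(self, "_special_compute_loss_and_additional_losses"):
            # Hook returns (loss, optional additional losses); keep only the loss here.
            loss, _ = self._special_compute_loss_and_additional_losses(pred, target)
            return loss
        return self.criterion(pred, target)


class PlainClient(LossMixinSketch):
    pass


class SpecialClient(LossMixinSketch):
    def _special_compute_loss_and_additional_losses(
        self, pred: torch.Tensor, target: torch.Tensor
    ) -> tuple[torch.Tensor, dict[str, torch.Tensor] | None]:
        # e.g. a scaled loss and no additional losses
        return 2.0 * self.criterion(pred, target), None


pred, target = torch.ones(4), torch.zeros(4)
print(PlainClient().compute_loss(pred, target).item())    # 1.0 (fallback path)
print(SpecialClient().compute_loss(pred, target).item())  # 2.0 (hook path)
```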