NVFlare/nvflare/app_common/workflows/base_model_controller.py at 9e87f6c77854cfa2699be89912cf9594e8905568 · NVIDIA/NVFlare · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
from abc import ABC, abstractmethod
from typing import Callable, List, Optional, Union

from nvflare.apis.client import Client
from nvflare.apis.controller_spec import ClientTask, OperatorMethod, Task, TaskOperatorKey
from nvflare.apis.fl_constant import ReturnCode
from nvflare.apis.fl_context import FLContext
from nvflare.apis.impl.controller import Controller
from nvflare.apis.shareable import Shareable
from nvflare.apis.signal import Signal
from nvflare.app_common.abstract.fl_model import FLModel, ParamsType
from nvflare.app_common.abstract.learnable_persistor import LearnablePersistor
from nvflare.app_common.abstract.model import ModelLearnable, ModelLearnableKey, make_model_learnable
from nvflare.app_common.app_constant import AppConstants
from nvflare.app_common.app_event_type import AppEventType
from nvflare.app_common.utils.error_handling_utils import get_error_handling_message, should_ignore_result_error
from nvflare.app_common.utils.fl_component_wrapper import FLComponentWrapper
from nvflare.app_common.utils.fl_model_utils import FLModelUtils
from nvflare.fuel.utils.validation_utils import check_non_negative_int, check_positive_int, check_str
from nvflare.security.logging import secure_format_exception


class BaseModelController(Controller, FLComponentWrapper, ABC):
    def __init__(
        self,
        persistor_id: str = AppConstants.DEFAULT_PERSISTOR_ID,
        ignore_result_error: Optional[bool] = None,
        allow_empty_global_weights: bool = False,
        task_check_period: float = 0.5,
    ):
        """FLModel based controller.

        Args:
            persistor_id (str, optional): ID of the persistor component. Defaults to AppConstants.DEFAULT_PERSISTOR_ID ("persistor").
            ignore_result_error (bool or None, optional): How to handle client result errors.
                - None: Dynamic mode (default) - ignore errors if min_responses still reachable, panic otherwise.
                - False: Strict mode - panic on any client error.
                - True: Resilient mode - always ignore client errors.
            allow_empty_global_weights (bool, optional): whether to allow empty global weights. Some pipelines can have
                empty global weights at first round, such that clients start training from scratch without any global info.
                Defaults to False.
            task_check_period (float, optional): interval for checking status of tasks. Defaults to 0.5.
        """
        super().__init__(task_check_period=task_check_period)

        # Check arguments
        check_str("persistor_id", persistor_id)
        if not isinstance(task_check_period, (int, float)):
            raise TypeError(f"task_check_period must be an int or float but got {type(task_check_period)}")
        elif task_check_period <= 0:
            raise ValueError("task_check_period must be greater than 0.")
        self._task_check_period = task_check_period
        self._persistor_id = persistor_id
        self.persistor = None

        # config data
        self._ignore_result_error = ignore_result_error
        self._allow_empty_global_weights = allow_empty_global_weights

        # model related
        self._results = []

        # Task context for dynamic ignore_result_error mode (when ignore_result_error=None).
        # These are reset per send_model() call to track error tolerance for the current task.
        self._current_min_responses = 0  # Minimum successful responses needed for this task
        self._current_num_targets = 0  # Total number of clients targeted for this task
        self._current_failed_clients = set()  # Set of client names that returned errors in this task

    def start_controller(self, fl_ctx: FLContext) -> None:
        self.fl_ctx = fl_ctx
        self.info("Initializing BaseModelController workflow.")

        self.engine = self.fl_ctx.get_engine()

        if self._persistor_id:
            self.persistor = self.engine.get_component(self._persistor_id)
            if not isinstance(self.persistor, LearnablePersistor):
                self.warning(
                    f"Persistor {self._persistor_id} must be a LearnablePersistor type object, "
                    f"but got {type(self.persistor)}"
                )
                self.persistor = None

        FLComponentWrapper.initialize(self)

    def _build_shareable(self, data: FLModel = None) -> Shareable:
        data_shareable: Shareable = FLModelUtils.to_shareable(data)
        data_shareable.add_cookie(
            AppConstants.CONTRIBUTION_ROUND, data_shareable.get_header(AppConstants.CURRENT_ROUND)
        )

        return data_shareable

    def broadcast_model(
        self,
        data,
        task_name: str = AppConstants.TASK_TRAIN,
        targets: Union[List[Client], List[str], None] = None,
        min_responses: int = None,
        timeout: int = 0,
        wait_time_after_min_received: int = 0,
        blocking: bool = True,
        callback: Callable[[FLModel], None] = None,
    ) -> List:
        """Send a task with data to a list of targets.

        Args:
            data: FLModel to be sent to clients. It must be a FLModel object. It will raise an exception if None.
            task_name (str, optional): name of the task. Defaults to "train".
            targets (List[str], optional): the list of target client names or None (all clients). Defaults to None.
            min_responses (int, optional): the minimum number of responses expected. If None, must receive responses from
              all clients that the task has been sent to. Defaults to None.
            timeout (int, optional): time to wait for clients to perform task. Defaults to 0, i.e., never time out.
            wait_time_after_min_received (int, optional): time to wait after
                minimum number of clients responses has been received. Defaults to 0.
            blocking (bool, optional): whether to block to wait for task result. Defaults to True.
            callback (Callable[[FLModel], None], optional): callback when a result is received, only called when blocking=False. Defaults to None.

        Returns:
            List[FLModel] if blocking=True else None
        """

        if not isinstance(task_name, str):
            raise TypeError(f"task_name must be a string but got {type(task_name)}")
        if not isinstance(data, FLModel):
            raise TypeError(f"data must be a FLModel but got {type(data)}")
        if min_responses is None:
            min_responses = 0  # this is internally used by controller's broadcast to represent all targets
        check_non_negative_int("min_responses", min_responses)
        check_non_negative_int("timeout", timeout)
        check_non_negative_int("wait_time_after_min_received", wait_time_after_min_received)
        if not blocking and not isinstance(callback, Callable):
            raise TypeError("callback must be defined if blocking is False, but got {}".format(type(callback)))

        # Store task context for dynamic ignore_result_error mode
        num_targets = len(targets) if targets else len(self.engine.get_clients())
        self._current_min_responses = min_responses if min_responses > 0 else num_targets
        self._current_num_targets = num_targets
        self._current_failed_clients = set()

        self.set_fl_context(data)

        task = self._prepare_task(data=data, task_name=task_name, timeout=timeout, callback=callback)

        if targets:
            targets = [client.name if isinstance(client, Client) else client for client in targets]
            self.info(f"Sending task {task_name} to {targets}")
        else:
            self.info(f"Sending task {task_name} to all clients")

        if blocking:
            self._results = []  # reset results list
            self.broadcast_and_wait(
                task=task,
                targets=targets,
                min_responses=min_responses,
                wait_time_after_min_received=wait_time_after_min_received,
                fl_ctx=self.fl_ctx,
                abort_signal=self.abort_signal,
            )

            if targets is not None:
                expected_responses = min_responses if min_responses != 0 else len(targets)
                if len(self._results) != expected_responses:
                    self.warning(
                        f"Number of results ({len(self._results)}) is different from number of expected responses ({expected_responses})."
                    )

            # de-reference the internal results before returning
            results = self._results
            self._results = []
            return results
        else:
            self.broadcast(
                task=task,
                targets=targets,
                min_responses=min_responses,
                wait_time_after_min_received=wait_time_after_min_received,
                fl_ctx=self.fl_ctx,
            )

    def _prepare_task(
        self,
        data: FLModel,
        task_name: str,
        timeout: int,
        callback: Callable,
    ):
        # Create task
        data_shareable = self._build_shareable(data)

        operator = {
            TaskOperatorKey.OP_ID: task_name,
            TaskOperatorKey.METHOD: OperatorMethod.BROADCAST,
            TaskOperatorKey.TIMEOUT: timeout,
        }

        task = Task(
            name=task_name,
            data=data_shareable,
            operator=operator,
            props={AppConstants.TASK_PROP_CALLBACK: callback, AppConstants.META_DATA: data.meta},
            timeout=timeout,
            before_task_sent_cb=self._prepare_task_data,
            result_received_cb=self._process_result,
        )

        return task

    def _prepare_task_data(self, client_task: ClientTask, fl_ctx: FLContext) -> None:
        self.fire_event_with_data(
            AppEventType.BEFORE_TRAIN_TASK, fl_ctx, AppConstants.TRAIN_SHAREABLE, client_task.task.data
        )

    def _process_result(self, client_task: ClientTask, fl_ctx: FLContext) -> None:
        self.fl_ctx = fl_ctx
        result = client_task.result
        client_name = client_task.client.name

        # Make round available on callback fl_ctx before contribution-accept handlers run.
        current_round = client_task.task.data.get_header(AppConstants.CURRENT_ROUND, None)
        if current_round is None:
            current_round = result.get_header(AppConstants.CURRENT_ROUND, None)
        if current_round is not None:
            fl_ctx.set_prop(AppConstants.CURRENT_ROUND, current_round, private=True, sticky=True)

        # Turn result into FLModel
        result_model = FLModelUtils.from_shareable(result)
        result_model.meta["props"] = client_task.task.props[AppConstants.META_DATA]
        result_model.meta["client_name"] = client_name

        self.event(AppEventType.BEFORE_CONTRIBUTION_ACCEPT)
        accepted = self._accept_train_result(client_name=client_name, result=result, fl_ctx=fl_ctx)
        self.event(AppEventType.AFTER_CONTRIBUTION_ACCEPT)

        # If result was rejected (error ignored or panic), skip further processing
        if not accepted:
            client_task.result = None
            return

        # Now try to convert result to FLModel
        try:
            result_model = FLModelUtils.from_shareable(result)
            result_model.meta["props"] = client_task.task.props[AppConstants.META_DATA]
            result_model.meta["client_name"] = client_name
        except Exception as e:
            self.warning(f"Failed to convert result from {client_name} to FLModel: {e}")
            client_task.result = None
            return

        callback = client_task.task.get_prop(AppConstants.TASK_PROP_CALLBACK)
        if callback:
            try:
                callback(result_model)
            except Exception as e:
                self.error(f"Unsuccessful callback {callback} for task {client_task.task.name}: {e}")
        else:
            self._results.append(result_model)

        # Cleanup task result
        client_task.result = None
        # Note: Memory cleanup (gc.collect + malloc_trim) is handled by subclasses
        # via _maybe_cleanup_memory() based on memory_gc_rounds setting

    def process_result_of_unknown_task(
        self, client: Client, task_name: str, client_task_id: str, result: Shareable, fl_ctx: FLContext
    ) -> None:
        if task_name == AppConstants.TASK_TRAIN:
            accepted = self._accept_train_result(
                client_name=client.name, result=result, fl_ctx=fl_ctx, is_unknown_task=True
            )
            if accepted:
                self.info(f"Result of unknown task {task_name} sent to aggregator.")
        else:
            self.error("Ignoring result from unknown task.")

    def _accept_train_result(
        self, client_name: str, result: Shareable, fl_ctx: FLContext, is_unknown_task: bool = False
    ) -> bool:
        """Accept or reject a training result based on error handling policy.

        Args:
            client_name: Name of the client that sent the result.
            result: The Shareable result from the client.
            fl_ctx: The FLContext.
            is_unknown_task: Whether this result is from an unknown/late task.

        Returns:
            True if the result was accepted, False if it was rejected (error ignored or panic triggered).
        """
        self.fl_ctx = fl_ctx
        rc = result.get_return_code()

        current_round = result.get_header(AppConstants.CURRENT_ROUND, None)

        # For unknown/late tasks, always ignore errors (no valid tolerance context)
        # For normal tasks, use the configured ignore_result_error setting
        ignore_result_error = True if is_unknown_task else self._ignore_result_error

        # Use empty set for unknown tasks since we don't have valid tracking context
        failed_clients = set() if is_unknown_task else self._current_failed_clients
        num_targets = 0 if is_unknown_task else self._current_num_targets
        min_responses = 0 if is_unknown_task else self._current_min_responses

        # Raise panic if bad peer context or execution exception.
        if rc and rc != ReturnCode.OK:
            should_ignore = should_ignore_result_error(
                ignore_result_error=ignore_result_error,
                client_name=client_name,
                failed_clients=failed_clients,
                num_targets=num_targets,
                min_responses=min_responses,
            )
            msg = get_error_handling_message(
                ignore_result_error=ignore_result_error,
                client_name=client_name,
                error_code=rc,
                current_round=current_round,
                controller_name=self.__class__.__name__,
                failed_clients=failed_clients,
                num_targets=num_targets,
                min_responses=min_responses,
            )
            if should_ignore:
                self.warning(msg)
                return False  # Result rejected - error ignored
            else:
                self.panic(msg)
                return False  # Result rejected - panic triggered

        self.fl_ctx.set_prop(AppConstants.TRAINING_RESULT, result, private=True, sticky=False)
        return True  # Result accepted

    @abstractmethod
    def run(self):
        """Main `run` routine called by the Controller's `control_flow` to execute the workflow.

        Returns: None.

        """
        raise NotImplementedError

    def control_flow(self, abort_signal: Signal, fl_ctx: FLContext) -> None:
        self.fl_ctx = fl_ctx
        self.abort_signal = abort_signal
        try:
            self.info("Beginning model controller run.")
            self.event(AppEventType.TRAINING_STARTED)

            self.run()
        except Exception as e:
            error_msg = f"Exception in model controller run: {secure_format_exception(e)}"
            self.exception(error_msg)
            self.panic(error_msg)

    def load_model(self):
        # initialize global model
        model = None
        if self.persistor:
            self.info("loading initial model from persistor")
            global_weights = self.persistor.load(self.fl_ctx)

            if not isinstance(global_weights, ModelLearnable):
                self.panic(
                    f"Expected global weights to be of type `ModelLearnable` but received {type(global_weights)}"
                )
                return

            if global_weights.is_empty():
                if not self._allow_empty_global_weights:
                    # if empty not allowed, further check whether it is available from fl_ctx
                    global_weights = self.fl_ctx.get_prop(AppConstants.GLOBAL_MODEL)

            if not global_weights.is_empty():
                model = FLModel(
                    params_type=ParamsType.FULL,
                    params=global_weights[ModelLearnableKey.WEIGHTS],
                    meta=global_weights[ModelLearnableKey.META],
                )
            elif self._allow_empty_global_weights:
                model = FLModel(params_type=ParamsType.FULL, params={})
            else:
                self.panic(
                    f"Neither `persistor` {self._persistor_id} or `fl_ctx` returned a global model! If this was intended, set `self._allow_empty_global_weights` to `True`."
                )
                return
        else:
            self.info("persistor not configured, creating empty initial FLModel")
            model = FLModel(params_type=ParamsType.FULL, params={})

        # persistor uses Learnable format to save model
        ml = make_model_learnable(weights=model.params, meta_props=model.meta)
        self.fl_ctx.set_prop(AppConstants.GLOBAL_MODEL, ml, private=True, sticky=True)
        self.event(AppEventType.INITIAL_MODEL_LOADED)

        return model

    def get_run_dir(self):
        """Get current run directory."""
        return self.engine.get_workspace().get_run_dir(self.fl_ctx.get_job_id())

    def get_app_dir(self):
        """Get current app directory."""
        return self.engine.get_workspace().get_app_dir(self.fl_ctx.get_job_id())

    def save_model(self, model):
        if self.persistor:
            self.info("Start persist model on server.")
            self.event(AppEventType.BEFORE_LEARNABLE_PERSIST)
            # persistor uses Learnable format to save model
            ml = make_model_learnable(weights=model.params, meta_props=model.meta)
            self.persistor.save(ml, self.fl_ctx)
            self.event(AppEventType.AFTER_LEARNABLE_PERSIST)
            self.info("End persist model on server.")
        else:
            self.error("persistor not configured, model will not be saved")

    def sample_clients(self, num_clients: int = None) -> List[str]:
        clients = [client.name for client in self.engine.get_clients()]

        if num_clients:
            check_positive_int("num_clients", num_clients)
            if num_clients < len(clients):
                random.shuffle(clients)
                clients = clients[0:num_clients]
                self.info(
                    f"num_clients ({num_clients}) is less than the number of available clients. Returning a random subset of ({num_clients}) clients."
                )
            elif num_clients > len(clients):
                self.error(
                    f"num_clients ({num_clients}) is greater than the number of available clients. Returning all ({len(clients)}) available clients."
                )

        self.info(f"Sampled clients: {clients}")

        return clients

    def set_fl_context(self, data: FLModel):
        """Set fl_ctx CURRENT_ROUND and NUM_ROUNDS from FLModel so they stay current each round.

        Uses private=True, sticky=True so the sticker is populated from round 0 and child/peer
        contexts see it. Required for flows like FedAvg that do not set CURRENT_ROUND in fl_ctx
        before send. set_prop accepts the update when the prop already exists with the same
        (private, sticky) attributes.
        """
        if not data:
            return
        if data.current_round is not None:
            self.fl_ctx.set_prop(
                AppConstants.CURRENT_ROUND,
                data.current_round,
                private=True,
                sticky=True,
            )
        else:
            self.debug("The FLModel data does not contain the current_round information.")
        if data.total_rounds is not None:
            self.fl_ctx.set_prop(
                AppConstants.NUM_ROUNDS,
                data.total_rounds,
                private=True,
                sticky=True,
            )
        else:
            self.debug("The FLModel data does not contain the total_rounds information.")

    def get_component(self, component_id: str):
        return self.engine.get_component(component_id)

    def build_component(self, config_dict: dict):
        return self.engine.build_component(config_dict)

    def stop_controller(self, fl_ctx: FLContext):
        self.fl_ctx = fl_ctx
        self.finalize()