Commit 9f15e77
[2.7] Pass-Through: Zero Tensor Copy at CJ for Large-Model Federated Training (#4210)
This PR introduces the **pass-through architecture** for
`ClientAPILauncherExecutor`, eliminating tensor materialisation at the
CJ (Client Job) process when large models are exchanged between the FL
server and a subprocess agent.
In large-model federated learning (e.g., 7B–70B LLM fine-tuning), the CJ
process today acts as a blind relay that fully deserializes and
re-serializes every tensor it receives from the FL server before
forwarding to the subprocess. For a 70B float16 model, this consumes
~140 GB of CJ memory and requires two complete network transfers. The
B1 pass-through architecture introduced here removes both costs.
---
NVFlare's multi-hop execution path for `launch_external_process=True`
looks like:
```
FL Server ──serialize──▶ CJ process ──re-serialize──▶ Subprocess agent
```
Each tensor in the global model is handled as follows at CJ:
1. **Server** serializes the model and creates a download transaction
(tensor data lives on the server).
2. **CJ** fully *downloads* every tensor from the server into its own
heap, materialising the complete model in CJ memory.
3. **CJ** re-serializes the model for the subprocess, creating a *new*
download transaction — the subprocess then downloads from CJ.
For large models, this means:
- **CJ peak memory = full model size** (potentially 100s of GB).
- **Two full network transfers**: server → CJ, then CJ → subprocess.
- **CJ becomes a throughput bottleneck** and an OOM risk for any model
that doesn't fit in the CJ process's memory.
This is why such workflows are infeasible for any model larger than
what the CJ machine can hold.
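The relay cost quoted above can be sanity-checked with back-of-the-envelope arithmetic. This is an illustrative cost model, not NVFlare code:

```python
# Rough cost model of the pre-pass-through relay path.
# The 70B / fp16 figures mirror the example above.
BYTES_PER_PARAM_FP16 = 2

def relay_costs(num_params: int) -> dict:
    model_bytes = num_params * BYTES_PER_PARAM_FP16
    return {
        "cj_peak_memory_gb": model_bytes / 1e9,  # CJ materialises the full model
        "network_transfers": 2,                  # server -> CJ, then CJ -> subprocess
        "total_bytes_moved": 2 * model_bytes,
    }

costs = relay_costs(70_000_000_000)  # 70B-parameter fp16 model
print(costs["cj_peak_memory_gb"])    # 140.0
```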
---
With `FOBSContextKey.PASS_THROUGH` enabled on CJ's cell FOBS context,
the data path becomes:
```
FL Server ──stream──▶ CJ (LazyDownloadRef only, no tensor data)
└──forward ref──▶ Subprocess
└──download──▶ FL Server
```
CJ holds **only lightweight placeholders** (< 100 bytes per tensor). The
subprocess downloads each tensor directly from the FL server — CJ is
never involved in the tensor data path.
**`FOBSContextKey.PASS_THROUGH`** (`nvflare/fuel/utils/fobs/__init__.py`)

A new context key that signals `ViaDownloaderDecomposer` to skip the
download step and create lazy placeholders instead.
**`LazyDownloadRef`** (`nvflare/fuel/utils/fobs/decomposers/via_downloader.py`)

A small sentinel object (four fields: `fqcn`, `ref_id`, `item_id`,
`dot`) created by `recompose()` in PASS_THROUGH mode. It carries the
original FL server's FQCN, batch ref_id, intra-batch item ID, and Datum
Object Type — everything the subprocess needs to download the tensor
directly.
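The PR text does not reproduce the class body; a minimal sketch consistent with the description (four `__slots__` fields, tiny footprint) could look like this, with the constructor signature being an assumption:

```python
import sys

class LazyDownloadRef:
    """Lightweight placeholder held by CJ instead of tensor data (sketch)."""
    __slots__ = ("fqcn", "ref_id", "item_id", "dot")

    def __init__(self, fqcn: str, ref_id: str, item_id: int, dot: int):
        self.fqcn = fqcn        # FQCN of the original FL server cell
        self.ref_id = ref_id    # download batch reference on the server
        self.item_id = item_id  # position of this tensor within the batch
        self.dot = dot          # Datum Object Type, used to pick the handler

ref = LazyDownloadRef("server.cj1", "batch-42", 0, 1)
# __slots__ means no per-instance __dict__, so the footprint stays tiny
print(sys.getsizeof(ref))  # a few dozen bytes, not the tensor payload
```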
**`_LazyBatchInfo`** (`nvflare/fuel/utils/fobs/decomposers/via_downloader.py`)

A named sentinel stored in `fobs_ctx[items_key]` during PASS_THROUGH
receive. Using a typed class (rather than a plain tuple) makes the
PASS_THROUGH branch unambiguous and immune to accidental type collisions
with real item dicts.
**`LazyDownloadRefDecomposer`** (`nvflare/fuel/utils/fobs/decomposers/via_downloader.py`)

A new auto-registered FOBS decomposer for `LazyDownloadRef`. When CJ
re-serializes a task containing `LazyDownloadRef` objects:
- **`decompose()`** delegates to `get_dot_handler(lazy.dot)` — the
original `ViaDownloaderDecomposer` subclass (e.g., `TensorDecomposer`,
`NumpyArrayDecomposer`). That handler's `_finalize_lazy_batch`
post-callback re-emits the *original* server datum (fqcn + ref_id + DOT)
so the subprocess knows exactly where to download from. `lazy_dot` is
appended to the encoding dict for routing on the receive side.
- **`recompose()`** uses `lazy_dot` to look up the handler and delegates
to `handler.recompose()`, which retrieves the real tensor from
`fobs_ctx[handler.items_key]` (populated by `process_datum()` when the
subprocess received the forwarded datum).
The `dot` (Datum Object Type) field on both `LazyDownloadRef` and
`_LazyBatchInfo` ensures that numpy arrays stay with
`NumpyArrayDecomposer` and PyTorch tensors stay with `TensorDecomposer`,
preserving type safety through the full pass-through hop.
**`ClientAPILauncherExecutor.initialize()`** (`client_api_launcher_executor.py`)
On startup, the executor enables PASS_THROUGH on the engine cell's FOBS
context:
```python
cell.core_cell.update_fobs_context({FOBSContextKey.PASS_THROUGH: True})
```
This single line activates the full B1 architecture for every job that
uses `launch_external_process=True` — including `llm_hf` and any recipe
that calls `ScriptRunner(launch_external_process=True)`.
---
The pipe (CellPipe) operates on already-serialized bytes. Intercepting
at the pipe level would require parsing FOBS binary format, re-writing
datum references, and re-assembling the byte stream — fragile and
tightly coupled to the wire format.
Intercepting at the FOBS decomposer level is the natural extension
point: decomposers already control exactly when and how data is
materialised. PASS_THROUGH simply adds a "don't materialise" branch to
that existing mechanism.
The subprocess must know *which* `ViaDownloaderDecomposer` subclass owns
the downloaded data so it can store it in the correct
`fobs_ctx[items_key]` and route `recompose()` correctly. The `dot`
field, set when the server originally serialized the tensor, carries
this type information through the pass-through hop without any
type-switching logic.
| Stage | Before (tensor materialised) | After (B1 pass-through) |
|-------|------------------------------|-------------------------|
| CJ receive | Full model size (e.g., 140 GB) | ~100 bytes per tensor |
| CJ forward | Creates new download tx | Re-emits original server datum |
| Subprocess receive | Downloads from CJ | Downloads directly from FL server |
---
1. **Zero tensor copy at CJ** — CJ memory footprint is independent of
model size.
2. **One network transfer** instead of two — tensors travel server →
subprocess directly.
3. **No CJ OOM risk** for large models regardless of CJ machine memory
capacity.
4. **Transparent to job authors** — no changes to job configs, training
scripts, or recipe APIs; `launch_external_process=True` automatically
activates B1.
5. **Type-safe** — `dot` propagation preserves tensor type (numpy /
pytorch) through the hop without any if/elif type switching.
---
- All existing jobs using `launch_external_process=True` automatically
benefit. No config or script changes required.
- Jobs using `launch_external_process=False` (in-process executor) are
completely unaffected — `ClientAPILauncherExecutor.initialize()` is not
called.
- For models smaller than the ViaDownloaderDecomposer streaming
threshold (2 MB per array), FOBS uses native (inline) serialization
regardless of `PASS_THROUGH` — behaviour is identical to before.
- `LazyDownloadRefDecomposer` is auto-registered via the existing
`register_folder` mechanism; no explicit registration call is needed by
any caller.
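The size cutoff mentioned above can be modelled as a simple predicate. The 2 MB figure comes from the note; the function name and path labels are illustrative, not NVFlare code:

```python
STREAM_THRESHOLD_BYTES = 2 * 1024 * 1024  # 2 MB per array, per the note above

def serialization_path(array_nbytes: int, pass_through: bool) -> str:
    """Pick the FOBS path for one array (illustrative model)."""
    if array_nbytes < STREAM_THRESHOLD_BYTES:
        return "inline"  # native serialization; PASS_THROUGH is irrelevant here
    return "pass_through" if pass_through else "download_via_cj"

assert serialization_path(1024, pass_through=True) == "inline"
assert serialization_path(8 * 1024 * 1024, pass_through=True) == "pass_through"
assert serialization_path(8 * 1024 * 1024, pass_through=False) == "download_via_cj"
```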
---
| File | Change |
|------|--------|
| `nvflare/fuel/utils/fobs/__init__.py` | Add `FOBSContextKey.PASS_THROUGH` |
| `nvflare/fuel/utils/fobs/decomposers/via_downloader.py` | Add `LazyDownloadRef`, `_LazyBatchInfo`, PASS_THROUGH branches in `process_datum()` / `recompose()`, `LazyDownloadRefDecomposer`, `_finalize_lazy_batch` post-callback |
| `nvflare/app_common/executors/client_api_launcher_executor.py` | `initialize()` enables PASS_THROUGH on engine cell |
---
(22 tests)
| Test class | What is verified |
|------------|------------------|
| `TestLazyDownloadRef` | Construction, `__slots__`, per-item distinctness |
| `TestLazyBatchInfo` | Construction, `__slots__`, `isinstance` reliability vs plain tuple |
| `TestProcessDatumPassThrough` | PASS_THROUGH stores `_LazyBatchInfo`, never calls `_download_from_remote_cell`; normal mode calls download |
| `TestRecomposePassThrough` | Returns `LazyDownloadRef` with correct `fqcn`, `ref_id`, `item_id` from `_LazyBatchInfo` |
| `TestDecomposeWithLazyDownloadRef` | Returns REF encoding; `_finalize_lazy_batch` post-callback registered once per batch regardless of item count; emitted datum has correct fqcn/ref_id/DOT |
| `TestNoMemoryAccumulation` | `_CtxKey.OBJECTS` absent after PASS_THROUGH (no download transaction opened); `DownloadService._tx_table` unchanged; 50-cycle repeat produces no state bleed |
`tests/unit_test/fuel/f3/streaming/test_pass_through_e2e.py` (5 tests,
real TCP Cells)
| Test | What is verified |
|------|------------------|
| `test_arrays_survive_pass_through_hop` | Full round-trip: server → CJ (PASS_THROUGH) → subprocess; arrays arrive bit-exact |
| `test_cj_holds_only_lazy_refs_not_tensor_data` | After PASS_THROUGH deserialization, CJ holds only `LazyDownloadRef`, never `np.ndarray` |
| `test_cj_creates_no_download_transaction` | `DownloadService._tx_table` is unchanged during PASS_THROUGH + re-serialization |
| `test_forwarded_payload_carries_original_server_ref` | Forwarded datum contains original server `fqcn` and `ref_id`; subprocess downloads from server, not CJ |
| `test_multiple_array_roundtrip` | All arrays in an 8-array batch survive with bit-exact values |
`tests/integration_test/data/jobs/pt_large_model_pass_through/`
Full-stack integration test using `PTClientAPILauncherExecutor` with
`launch_once=True` (the pattern used by `llm_hf`):
- **Model**: `LargeNet` — 3-layer MLP with ~8 MB of float32 parameters,
well above the 2 MB ViaDownloaderDecomposer streaming threshold. This
forces the real B1 code path (streaming + PASS_THROUGH) rather than the
native inline path used by small models.
- **Client script**: Mirrors `llm_hf/client.py` structure (`while
flare.is_running()` loop, receive / train / send). Uses CPU-only
synthetic data — no dataset download required in CI.
- **Added to** `client_api.yml` as `"run pt-large-model-pass-through"`.
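The ~8 MB figure is easy to reproduce with parameter-count arithmetic. The layer widths below are hypothetical, since the PR only states "3-layer MLP with ~8 MB of float32 parameters":

```python
# Hypothetical layer widths for a 3-layer MLP that lands near 8 MB of fp32.
BYTES_PER_PARAM_FP32 = 4

def mlp_param_bytes(widths):
    """Total parameter bytes for dense layers widths[i] -> widths[i+1] (weights + biases)."""
    params = sum(w_in * w_out + w_out for w_in, w_out in zip(widths, widths[1:]))
    return params * BYTES_PER_PARAM_FP32

size = mlp_param_bytes([1000, 1000, 1000, 10])  # three Linear layers
print(size / 1e6)  # 8.04804 MB, comfortably above the 2 MB streaming threshold
```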
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
14 files changed (+1387, -6 lines)