Skip to content

Commit 91a7744

Browse files
committed
v2: WIP, improve offline algo performance
1 parent 4e60cf9 commit 91a7744

File tree

3 files changed

+58
-13
lines changed

3 files changed

+58
-13
lines changed

tianshou/data/buffer/base.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -298,10 +298,8 @@ def reset(self, keep_statistics: bool = False) -> None:
298298
if not keep_statistics:
299299
self._ep_return, self._ep_len = 0.0, 0
300300

301-
# TODO: is this method really necessary? It's kinda dangerous, can accidentally
302-
# remove all references to collected data
303301
def set_batch(self, batch: RolloutBatchProtocol) -> None:
304-
"""Manually choose the batch you want the ReplayBuffer to manage."""
302+
"""Manually choose the batch you want the ReplayBuffer to manage. Use with caution!"""
305303
assert len(batch) == self.maxsize and set(batch.get_keys()).issubset(
306304
self._reserved_keys,
307305
), "Input batch doesn't meet ReplayBuffer's data form requirement."
@@ -496,12 +494,10 @@ def add(
496494
def sample_indices(self, batch_size: int | None) -> np.ndarray:
497495
"""Get a random sample of index with size = batch_size.
498496
499-
Return all available indices in the buffer if batch_size is 0; return an empty
500-
numpy array if batch_size < 0 or no available index can be sampled.
501-
502-
:param batch_size: the number of indices to be sampled. If None, it will be set
503-
to the length of the buffer (i.e. return all available indices in a
504-
random order).
497+
:param batch_size: the number of indices to be sampled. Three cases are possible:
498+
1. positive int - sample random indices of that size
499+
2. zero - all indices in current order
500+
3. None - all indices, but in random order
505501
"""
506502
if batch_size is None:
507503
batch_size = len(self)
@@ -534,8 +530,10 @@ def sample_indices(self, batch_size: int | None) -> np.ndarray:
534530
def sample(self, batch_size: int | None) -> tuple[RolloutBatchProtocol, np.ndarray]:
535531
"""Get a random sample from buffer with size = batch_size.
536532
537-
Return all the data in the buffer if batch_size is 0.
538-
533+
:param batch_size: the number of indices to be sampled. Three cases are possible:
534+
1. positive int - sample random indices of that size
535+
2. zero - all indices in current order
536+
3. None - all indices, but in random order
539537
:return: Sample data and its corresponding index inside the buffer.
540538
"""
541539
indices = self.sample_indices(batch_size)

tianshou/policy/base.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import time
33
from abc import ABC, abstractmethod
44
from collections.abc import Callable, Mapping
5+
from copy import copy
56
from dataclasses import dataclass, field
67
from typing import TYPE_CHECKING, Any, Generic, Literal, TypeVar, cast
78

@@ -985,6 +986,47 @@ def update(
985986
)
986987

987988

989+
class OfflineAlgorithmFromOffPolicyAlgorithm(
990+
OfflineAlgorithm[TPolicy],
991+
Generic[TPolicy],
992+
ABC,
993+
):
994+
"""Base class for offline algorithms that use the same data preprocessing as an off-policy algorithm.
995+
996+
Typically used within a diamond inheritance pattern for transforming the respective off-policy algorithm
997+
into a derived offline variant. See usages.
998+
"""
999+
1000+
def __init__(
1001+
self, *, policy: TPolicy, off_policy_algorithm_class: type[OfflineAlgorithm[TPolicy]]
1002+
):
1003+
self._off_policy_algorithm_class = off_policy_algorithm_class
1004+
OfflineAlgorithm.__init__(self, policy=policy)
1005+
1006+
@override
1007+
def process_buffer(self, buffer: TBuffer) -> TBuffer:
1008+
"""Use the off-policy algorithm's batch pre-processing for processing the buffer once before training.
1009+
1010+
This implementation avoids unnecessary re-computation of preprocessing.
1011+
"""
1012+
buffer = copy(buffer)
1013+
batch, indices = buffer.sample(0)
1014+
processed_batch = self._off_policy_algorithm_class._preprocess_batch(
1015+
self, batch, buffer, indices # type: ignore[arg-type]
1016+
)
1017+
buffer.set_batch(processed_batch)
1018+
return buffer
1019+
1020+
@override
1021+
def _preprocess_batch(
1022+
self,
1023+
batch: RolloutBatchProtocol,
1024+
buffer: ReplayBuffer,
1025+
indices: np.ndarray,
1026+
) -> RolloutBatchProtocol | BatchWithReturnsProtocol:
1027+
return batch
1028+
1029+
9881030
class OnPolicyWrapperAlgorithm(
9891031
OnPolicyAlgorithm[TPolicy],
9901032
Generic[TPolicy],

tianshou/policy/imitation/td3_bc.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,16 @@
44
from tianshou.data import to_torch_as
55
from tianshou.data.types import RolloutBatchProtocol
66
from tianshou.policy import TD3
7-
from tianshou.policy.base import OfflineAlgorithm
7+
from tianshou.policy.base import (
8+
OfflineAlgorithmFromOffPolicyAlgorithm,
9+
)
810
from tianshou.policy.modelfree.ddpg import ContinuousDeterministicPolicy
911
from tianshou.policy.modelfree.td3 import TD3TrainingStats
1012
from tianshou.policy.optim import OptimizerFactory
1113

1214

1315
# NOTE: This uses diamond inheritance to convert from off-policy to offline
14-
class TD3BC(OfflineAlgorithm[ContinuousDeterministicPolicy], TD3): # type: ignore
16+
class TD3BC(OfflineAlgorithmFromOffPolicyAlgorithm[ContinuousDeterministicPolicy], TD3): # type: ignore
1517
"""Implementation of TD3+BC. arXiv:2106.06860."""
1618

1719
def __init__(
@@ -97,6 +99,9 @@ def __init__(
9799
update_actor_freq=update_actor_freq,
98100
estimation_step=estimation_step,
99101
)
102+
OfflineAlgorithmFromOffPolicyAlgorithm.__init__(
103+
self, policy=policy, off_policy_algorithm_class=TD3 # type: ignore[arg-type]
104+
)
100105
self.alpha = alpha
101106

102107
def _update_with_batch(self, batch: RolloutBatchProtocol) -> TD3TrainingStats:

0 commit comments

Comments
 (0)