Implement async generator

JasonKChow · JasonKChow · commit f3c7074e23fe · 2025-02-10T01:46:18.000-06:00
Summary: A generator that allows asynchronous generation of points. It uses a separate process to generate points. This doesn't fix the situation where model fitting takes a long time.

Test Plan: New test
diff --git a/aepsych/generators/__init__.py b/aepsych/generators/__init__.py
@@ -10,6 +10,7 @@
 from ..config import Config
 from .acqf_grid_search_generator import AcqfGridSearchGenerator
 from .acqf_thompson_sampler_generator import AcqfThompsonSamplerGenerator
+from .async_generator import AsyncGenerator
 from .epsilon_greedy_generator import EpsilonGreedyGenerator
 from .manual_generator import ManualGenerator, SampleAroundPointsGenerator
 from .optimize_acqf_generator import OptimizeAcqfGenerator
@@ -27,6 +28,7 @@
     "IntensityAwareSemiPGenerator",
     "AcqfThompsonSamplerGenerator",
     "AcqfGridSearchGenerator",
+    "AsyncGenerator",
 ]
 
 Config.register_module(sys.modules[__name__])
diff --git a/aepsych/generators/async_generator.py b/aepsych/generators/async_generator.py
@@ -0,0 +1,246 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and its affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import dataclasses
+import os
+import time
+from concurrent import futures
+from multiprocessing import get_context
+from typing import Dict, List, Optional
+
+import numpy as np
+
+import torch
+from aepsych.generators.base import AEPsychGenerator
+from aepsych.models.model_protocol import ModelProtocol
+from aepsych.utils_logging import getLogger
+
+# Module-level logger shared by everything in this file; obtained from
+# aepsych.utils_logging.getLogger.
+logger = getLogger()
+
+
+@dataclasses.dataclass
+class AsyncPoint:
+    """Dataclass to keep track of asynchronously generated points."""
+
+    point: torch.Tensor
+    generator_name: str
+    gen_time: float
+    fixed_features: Optional[Dict[int, float]] = None
+    model: Optional[ModelProtocol] = None
+    data: Optional[torch.Tensor] = dataclasses.field(init=False, default=None)
+
+    def __post_init__(self):
+        if self.model is not None:
+            self.data = self.model.train_inputs[0]
+            self.model = None
+        else:
+            self.data = None
+
+    @property
+    def data_len(self) -> int:
+        """Return the length of the data tensor."""
+        return self.data.shape[0] if self.data is not None else 0
+
+
+class AsyncGenerator(AEPsychGenerator):
+    """Generator that holds two generators. The primary generator will always
+    be sent to a different process to handle and if it cannot return within a
+    timeout, the backup generator will be used instead. In the case of timeout,
+    the other process will continue to run until the generator is called again.
+    """
+
+    def __init__(
+        self,
+        generator: AEPsychGenerator,
+        backup_generator: AEPsychGenerator,
+        timeout: float = 2.0,
+        data_diff_limit: Optional[int] = None,
+        n_pregen: int = 1,
+    ) -> None:
+        """Initialize an asynchronous generator. This holds two generators. The
+        primary generator will always be sent to a different process to handle
+        and if it cannot return within a timeout, the backup generator will be
+        used instead. In the case of timeout, the other process will continue to
+        run until the generator is called again.
+
+        WARNING: Whenever the gen() is called, a new processes will be
+        forked from the main one. This means that the generators will have the
+        exact same state (including internal RNG seeds). While we do reseed the
+        new process, any seeds within an object (like the seed inside the
+        SobolGenerator) will not be modified and thus can potentially generate
+        exactly the same points. This should be fine for OptimizeAcqfGenerators.
+
+        Args:
+            generator (AEPsychGenerator): The primary generator to use.
+            backup_generator (AEPsychGenerator): The backup generator to use if
+                the primary times out.
+            timeout (float): The timeout for the primary generator. Defaults to
+                2.0.
+            data_diff_limit (int, optional): The maximum difference in data
+                length between the model and the point to accept. If not set,
+                there would not be any limit.
+            n_pregen (int, optional): The number of points to pre-generate.
+                Defaults to 1.
+        """
+        self.generator = generator
+        self.backup_generator = backup_generator
+        self.timeout = timeout
+        self.data_diff_limit = data_diff_limit or np.inf
+        self.n_pregen = n_pregen
+        self.executor: Optional[futures.ProcessPoolExecutor] = None
+        self.futures: List[futures.Future] = []
+
+        # Populate generator class attributes based on main generator
+        self._requires_model = self.generator._requires_model
+        self.stimuli_per_trial = self.generator.stimuli_per_trial
+        self.max_asks = self.generator.max_asks
+        self.dim = self.generator.dim
+
+    def gen(
+        self,
+        num_points: int,
+        model: Optional[ModelProtocol] = None,
+        fixed_features: Optional[Dict[int, float]] = None,
+        timeout: Optional[float] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Get a point from the generator. When called, it will check if there
+        are any points being generated by the primary generator and if so, wait
+        for it to finish. If the timeout is reached, the backup generator will
+        be used instead. Whenever there is a timeout, the primary generator will
+        continue to work and the next time gen() is called, it will be checked
+        again.
+
+        Args:
+            num_points (int): The number of points to generate.
+            model (ModelProtocol, optional): The model to use for generating
+                points. Defaults to None.
+            fixed_features (Dict[int, float], optional): The fixed features to
+                use for generating points. Defaults to None.
+            timeout (float, optional): The timeout for the primary generator.
+                If not set, defaults to the class timeout.
+            **kwargs: Additional keyword arguments to pass to the generator.
+
+        Returns:
+            torch.Tensor: The generated point.
+        """
+        if self.executor is None:  # Initialize the executor
+            self.executor = futures.ProcessPoolExecutor(
+                max_workers=self.n_pregen,
+                mp_context=get_context("spawn"),
+                initializer=self._set_process_seed,
+            )
+
+        # We keep adding futures until we have enough
+        while len(self.futures) < self.n_pregen:
+            self.futures.append(
+                self.executor.submit(
+                    self._gen,
+                    num_points,
+                    model,
+                    fixed_features=fixed_features,
+                    **kwargs,
+                )
+            )
+
+        try:
+            # We return the first future that finished
+            timeout = timeout or self.timeout
+            for future in futures.as_completed(self.futures, timeout=timeout):
+                try:
+                    result = future.result()
+
+                    # Check if fixed features match
+                    if result.fixed_features != fixed_features:
+                        # Throw it out and wait for next
+                        # Heuristic to never allow a bunch of fixed to hold us back
+                        logger.info(
+                            "AsyncGenerator found mismatched fixed features, skipping."
+                        )
+                        self.futures.remove(future)
+                        continue
+
+                    if model is not None:
+                        # Check if the data used to generate is close enough
+                        if (
+                            result.data_len - model.train_inputs[0].shape[0]
+                            <= self.data_diff_limit
+                        ):
+                            self.futures.remove(future)
+                            return result.point
+                        else:
+                            logger.info(
+                                "AsyncGenerator found a point that was generated with data that is too different, skipping."
+                            )
+                            self.futures.remove(future)
+                    else:
+                        self.futures.remove(future)
+                        return result.point
+
+                except (futures.CancelledError, futures.process.BrokenProcessPool) as e:
+                    logger.error("Generator job failed")
+                    logger.error(e)
+                    self.futures.remove(future)
+                    continue
+
+            # All futures resolved but we still have no point, so we use backup
+            return self.backup_generator.gen(
+                num_points=num_points,
+                model=model,
+                fixed_features=fixed_features,
+                **kwargs,
+            )
+
+        except futures.TimeoutError:  # Timeout backup
+            logger.info("Main generator timed out, using backup generator.")
+            return self.backup_generator.gen(
+                num_points=num_points,
+                model=model,
+                fixed_features=fixed_features,
+                **kwargs,
+            )
+
+    @staticmethod
+    def _set_process_seed():
+        # Set the random seed of numpy and pytorch based on pid and time
+        seed = os.getpid() + int(time.time())
+        torch.manual_seed(seed)
+        np.random.seed(seed)
+
+    def _gen(
+        self,
+        num_points: int,
+        model: Optional[ModelProtocol] = None,
+        fixed_features: Optional[Dict[int, float]] = None,
+        **kwargs,
+    ) -> AsyncPoint:
+        # Wrapper to pass the generator to the executor and return a async
+        # point, must be static as we don't want to pickle self.
+        start = time.time()
+        point = self.generator.gen(num_points, model, fixed_features, **kwargs)
+        end = time.time()
+        async_point = AsyncPoint(
+            point=point,
+            gen_time=end - start,
+            generator_name=self.generator.__class__.__name__,
+            model=model,
+            fixed_features=fixed_features,
+        )
+
+        return async_point
+
+    def __del__(self):
+        # To shutdown executor on deletion
+        if self.executor is not None:
+            self.executor.shutdown(wait=True, cancel_futures=True)
+
+    def __getstate__(self):
+        # Need to blank exectutor/futures to be able to pickle
+        state = self.__dict__.copy()
+        state["executor"] = None
+        state["futures"] = []
+        return state
diff --git a/tests/generators/test_async_generator.py b/tests/generators/test_async_generator.py