Skip to content
This repository was archived by the owner on Apr 30, 2026. It is now read-only.

Commit a4cb195

Browse files
committed
feat: add parallelization for embedding generation; fix floating-point handling of the subset-sizes parameter
Signed-off-by: Oleg Silkin <97077423+RobotSail@users.noreply.github.com>
1 parent 4ebd4df commit a4cb195

4 files changed

Lines changed: 255 additions & 74 deletions

File tree

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ langchain-text-splitters
1313
openai>=1.13.3,<2.0.0
1414
sentencepiece>=0.2.0
1515
# Note: this dependency has to be built from source
16-
submodlib @ git+https://github.com/decile-team/submodlib.git
16+
submodlib-py==0.0.1
1717
tabulate>=0.9.0
1818

1919
# Note: this dependency goes along with langchain-text-splitters and may be
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Standard
2+
import importlib
3+
4+
5+
def get_encoder_class(encoder_type: str):
    """Resolve and return the encoder class for a given encoder type.

    The class name is derived from the type (e.g. ``'arctic'`` ->
    ``ArcticEmbedEncoder``) and looked up in the matching
    ``<encoder_type>_encoder`` module.

    Args:
        encoder_type: Short name of the encoder family, e.g. ``'arctic'``.

    Returns:
        The encoder class object exported by the corresponding module.

    Raises:
        ValueError: If no module/class exists for ``encoder_type``.
    """
    # Derive names up front so the try block only covers the lookups that
    # can actually fail.
    # NOTE(review): str.capitalize() lowercases everything after the first
    # character, so multi-word types (e.g. 'bge_m3') would not map to a
    # CamelCase class name — confirm all supported types are single words.
    class_name = f"{encoder_type.capitalize()}EmbedEncoder"
    # NOTE(review): this absolute path embeds the repo layout ('sdg.src.…');
    # presumably it only resolves when running from the repository root —
    # verify it still imports once the package is installed.
    module_name = f"sdg.src.instructlab.sdg.encoders.{encoder_type}_encoder"

    try:
        encoder_module = importlib.import_module(module_name)
        return getattr(encoder_module, class_name)
    except (ImportError, AttributeError) as e:
        raise ValueError(f"Unsupported encoder type: '{encoder_type}'") from e

src/instructlab/sdg/encoders/arctic_encoder.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -72,17 +72,22 @@ def __init__(
7272
f"Model {model_name} not supported. Supported models: {list(MODEL_CONFIGS.keys())}"
7373
)
7474

75-
device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
76-
num_gpus = torch.cuda.device_count()
77-
batch_size = MODEL_CONFIGS[model_name]["batch_size"]
78-
batch_size = batch_size * num_gpus if num_gpus > 0 else batch_size
75+
# Use the provided device or default to CUDA
76+
self.device = device or torch.device(
77+
"cuda" if torch.cuda.is_available() else "cpu"
78+
)
79+
80+
# Get device ID for logging
81+
self.device_id = self.device.index if hasattr(self.device, "index") else 0
7982

83+
# We don't need multi-GPU inside this encoder instance since each instance
84+
# will run on a dedicated GPU
8085
self.cfg = EncoderConfig(
8186
model_name=model_name,
8287
model_config=MODEL_CONFIGS[model_name],
83-
device=device,
84-
num_gpus=num_gpus,
85-
batch_size=batch_size,
88+
device=self.device,
89+
num_gpus=1, # Only use 1 GPU per encoder instance
90+
batch_size=MODEL_CONFIGS[model_name]["batch_size"],
8691
use_default_instruction=use_default_instruction,
8792
use_fp16=use_fp16,
8893
testing_mode=testing_mode,
@@ -91,7 +96,7 @@ def __init__(
9196
self._initialize_model()
9297

9398
def _initialize_model(self) -> None:
94-
"""Initialize model."""
99+
"""Initialize model on the specific GPU."""
95100
home_dir = os.path.expanduser("~")
96101
model_path = os.path.join(
97102
home_dir, ".cache", "instructlab", "models", self.cfg.model_name
@@ -128,11 +133,9 @@ def _initialize_model(self) -> None:
128133
self.model = self.model.half()
129134

130135
self.model = self.model.to(self.cfg.device)
136+
logger.info(f"Model loaded on device: {self.cfg.device}")
131137

132-
if self.cfg.num_gpus > 1:
133-
logger.info(f"Using {self.cfg.num_gpus} GPUs")
134-
self.model = torch.nn.DataParallel(self.model)
135-
138+
# No need for DataParallel since we're running one encoder per GPU
136139
self.model.eval()
137140

138141
def _prepare_inputs(

0 commit comments

Comments
 (0)