1 | 1 | # Standard |
2 | 2 | from dataclasses import dataclass, field |
3 | 3 | from multiprocessing import Pool |
4 | | -from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar, Union |
| 4 | +from typing import Any, Dict, List, TypedDict, TypeVar, Union |
5 | 5 | import gc |
6 | 6 | import glob |
7 | 7 | import logging |
@@ -276,60 +276,6 @@ def get_subset_name(self, size_spec: Union[int, float], actual_size: int) -> str |
276 | 276 | return f"percent_{size_spec:.1f}" |
277 | 277 | return f"samples_{actual_size}" |
278 | 278 |
279 | | - def get_last_processed_batch(self, output_dir: str) -> Tuple[int, Optional[str]]: |
280 | | - """ |
281 | | - Retrieves the last processed batch number and its file path from the output directory. |
282 | | -
283 | | - Args: |
284 | | - output_dir (str): The directory where batch files are stored. |
285 | | -
286 | | - Returns: |
287 | | - Tuple[int, Optional[str]]: The last batch number and the corresponding batch file path. |
288 | | - """ |
289 | | - batch_files = glob.glob(os.path.join(output_dir, "batch_*.h5")) |
290 | | - if not batch_files: |
291 | | - return -1, None |
292 | | - |
293 | | - # Sort batch files by batch number |
294 | | - batch_files.sort(key=self.extract_batch_number) |
295 | | - max_batch_file = batch_files[-1] |
296 | | - max_batch_number = self.extract_batch_number(max_batch_file) |
297 | | - |
298 | | - # Return the max batch number and the corresponding batch file path |
299 | | - return max_batch_number, max_batch_file |
300 | | - |
301 | | - @retry_on_exception |
302 | | - def process_batch(self, batch_texts: List[str], output_file: str) -> Optional[int]: |
303 | | - """ |
304 | | - Processes a batch of texts by generating embeddings and saving them to a file. |
305 | | - Returns the embedding dimension or None if no embeddings were generated. |
306 | | - """ |
307 | | - embeddings = ( |
308 | | - self.encoder.encode( |
309 | | - inputs=batch_texts, |
310 | | - instruction=self.config.encoder.instruction, |
311 | | - ) |
312 | | - .cpu() |
313 | | - .numpy() |
314 | | - ) |
315 | | - |
316 | | - if embeddings.size == 0: |
317 | | - logger.warning( |
318 | | - f"No embeddings generated for batch, skipping file {output_file}" |
319 | | - ) |
320 | | - return None |
321 | | - |
322 | | - embedding_dim = int(embeddings.shape[1]) # Cast to int |
323 | | - logger.info(f"Embedding dimension for batch: {embedding_dim}") |
324 | | - |
325 | | - with h5py.File(output_file, "w") as h5f: |
326 | | - h5f.create_dataset( |
327 | | - "embeddings", data=embeddings, dtype="float32", chunks=True |
328 | | - ) |
329 | | - h5f.flush() |
330 | | - |
331 | | - return embedding_dim |
332 | | - |
333 | 279 | @retry_on_exception |
334 | 280 | def generate_embeddings(self, dataset, output_dir: str) -> str: |
335 | 281 | """ |
@@ -399,104 +345,6 @@ def generate_embeddings(self, dataset, output_dir: str) -> str: |
399 | 345 |
400 | 346 | return merged_path |
401 | 347 |
402 | | - def extract_batch_number(self, filename): |
403 | | - """ |
404 | | - Extracts the batch number from the filename. |
405 | | - Assumes the filename is in the format 'batch_<number>.h5'. |
406 | | -
407 | | - Args: |
408 | | - filename (str): The filename from which to extract the batch number. |
409 | | -
410 | | - Returns: |
411 | | - int: The batch number extracted from the filename. |
412 | | - """ |
413 | | - basename = os.path.basename(filename) |
414 | | - match = re.search(r"batch_(\d+)\.h5$", basename) |
415 | | - if match: |
416 | | - return int(match.group(1)) |
417 | | - raise ValueError(f"Filename {filename} does not match expected pattern.") |
418 | | - |
419 | | - def get_embedding_size_dim_from_file(self, batch_file: str) -> Tuple[int, int]: |
420 | | - """ |
421 | | - Reads the batch file to determine the embedding size (number of embeddings) and dimension. |
422 | | - """ |
423 | | - with h5py.File(batch_file, "r") as h5f: |
424 | | - if "embeddings" not in h5f: |
425 | | - raise ValueError( |
426 | | - f"The file {batch_file} does not contain 'embeddings' dataset." |
427 | | - ) |
428 | | - embeddings = h5f["embeddings"] |
429 | | - embedding_size = int(embeddings.shape[0]) # Cast to int |
430 | | - embedding_dim = int(embeddings.shape[1]) # Cast to int |
431 | | - logger.info(f"Embedding dimension from {batch_file}: {embedding_dim}") |
432 | | - return embedding_size, embedding_dim |
433 | | - |
434 | | - def merge_embeddings(self, output_dir, merged_file, total_samples): |
435 | | - """ |
436 | | - Merges all batch embedding files into a single embeddings file. |
437 | | -
438 | | - Args: |
439 | | - output_dir (str): The directory where batch embedding files are stored. |
440 | | - merged_file (str): The path to the merged embeddings file. |
441 | | - total_samples (int): The total number of samples (embeddings). |
442 | | -
443 | | - """ |
444 | | - # Find all batch files |
445 | | - batch_files = glob.glob(os.path.join(output_dir, "batch_*.h5")) |
446 | | - if not batch_files: |
447 | | - logger.warning("No batch files found to merge") |
448 | | - return |
449 | | - |
450 | | - # Sort batch files by batch number |
451 | | - batch_files.sort(key=self.extract_batch_number) |
452 | | - |
453 | | - # Retrieve embedding_dim from the first batch file |
454 | | - _, embedding_dim = self.get_embedding_size_dim_from_file(batch_files[0]) |
455 | | - |
456 | | - if os.path.exists(merged_file): |
457 | | - logger.info(f"Merged file {merged_file} already exists, skipping merge") |
458 | | - return |
459 | | - |
460 | | - logger.info( |
461 | | - f"Merging {len(batch_files)} batch files into {merged_file} with {total_samples} samples" |
462 | | - ) |
463 | | - |
464 | | - with h5py.File(merged_file, "w") as h5f_merged: |
465 | | - # Initialize the dataset in the merged file with the retrieved embedding dimension |
466 | | - embeddings_ds = h5f_merged.create_dataset( |
467 | | - "embeddings", shape=(total_samples, embedding_dim), dtype="float32" |
468 | | - ) |
469 | | - |
470 | | - start_idx = 0 |
471 | | - for batch_file in batch_files: |
472 | | - with h5py.File(batch_file, "r") as h5f_batch: |
473 | | - if "embeddings" not in h5f_batch: |
474 | | - logger.error( |
475 | | - f"File {batch_file} does not contain 'embeddings' dataset" |
476 | | - ) |
477 | | - continue |
478 | | - |
479 | | - embeddings = h5f_batch["embeddings"][:] |
480 | | - batch_size = embeddings.shape[0] |
481 | | - end_idx = start_idx + batch_size |
482 | | - |
483 | | - # Check that each file's embedding dimension matches the retrieved embedding_dim |
484 | | - if embeddings.shape[1] != embedding_dim: |
485 | | - logger.error( |
486 | | - f"Embedding dimension mismatch in {batch_file}. Expected {embedding_dim}, got {embeddings.shape[1]}" |
487 | | - ) |
488 | | - continue |
489 | | - |
490 | | - # Copy embeddings into the merged dataset |
491 | | - embeddings_ds[start_idx:end_idx] = embeddings |
492 | | - start_idx = end_idx |
493 | | - |
494 | | - # Remove the batch file after processing |
495 | | - os.remove(batch_file) |
496 | | - logger.info(f"Processed and removed {batch_file}") |
497 | | - |
498 | | - gc.collect() |
499 | | - |
500 | 348 | def select_subsets( |
501 | 349 | self, dataset_name: str, embeddings: torch.Tensor |
502 | 350 | ) -> Dict[Union[int, float], List[int]]: |