
Commit 6cbef3c

feat: Add batch get dataset content and batch flushing for large experiments (#438)
1 parent 2f2d276 commit 6cbef3c

4 files changed: +269 -58 lines changed

src/galileo/datasets.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -63,7 +63,7 @@ def __init__(self, dataset_db: DatasetDB) -> None:
         self.dataset = dataset_db
         self.config = GalileoPythonConfig.get()
 
-    def get_content(self) -> Union[None, DatasetContent]:
+    def get_content(self, starting_token: int = 0, limit: int = MAX_DATASET_ROWS) -> Union[None, DatasetContent]:
         """
         Gets and returns the content of the dataset.
         Also refreshes the content of the local dataset instance.
@@ -85,7 +85,7 @@ def get_content(self) -> Union[None, DatasetContent]:
             return None
 
         content: DatasetContent = get_dataset_content_datasets_dataset_id_content_get.sync(
-            client=self.config.api_client, dataset_id=self.dataset.id, limit=MAX_DATASET_ROWS
+            client=self.config.api_client, dataset_id=self.dataset.id, limit=limit, starting_token=starting_token
         )
 
         self.content = content
```
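
With starting_token and limit exposed, callers can walk a large dataset page by page instead of pulling MAX_DATASET_ROWS in one request. A minimal sketch, assuming a Dataset fetched via get_dataset (the dataset name and page size are illustrative):

```python
from galileo.datasets import get_dataset

dataset = get_dataset(name="my-large-dataset")  # illustrative name

starting_token = 0
page_size = 100  # illustrative page size

while True:
    content = dataset.get_content(starting_token=starting_token, limit=page_size)
    if not content or not content.rows:
        break  # empty page: no more rows
    for row in content.rows:
        ...  # process each row here
    starting_token += len(content.rows)  # advance by the rows actually returned
```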

src/galileo/experiments.py

Lines changed: 55 additions & 13 deletions
```diff
@@ -1,14 +1,15 @@
 import builtins
 import datetime
 import logging
+from sys import getsizeof
 from typing import Any, Callable, Optional, Union
 
 from attrs import define as _attrs_define
 from attrs import field as _attrs_field
 
 from galileo import galileo_context, log
 from galileo.config import GalileoPythonConfig
-from galileo.datasets import Dataset
+from galileo.datasets import Dataset, convert_dataset_row_to_record
 from galileo.experiment_tags import upsert_experiment_tag
 from galileo.jobs import Jobs
 from galileo.projects import Project, Projects
@@ -20,14 +21,18 @@
 from galileo.resources.models import ExperimentResponse, HTTPValidationError, PromptRunSettings, ScorerConfig, TaskType
 from galileo.schema.datasets import DatasetRecord
 from galileo.schema.metrics import GalileoScorers, LocalMetricConfig, Metric
-from galileo.utils.datasets import load_dataset_and_records
+from galileo.utils.datasets import create_rows_from_records, load_dataset
 from galileo.utils.logging import get_logger
 from galileo.utils.metrics import create_metric_configs
 
 _logger = get_logger(__name__)
 
 EXPERIMENT_TASK_TYPE: TaskType = 16
 
+MAX_REQUEST_SIZE_BYTES = 10 * 1024 * 1024  # 10 MB
+MAX_INGEST_BATCH_SIZE = 128
+DATASET_CONTENT_PAGE_SIZE = 1000
+
 
 @_attrs_define
 class ExperimentCreateRequest:
```
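
One caveat worth noting about the byte bound used in run_with_function below: sys.getsizeof on a list reports only the shallow size of the list object (its header plus element pointers), not the payload of the results it references, so in practice the 128-row MAX_INGEST_BATCH_SIZE bound is the one that usually triggers flushes. A quick demonstration:

```python
import sys

# ~1 MB of row payload, but getsizeof sees only the list's pointer array
rows = [{"input": "x" * 10_000} for _ in range(100)]
print(sys.getsizeof(rows))  # on the order of 1 KB, far below 10 MB
```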
```diff
@@ -148,20 +153,54 @@ def run_with_function(
         self,
         project_obj: Project,
         experiment_obj: ExperimentResponse,
-        records: builtins.list[DatasetRecord],
+        dataset_obj: Optional[Dataset],
+        records: Optional[builtins.list[DatasetRecord]],
         func: Callable,
         local_metrics: builtins.list[LocalMetricConfig],
     ) -> dict[str, Any]:
+        if dataset_obj is None and records is None:
+            raise ValueError("Either dataset_obj or records must be provided")
         results = []
         galileo_context.init(project=project_obj.name, experiment_id=experiment_obj.id, local_metrics=local_metrics)
 
         def logged_process_func(row: DatasetRecord) -> Callable:
             return log(name=experiment_obj.name, dataset_record=row)(func)
 
-        # process each row in the dataset
-        for row in records:
-            results.append(process_row(row, logged_process_func(row)))
-            galileo_context.reset_trace_context()
+        # For static records (list), process once
+        if records is not None:
+            _logger.info(f"Processing {len(records)} rows from dataset")
+            for row in records:
+                results.append(process_row(row, logged_process_func(row)))
+                galileo_context.reset_trace_context()
+                if getsizeof(results) > MAX_REQUEST_SIZE_BYTES or len(results) >= MAX_INGEST_BATCH_SIZE:
+                    _logger.info("Flushing logger due to size limit")
+                    galileo_context.flush()
+                    results = []
+        # For dataset object, paginate through content
+        elif dataset_obj is not None:
+            starting_token = 0
+            has_more_data = True
+
+            while has_more_data:
+                _logger.info(f"Loading dataset content starting at token {starting_token}")
+                content = dataset_obj.get_content(starting_token=starting_token, limit=DATASET_CONTENT_PAGE_SIZE)
+
+                if not content or not content.rows:
+                    _logger.info("No more dataset content to process")
+                    has_more_data = False
+                else:
+                    batch_records = [convert_dataset_row_to_record(row) for row in content.rows]
+                    _logger.info(f"Processing {len(batch_records)} rows from dataset")
+
+                    for row in batch_records:
+                        results.append(process_row(row, logged_process_func(row)))
+                        galileo_context.reset_trace_context()
+                        if getsizeof(results) > MAX_REQUEST_SIZE_BYTES or len(results) >= MAX_INGEST_BATCH_SIZE:
+                            _logger.info("Flushing logger due to size limit")
+                            galileo_context.flush()
+                            results = []
+
+                    starting_token += len(batch_records)
 
         # flush the logger
         galileo_context.flush()
```
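
Read in isolation, both branches above follow one pattern: accumulate results, flush when either bound trips, then start a fresh batch. A minimal standalone sketch of that pattern (process and flush are hypothetical stand-ins for process_row and galileo_context.flush):

```python
from sys import getsizeof
from typing import Any, Callable, Iterable

MAX_REQUEST_SIZE_BYTES = 10 * 1024 * 1024  # mirrors the constants above
MAX_INGEST_BATCH_SIZE = 128


def process_in_batches(rows: Iterable[Any], process: Callable[[Any], Any], flush: Callable[[], None]) -> None:
    results: list[Any] = []
    for row in rows:
        results.append(process(row))
        # flush when either the (shallow) byte bound or the row-count bound trips
        if getsizeof(results) > MAX_REQUEST_SIZE_BYTES or len(results) >= MAX_INGEST_BATCH_SIZE:
            flush()
            results = []
    flush()  # final flush for any remaining partial batch
```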
```diff
@@ -247,20 +286,22 @@ def run_experiment(
         If required parameters are missing or invalid
     """
     # Load dataset and records
-    dataset_obj, records = load_dataset_and_records(dataset, dataset_id, dataset_name)
+    dataset_obj = load_dataset(dataset, dataset_id, dataset_name)
 
     # Validate experiment configuration
     if prompt_template and not dataset_obj:
         raise ValueError("A dataset record, id, or name of a dataset must be provided when a prompt_template is used")
 
-    if function and not records:
-        raise ValueError(
-            "A dataset record, id or name of a dataset, or list of records must be provided when a function is used"
-        )
-
     if function and prompt_template:
         raise ValueError("A function or prompt_template should be provided, but not both")
 
+    records = None
+    if not dataset_obj and isinstance(dataset, list):
+        records = create_rows_from_records(dataset)
+
+    if function and not dataset_obj and not records:
+        raise ValueError("A dataset record, id, name, or a list of records must be provided when a function is used")
+
     # Get the project from the name or Id
     project_obj = Projects().get_with_env_fallbacks(id=project_id, name=project)
 
@@ -303,6 +344,7 @@
     return Experiments().run_with_function(
         project_obj=project_obj,
         experiment_obj=experiment_obj,
+        dataset_obj=dataset_obj,
        records=records,
        func=function,
        local_metrics=local_metrics,
```
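
From user code, the two paths this change routes into run_with_function might look as follows. A sketch only: the experiment, project, and dataset names are illustrative, and the keyword arguments follow the parameters visible in the diff:

```python
from galileo.experiments import run_experiment


def runner(input: str) -> str:  # toy function under test
    return input.upper()


# Path 1: a plain list of records. load_dataset returns None and
# create_rows_from_records builds the records in memory.
run_experiment(
    "batch-flush-demo",  # illustrative experiment name
    project="my-project",  # illustrative project name
    dataset=[{"input": "hello"}, {"input": "world"}],
    function=runner,
)

# Path 2: a named dataset. run_with_function receives dataset_obj and
# pages through its content DATASET_CONTENT_PAGE_SIZE rows at a time.
run_experiment(
    "batch-flush-demo-2",
    project="my-project",
    dataset_name="my-large-dataset",  # illustrative dataset name
    function=runner,
)
```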

src/galileo/utils/datasets.py

Lines changed: 42 additions & 0 deletions
```diff
@@ -24,6 +24,48 @@ def validate_dataset_in_project(
         raise ValueError(f"Dataset '{dataset_identifier}' is not used in project '{project_identifier}'")
 
 
+def load_dataset(
+    dataset: Union["Dataset", list[Union[dict[str, Any], str]], str, None],
+    dataset_id: Optional[str],
+    dataset_name: Optional[str],
+) -> Optional["Dataset"]:
+    """
+    Load dataset based on provided parameters.
+
+    Parameters
+    ----------
+    dataset:
+        Dataset object, list of records, or dataset name
+    dataset_id:
+        ID of the dataset
+    dataset_name:
+        Name of the dataset
+
+    Returns
+    -------
+    Dataset object or None
+
+    Raises
+    ------
+    ValueError
+        If no dataset information is provided or dataset doesn't exist
+    """
+    from galileo.datasets import get_dataset
+
+    if dataset_id:
+        return get_dataset(id=dataset_id)
+    if dataset_name:
+        return get_dataset(name=dataset_name)
+    if dataset and isinstance(dataset, str):
+        return get_dataset(name=dataset)
+    if dataset and not isinstance(dataset, (str, list)):
+        # Must be a Dataset object
+        return dataset
+    if dataset and isinstance(dataset, list):
+        return None
+    raise ValueError("To load dataset records, dataset, dataset_name, or dataset_id must be provided")
+
+
 def load_dataset_and_records(
     dataset: Union["Dataset", list[Union[dict[str, Any], str]], str, None],
     dataset_id: Optional[str],
```
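
The helper's resolution order, illustrated with hypothetical identifiers (positional arguments are dataset, dataset_id, dataset_name, matching the signature above):

```python
from galileo.utils.datasets import load_dataset

load_dataset(None, "ds-123", None)           # dataset_id wins: get_dataset(id="ds-123")
load_dataset(None, None, "my-dataset")       # dataset_name: get_dataset(name="my-dataset")
load_dataset("my-dataset", None, None)       # a bare string is treated as a name
load_dataset([{"input": "hi"}], None, None)  # list of records: returns None, caller builds rows
load_dataset(None, None, None)               # nothing provided: raises ValueError
```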
