NVIDIA-NeMo
diff --git a/‎src/nemo_safe_synthesizer/config/generate.py‎
Lines changed: 46 additions & 1 deletion b/‎src/nemo_safe_synthesizer/config/generate.py‎
Lines changed: 46 additions & 1 deletion
diff --git a/‎src/nemo_safe_synthesizer/generation/processors.py‎
Lines changed: 71 additions & 20 deletions b/‎src/nemo_safe_synthesizer/generation/processors.py‎
Lines changed: 71 additions & 20 deletions
@@ -18,7 +18,47 @@
     range_validator,
 )
 
-__all__ = ["GenerateParameters"]
+__all__ = ["GenerateParameters", "ValidationParameters"]
+
+
+class ValidationParameters(Parameters, BaseModel):
+    """Configuration for record and sequence validation.
+
+    Each flag relaxes one check or enables one automatic fix when going from
+    LLM output to tabular data; all default to False (strict validation).
+    """
+
+    group_by_accept_no_delineator: Annotated[
+        bool,
+        Field(
+            title="group_by_accept_no_delineator",
+            description="Whether to accept completions without both beginning and end of sequence delineators as a single sequence.",
+        ),
+    ] = False
+
+    group_by_ignore_invalid_records: Annotated[
+        bool,
+        Field(
+            title="group_by_ignore_invalid_records",
+            description="Whether to ignore invalid records in a sequence and proceed with the valid records.",
+        ),
+    ] = False
+
+    group_by_fix_non_unique_value: Annotated[
+        bool,
+        Field(
+            title="group_by_fix_non_unique_value",
+            description="Whether to automatically fix non-unique group by values in a sequence by using the first unique value for all records.",
+        ),
+    ] = False
+
+    group_by_fix_unordered_records: Annotated[
+        bool,
+        Field(
+            title="group_by_fix_unordered_records",
+            description="Whether to automatically fix unordered records in a sequence by sorting the records.",
+        ),
+    ] = False
 
 
 class GenerateParameters(Parameters, BaseModel):
@@ -134,3 +174,8 @@ class GenerateParameters(Parameters, BaseModel):
             description="Enforce timeseries fidelity by enforcing the time series order, intervals, start and end times of the records.",
         ),
     ] = False
+
+    validation: ValidationParameters = Field(
+        description="Validation parameters controlling validation logic and automatic fixes when parsing LLM output and converting to tabular data.",
+        default_factory=ValidationParameters,
+    )
@@ -7,6 +7,7 @@
 from dataclasses import dataclass
 
 from ..config import SafeSynthesizerParameters
+from ..config.generate import ValidationParameters
 from ..data_processing.record_utils import (
     check_if_records_are_ordered,
     extract_and_validate_records,
@@ -36,8 +37,10 @@ class Processor(ABC):
         schema: JSON schema as a dictionary.
     """
 
-    def __init__(self, schema: dict):
+    def __init__(self, schema: dict, config: ValidationParameters):
         self.schema = schema
+        self.config = config  # validation / auto-fix toggles, stored for subclasses to consult
+        logger.debug(f"Initialized processor with schema={schema} and config={config}")  # NOTE(review): f-string renders the full schema eagerly, even if debug logging is off
 
     @property
     def name(self):
@@ -102,8 +105,15 @@ def _process_text_generation(self, text: str) -> ParsedResponse:
 class TimeSeriesDataProcessor(Processor):
     """Processor for time-series data generation tasks."""
 
-    def __init__(self, schema: dict, time_column: str | None, interval_seconds: int | None, time_format: str | None):
-        super().__init__(schema=schema)
+    def __init__(
+        self,
+        schema: dict,
+        config: ValidationParameters,
+        time_column: str | None,
+        interval_seconds: int | None,
+        time_format: str | None,
+    ):
+        super().__init__(schema=schema, config=config)
         if time_column is None:
             raise ValueError(
                 "time_column is required for TimeSeriesDataProcessor but was None. "
@@ -142,12 +152,13 @@ class GroupedDataProcessor(Processor):
     def __init__(
         self,
         schema: dict,
+        config: ValidationParameters,
         bos_token: str,
         eos_token: str,
         group_by: str | list[str],
         order_by: str | None = None,
     ):
-        super().__init__(schema=schema)
+        super().__init__(schema=schema, config=config)
         if isinstance(group_by, str):
             group_by = [group_by]
         self.group_by = group_by
@@ -158,12 +169,15 @@ def __init__(
     def _process_text_generation(self, text: str) -> ParsedResponse:
         """Process the output from the fine-tuned model.
 
-        For records to be valid, they must:
+        For records to be valid, they should:
             - Be in a group that is bound by BOS and EOS tokens.
             - Respect the known JSONL schema.
             - Have a unique value for the `group_by` field(s).
             - Be ordered by the `order_by` field if specified.
 
+        These requirements may be relaxed and automatically fixed depending on
+        the settings in self.config.
+
         Args:
             text: Text generated by the fine-tuned model.
 
@@ -173,6 +187,9 @@ def _process_text_generation(self, text: str) -> ParsedResponse:
         groups = extract_groups_from_jsonl_string(text, self.bos_token, self.eos_token)
         groupby_validator = "groupby"
 
+        if len(groups) == 0 and self.config.group_by_accept_no_delineator:
+            groups = [text]
+
         if len(groups) == 0:
             return ParsedResponse(
                 valid_records=[],
@@ -186,21 +203,53 @@ def _process_text_generation(self, text: str) -> ParsedResponse:
             valid, invalid, errors = extract_and_validate_records(group, self.schema)
             valid_with_str_members = [str(item) for item in valid]
 
-            # If there are any invalid records, the entire group is invalid.
-            if len(invalid) > 0:
-                invalid = valid_with_str_members + invalid
-                errors = errors + [("Invalid JSON in other groupby records", groupby_validator)] * len(valid)
-                valid = []
-
-            # The group is invalid if the set of group_by fields is not unique.
-            elif len(set([tuple(record[group_by] for group_by in self.group_by) for record in valid])) != 1:
-                valid, invalid = [], valid_with_str_members + invalid
-                errors = [("Groupby value is not unique", groupby_validator)] * len(invalid)
+            if len(valid) == 0:
+                invalid_groups.extend(invalid)
+                errors_groups.extend(errors)
+                continue
 
-            # If order_by is specified, the group is invalid if the records are not ordered.
-            elif self.order_by is not None and not check_if_records_are_ordered(valid, self.order_by):
-                valid, invalid = [], valid_with_str_members + invalid
-                errors = [("Group not ordered", groupby_validator)] * len(invalid)
+            # Handle invalid records in the group (optionally drop them and keep the valid ones).
+            if len(invalid) > 0:
+                if self.config.group_by_ignore_invalid_records:
+                    invalid = []
+                    errors = []
+                else:
+                    # Any invalid record invalidates the whole group. The demoted records are
+                    # prepended to `invalid`, so their errors are prepended too (index-aligned).
+                    invalid = valid_with_str_members + invalid
+                    errors = [("Invalid JSON in other groupby records", groupby_validator)] * len(valid) + errors
+                    valid = []
+                    invalid_groups.extend(invalid)
+                    errors_groups.extend(errors)
+                    continue
+
+            # Handle non-unique group_by values (optionally fix by using first record's values).
+            if len(set(tuple(record[gb] for gb in self.group_by) for record in valid)) != 1:
+                if self.config.group_by_fix_non_unique_value:
+                    for group_by in self.group_by:
+                        for record in valid[1:]:
+                            record[group_by] = valid[0][group_by]
+                else:
+                    # The group is invalid if the set of group_by fields is not unique.
+                    valid, invalid = [], valid_with_str_members + invalid
+                    errors = [("Groupby value is not unique", groupby_validator)] * len(invalid)
+                    valid_groups.extend(valid)
+                    invalid_groups.extend(invalid)
+                    errors_groups.extend(errors)
+                    continue
+
+            # Handle unordered records when order_by is set (optionally fix by sorting).
+            if self.order_by is not None and not check_if_records_are_ordered(valid, self.order_by):
+                if self.config.group_by_fix_unordered_records:
+                    valid.sort(key=lambda x: x[self.order_by])
+                else:
+                    # If order_by is specified, the group is invalid if the records are not ordered.
+                    valid, invalid = [], valid_with_str_members + invalid
+                    errors = [("Group not ordered", groupby_validator)] * len(invalid)
+                    valid_groups.extend(valid)
+                    invalid_groups.extend(invalid)
+                    errors_groups.extend(errors)
+                    continue
 
             valid_groups.extend(valid)
             invalid_groups.extend(invalid)
@@ -227,20 +276,22 @@ def create_processor(schema: dict, metadata: ModelMetadata, config: SafeSynthesi
     if config.time_series.is_timeseries:
         processor = TimeSeriesDataProcessor(
             schema,
+            config=config.generation.validation,
             time_column=config.time_series.timestamp_column,
             interval_seconds=config.time_series.timestamp_interval_seconds,
             time_format=config.time_series.timestamp_format,
         )
     elif config.data.group_training_examples_by:
         processor = GroupedDataProcessor(
             schema,
+            config=config.generation.validation,
             group_by=config.data.group_training_examples_by,
             order_by=config.data.order_training_examples_by,
             bos_token=metadata.prompt_config.bos_token,
             eos_token=metadata.prompt_config.eos_token,
         )
     else:
-        processor = TabularDataProcessor(schema)
+        processor = TabularDataProcessor(schema, config=config.generation.validation)
 
     logger.info(f"Initialized the {processor.name}")
     return processor