NVIDIA-NeMo
diff --git a/src/nemo_safe_synthesizer/data_processing/actions/data_actions.py
Lines changed: 108 additions & 121 deletions b/src/nemo_safe_synthesizer/data_processing/actions/data_actions.py
Lines changed: 108 additions & 121 deletions
diff --git a/src/nemo_safe_synthesizer/data_processing/actions/dates.py
Lines changed: 49 additions & 10 deletions b/src/nemo_safe_synthesizer/data_processing/actions/dates.py
Lines changed: 49 additions & 10 deletions
diff --git a/src/nemo_safe_synthesizer/data_processing/actions/distributions.py
Lines changed: 19 additions & 17 deletions b/src/nemo_safe_synthesizer/data_processing/actions/distributions.py
Lines changed: 19 additions & 17 deletions
diff --git a/src/nemo_safe_synthesizer/data_processing/actions/utils.py
Lines changed: 49 additions & 51 deletions b/src/nemo_safe_synthesizer/data_processing/actions/utils.py
Lines changed: 49 additions & 51 deletions
@@ -1,6 +1,13 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+"""Date string parsing, formatting, and inference utilities.
+
+Supports ISO8601 timezone offsets (via ``strftime_extra`` / ``strptime_extra``),
+permutation-based format inference (``parse_date``, ``infer_from_series``),
+and date randomization for PII replacement (``randomize``).
+"""
+
 import itertools
 import re
 from collections import Counter
@@ -126,15 +133,16 @@ def strptime_extra(date_string: str, fmt: str) -> datetime:
 
 
 def date_component_permutations() -> list[tuple[str, str, str, str, str]]:
-    """Returns a list of string formats by component type. Each permutation is
-    indexed by y, m, d, hms, tz and can be passed into component formatter from
-    ``date_component_orders``.
+    """Return the Cartesian product of per-component format strings.
+
+    Each tuple is indexed by (year, month, day, hms, tz) and can be
+    passed into a formatter from ``date_component_orders``.
     """
     return list(itertools.product(*component_formats.values()))  # type:ignore
 
 
 def gen_date_str_fmt_permutations() -> set[str]:
-    """Returns a list of unique date string format permutations"""
+    """Return the set of all unique date format permutations."""
     return {order(*str_fmt) for str_fmt in date_component_permutations() for order in date_component_orders}
 
 
@@ -284,6 +292,7 @@ def tokenize_date_str(input: str) -> TokenizedStr:
 
 
 def maybe_match(date, format) -> Optional[datetime]:
+    """Attempt to parse ``date`` with ``format``, returning None on failure."""
     try:
         return strptime_extra(date, format)
     except ValueError:
@@ -294,13 +303,15 @@ def parse_date(
     input_date: str,
     date_str_fmts: list[str] | set[str] = date_str_fmt_permutations,
 ) -> Optional[ParsedDate]:
+    """Parse a date string and return the first matching ``ParsedDate``, or None."""
     return next(parse_date_multiple(input_date, date_str_fmts), None)
 
 
 def parse_date_multiple(
     input_date: str,
     date_str_fmts: list[str] | set[str] = date_str_fmt_permutations,
 ) -> Iterator[ParsedDate]:
+    """Yield all valid ``ParsedDate`` interpretations of ``input_date`` across known formats."""
     tokenized_date = tokenize_date_str(input_date)
 
     for str_fmt in date_str_fmts:
@@ -335,28 +346,31 @@ def randomize(date: str, days: int) -> Optional[str]:
 
 
 def d_str_to_fmt_multiple(input_date: str) -> Iterator[str]:
-    """Infers all likely date format from a date string."""
+    """Yield all plausible ``strftime`` format strings for a date string."""
     for parsed_date in parse_date_multiple(input_date):
         yield parsed_date.fmt_str
 
 
 def maybe_d_str_to_fmt_multiple(input_date: str) -> Iterator[str]:
-    """Infers all likely date format from a date string or nothing."""
+    """Like ``d_str_to_fmt_multiple`` but stops silently when a ``ValueError`` is raised."""
     try:
         yield from d_str_to_fmt_multiple(input_date)
     except ValueError:
         pass
 
 
 def d_str_to_fmt(input_date: str) -> Optional[str]:
-    """Infers a date format from a date string."""
+    """Return the first ``strftime`` format string inferred for a date string, or None."""
     return next(d_str_to_fmt_multiple(input_date), None)
 
 
 def infer_from_series(date_series: Iterable[str]) -> Optional[str]:
-    """An inference on a single date string isn't always perfect. Sometimes we mix
-    up format likes %m and %d. ``infer_from_series`` will evaluate a series of dates
-    and return the best date format for the series.
+    """Infer the best ``strftime`` format for a series of date strings.
+
+    Evaluates each date against all known format permutations and returns
+    the most frequently matched format. This is more reliable than
+    single-string inference, which can confuse ambiguous components like
+    ``%m`` and ``%d``.
     """
     fmt_occurrences = Counter()
     for date in date_series:
@@ -371,6 +385,21 @@ def fit_and_transform_dates(
     df: pd.DataFrame,
     inplace: bool = False,
 ) -> tuple[dict[str, dict[str, str]], pd.DataFrame]:
+    """Detect date columns, convert them to elapsed seconds, and record the transformation.
+
+    For each object-typed column, samples values to infer a date format. If
+    successful, converts the column to seconds elapsed since the column minimum
+    and records the format and min date for later reversal.
+
+    Args:
+        df: Input DataFrame.
+        inplace: If True, mutate ``df`` directly instead of copying.
+
+    Returns:
+        A tuple of (date_min_dict, result_df). ``date_min_dict`` maps column
+        names to ``{"format": ..., "min": ...}`` dicts that
+        ``transform_dates`` uses to re-apply the transformation.
+    """
     date_min_dict = {}
     object_cols = [col for col, col_type in df.dtypes.iteritems() if col_type == "object"]
     result_df = df.copy() if not inplace else df
@@ -396,6 +425,16 @@ def fit_and_transform_dates(
 
 
 def transform_dates(dates: dict[str, dict[str, str]], df: pd.DataFrame) -> pd.DataFrame:
+    """Apply a previously fitted date-to-seconds transformation to a DataFrame.
+
+    Args:
+        dates: Mapping from column names to ``{"format": ..., "min": ...}``
+            dicts as returned by ``fit_and_transform_dates``.
+        df: DataFrame to transform.
+
+    Returns:
+        A copy of ``df`` with date columns converted to elapsed seconds.
+    """
     result_df = df.copy()
     for col, details in dates.items():
         _dates = pd.to_datetime(result_df[col], format=details["format"], errors="coerce")
 
@@ -1,6 +1,14 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+"""Statistical distribution models for sampling numeric and datetime values.
+
+Provides ``Distribution`` (float-valued) and ``DatetimeDistribution``
+hierarchies, each with Gaussian and Uniform concrete implementations.
+Pydantic discriminated unions (``DistributionT``, ``DatetimeDistributionT``)
+allow YAML/JSON configs to select the distribution type via ``distribution_type``.
+"""
+
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
@@ -13,30 +21,24 @@
 
 
 class Distribution(BaseModel, ABC):
-    """
-    Abstract base class representing a distribution.
-    Child classes should specify whichever arguments are needed
-    to properly parametrize their distribution.
+    """Abstract base for float-valued distributions.
+
+    Subclasses specify the parameters needed to define their distribution
+    and implement ``sample`` to draw values.
     """
 
     @abstractmethod
     def sample(self, num_records: int) -> list[Any]: ...
 
 
 class DatetimeDistribution(BaseModel, ABC):
-    """
-    This class is separate from the `Distribution` ABC above
-    because datetimes need slightly different handling than floats.
-    Providing this separate class hierarchy also makes it easier
-    in pydantic to specify what datatypes we expect in the distribution
-    parameters (float vs datetime), as well as dt-specific arguments.
-
-    In practice, this means creating a "copy" `DatetimeDistribution`
-    for each regular `Distribution` where it makes sense. We could probably
-    automate some of this with generics, but IMO that'd just make it confusing
-    to read. We're still able to reuse the original `Distribution` class most
-    of the time in `DatetimeDistribution`, making the only business logic
-    really be about how we want to translate dates --> floats.
+    """Abstract base for datetime-valued distributions.
+
+    Separate from ``Distribution`` because datetime parameters (``datetime``,
+    ``timedelta``) differ from floats, and pydantic validation benefits from
+    distinct type hierarchies. Subclasses implement ``sample_datetimes`` to
+    produce raw datetime samples; universal post-processing (rounding via
+    ``precision``, formatting via ``format``) is applied by ``sample``.
     """
 
     precision: Optional[timedelta] = None
 
@@ -1,6 +1,13 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+"""Shared utilities for the data actions framework.
+
+Provides ``ActionCtx`` (execution context with state and dependency injection),
+``TransformsUtil`` (wrapper around the transforms_v2 engine), helper types
+(``MetadataColumns``, ``TransformsUpdate``), and subclass-discovery functions.
+"""
+
 from __future__ import annotations
 
 import inspect
@@ -35,24 +42,25 @@
 
 
 def type_alias_fn(field_name: str) -> str:
-    """
-    This alias fn allows `type_` to be parsed as `type` from config yaml. We use `type_`
-    in the actual python objects so it doesn't conflict with the python builtin `type()`.
-    """
+    """Pydantic alias generator that maps ``type_`` to ``type`` for YAML compatibility."""
     if field_name == "type_":
         return "type"
 
     return field_name
 
 
 class MetadataColumns(StrEnum):
-    INDEX = "__gretel__idx"  # used in validation to maintain a mapping to pre-transformed records
-    REJECT_REASON = (
-        "__gretel_reject_reason"  # used in validation to attach model_metadata about why the row was rejected
-    )
+    """Internal column names injected during validation phases."""
+
+    INDEX = "__nss__idx"
+    """Temporary index for mapping back to pre-transformed records."""
+
+    REJECT_REASON = "__nss_reject_reason"
+    """Reason a row was rejected during batch validation."""
 
 
 def remove_metadata_columns_from_df(df: pd.DataFrame):
+    """Drop all ``MetadataColumns`` from the DataFrame in-place."""
     metadata_cols = [col.value for col in MetadataColumns]
 
     columns_to_drop = [col for col in metadata_cols if col in df.columns]
@@ -63,6 +71,7 @@ def remove_metadata_columns_from_df(df: pd.DataFrame):
 
 
 def remove_metadata_columns_from_records(records: list[dict]) -> list[dict]:
+    """Return a copy of each record dict with ``MetadataColumns`` keys removed."""
     metadata_cols = [col.value for col in MetadataColumns]
 
     new_records: list[dict] = []
@@ -73,20 +82,18 @@ def remove_metadata_columns_from_records(records: list[dict]) -> list[dict]:
 
 
 class TransformsUpdate(BaseModel):
-    """
-    `transforms_v2` takes in untyped `dicts`, but this model adds a little
-    bit of structure for better validation.
-    """
+    """Typed wrapper for a single transforms_v2 update step."""
 
-    name: str
-    value: str
-    position: Optional[int] = None
+    name: str = Field(description="Target column name for the update.")
+    value: str = Field(description="Jinja expression evaluated by the transforms_v2 engine.")
+    position: Optional[int] = Field(default=None, description="Column insertion index when adding a new column.")
 
 
 class TransformsUtil:
-    """
-    Simple helper class to manage an instance of a TV2 `Environment` and some methods
-    to run `Step`s on input data.
+    """Wrapper around a transforms_v2 ``Environment`` for executing column updates and drop conditions.
+
+    Args:
+        seed: Random seed passed to the underlying ``Environment``.
     """
 
     def __init__(self, seed: Optional[int] = None) -> None:
@@ -148,15 +155,24 @@ def execute_drop_condition(self, batch: pd.DataFrame, conditions: list) -> pd.Da
 
 
 class DataSource(BaseModel, ABC):
+    """Abstract base for pluggable data sources used by ``GenDataSource`` actions.
+
+    Subclasses implement ``generate_data`` to populate a column in an existing
+    DataFrame. ``generate_records`` is a convenience wrapper that creates an
+    empty DataFrame first.
+    """
+
     model_config = ConfigDict(alias_generator=type_alias_fn)
 
     _ctx: ActionCtx = PrivateAttr()
 
     def with_ctx(self, ctx: ActionCtx) -> Self:
+        """Attach an ``ActionCtx`` and return self for chaining."""
         self._ctx = ctx
         return self
 
     def generate_records(self, num_records: int, col: str = "newcol") -> list[dict[Hashable, Any]]:
+        """Generate records as a list of dicts without an existing DataFrame."""
         df = pd.DataFrame(index=range(num_records))
         return self.generate_data(df, col).to_dict("records")
 
@@ -191,16 +207,12 @@ def generate_data(self, df: pd.DataFrame, col: str = "newcol") -> pd.DataFrame:
 
 
 def is_abstract(c: Any) -> bool:
-    """
-    This checks the two common ways that classes indicate themselves
-    as abstract; they either have `@abstractmethod`s, or they explicitly
-    inherit from `ABC` (or the metaclass). This checks both of these.
-    """
+    """Return True if the class has abstract methods or directly inherits ``ABC``."""
     return inspect.isabstract(c) or ABC in c.__bases__
 
 
 def all_subclasses(klass: type[T]) -> set[type[T]]:
-    """Grab all of the recursive subclasses of `klass`."""
+    """Recursively collect all subclasses of ``klass``."""
     subclasses: set[type[T]] = set()
     subclass_queue = [klass]
     while subclass_queue:
@@ -213,23 +225,17 @@ def all_subclasses(klass: type[T]) -> set[type[T]]:
 
 
 def concrete_subclasses(klass: type[T]) -> set[type[T]]:
-    """
-    Find all the subclasses of `klass`, then filter out the abstract
-    subclasses.
-
-    This is useful for passing in a very abstract parent class
-    like `BaseAction`, and finding all of the potential children
-    of that `klass`. Some of these children themselves might be abstract,
-    so we should filter those out.
+    """Return all non-abstract recursive subclasses of ``klass``.
 
-    This function is likely used to feed information to `pydantic` about
-    which potential concrete classes exist for purposes of validation and
-    schema generation.
+    Used by pydantic discriminated unions (e.g., ``ActionT``) to
+    auto-discover instantiable action types for validation and schema
+    generation.
     """
     return set(c for c in all_subclasses(klass) if not is_abstract(c))
 
 
 def guess_datetime_format(datetime_str: str) -> Optional[str]:
+    """Infer a ``strftime``-compatible format string from a date string, or None."""
     # TODO: use `pandas.tseries.api.guess_datetime_format` in the future?
     format = parse_date(datetime_str)
     if format is None:
@@ -238,25 +244,17 @@ def guess_datetime_format(datetime_str: str) -> Optional[str]:
 
 
 class ActionCtx(BaseModel):
-    """
-    Context available during all action execution. This object
-    can be used for some state specific to the execution,
-    as well as dependency injection for external services in the future.
-    """
+    """Execution context shared across all action invocations.
 
-    seed: Optional[int] = None
-    """
-    Seed used for all random generation tasks
+    Provides a random seed, a state dictionary for cross-phase communication,
+    and a lazily-initialized ``TransformsUtil`` for expression evaluation.
     """
 
-    state: dict[str, str] = {}
-    """
-    Used for tracking state across multiple action invocations.
-    This is important for actions which might have multiple functions
-    which need to remember information in latter invocations. For example,
-    a `postprocessing` function might benefit from information persisted
-    inside a `preprocessing` function.
-    """
+    seed: Optional[int] = Field(default=None, description="Seed used for all random generation tasks.")
+
+    state: dict[str, str] = Field(
+        default={}, description="Per-action state persisted across phases (keyed by BaseAction.hash())."
+    )
 
     def __init__(self, /, **data: Any) -> None:
         super().__init__(**data)