Added nerc_invoicing repo for querying invoicing Iceberg table

jimmysway · jimmysway · commit a277bb10eff3 · 2026-02-18T17:53:47.000-05:00
Added functionality to get lifetime costs grouped by project

Added pyproject.toml for future publishing

Added testing for new functionality
diff --git a/process_report/data_tools/__init__.py b/process_report/data_tools/__init__.py
diff --git a/process_report/data_tools/config.py b/process_report/data_tools/config.py
@@ -0,0 +1,49 @@
+import functools
+
+from pydantic_settings import BaseSettings
+from pyiceberg.table import StaticTable
+
+
+class DataToolsSettings(BaseSettings):
+    """Iceberg warehouse path and S3 credentials for data_tools queries.
+
+    The three credential fields default to ``None`` and are only checked when
+    ``iceberg_s3_properties`` is called, so building the settings object does
+    not require them to be set.
+    """
+
+    # Warehouse root and the table's location beneath it.
+    iceberg_warehouse_base: str = "s3://nerc-invoicing-iceberg/warehouse"
+    iceberg_table_subpath: str = "nerc_invoicing_iceberg/nerc_invoicing_iceberg"
+    # S3 credentials and endpoint; validated lazily in iceberg_s3_properties().
+    iceberg_s3_access_key_id: str | None = None
+    iceberg_s3_secret_access_key: str | None = None
+    iceberg_s3_endpoint: str | None = None
+    iceberg_s3_region: str = "us-east-005"
+
+    @property
+    def table_path(self) -> str:
+        """Return the table location: the warehouse base joined to the subpath.
+
+        Slashes at the join point are normalized so that a trailing ``/`` on
+        the base or a leading ``/`` on the subpath does not produce ``//``.
+        """
+        base = self.iceberg_warehouse_base.rstrip("/")
+        subpath = self.iceberg_table_subpath.strip("/")
+        return f"{base}/{subpath}"
+
+    def iceberg_s3_properties(self) -> dict[str, str]:
+        """Return the S3 connection properties keyed by PyIceberg property name.
+
+        Returns:
+            Mapping with ``s3.access-key-id``, ``s3.secret-access-key``,
+            ``s3.endpoint`` and ``s3.region``. An endpoint given without a URL
+            scheme is prefixed with ``https://``; one that already contains
+            ``://`` is passed through unchanged.
+
+        Raises:
+            ValueError: If the access key id, secret access key, or endpoint
+                is unset or empty.
+        """
+        access_key_id = self.iceberg_s3_access_key_id
+        secret_access_key = self.iceberg_s3_secret_access_key
+        endpoint = self.iceberg_s3_endpoint
+        if not (access_key_id and secret_access_key and endpoint):
+            raise ValueError(
+                "Iceberg S3 credentials required: "
+                "ICEBERG_S3_ACCESS_KEY_ID, ICEBERG_S3_SECRET_ACCESS_KEY, ICEBERG_S3_ENDPOINT"
+            )
+        # Avoid producing "https://https://host" when the configured endpoint
+        # already carries a scheme.
+        if "://" not in endpoint:
+            endpoint = f"https://{endpoint}"
+        return {
+            "s3.access-key-id": access_key_id,
+            "s3.secret-access-key": secret_access_key,
+            "s3.endpoint": endpoint,
+            "s3.region": self.iceberg_s3_region,
+        }
+
+
+# Module-level settings instance, created once when this module is imported
+# and read by get_table() for the table path and S3 properties.
+data_tools_settings = DataToolsSettings()
+
+
+@functools.cache
+def get_table() -> StaticTable:
+    """Return the invoicing Iceberg table, building it on the first call only.
+
+    The table is created from ``data_tools_settings.table_path`` using the S3
+    properties from ``data_tools_settings.iceberg_s3_properties()``. Because of
+    ``functools.cache``, later calls return the same ``StaticTable`` object.
+    """
+    settings = data_tools_settings
+    location = settings.table_path
+    s3_properties = settings.iceberg_s3_properties()
+    return StaticTable.from_metadata(location, properties=s3_properties)
diff --git a/process_report/data_tools/costs.py b/process_report/data_tools/costs.py
@@ -0,0 +1,162 @@
+import functools
+from decimal import Decimal
+import logging
+
+import pandas as pd
+from pyiceberg.expressions import And, BooleanExpression, EqualTo
+
+import process_report.invoices.invoice as invoice
+from process_report.data_tools.config import get_table
+
+# Module-scoped logger; get_invoice_dataframe warns through it when a filtered
+# query returns no rows.
+logger = logging.getLogger(__name__)
+# Value types accepted for column=value equality filters.
+FilterValue = str | int | float
+
+# Columns selected by calculate_lifetime_costs: the two grouping columns plus
+# the balance column that is summed.
+_LIFETIME_COLS = [
+    invoice.PROJECT_ID_FIELD,
+    invoice.CLUSTER_NAME_FIELD,
+    invoice.BALANCE_FIELD,
+]
+
+
+def _row_filter(**filters: FilterValue) -> BooleanExpression | None:
+    """Combine column=value filters into one PyIceberg row-filter expression.
+
+    Args:
+        **filters: Column names as keys, values to match exactly. Values must
+            be str, int, or float.
+
+    Returns:
+        None when no filters are given; a single ``EqualTo`` for one filter;
+        otherwise the ``EqualTo`` clauses folded left-to-right with ``And``,
+        e.g. ``And(And(EqualTo(a, 1), EqualTo(b, 2)), EqualTo(c, 3))``.
+    """
+    clauses = [EqualTo(column, value) for column, value in filters.items()]
+    if not clauses:
+        return None
+    # reduce() returns the lone clause unchanged when there is only one.
+    return functools.reduce(And, clauses)
+
+
+@functools.cache
+def get_invoice_dataframe(
+    cols: tuple[str, ...] | None = None, **filters: FilterValue
+) -> pd.DataFrame:
+    """Load invoice data from the Iceberg table, caching the result per query.
+
+    Results are memoized by ``functools.cache`` on ``cols`` and ``filters``.
+    Consequently ``cols`` must be hashable (a tuple, not a list), and repeated
+    calls with the same arguments return the *same* DataFrame object. Treat the
+    result as read-only, or copy it before mutating, otherwise later callers
+    will observe the modification.
+
+    Args:
+        cols: Column names to select as a tuple. None (or an empty tuple)
+            selects all columns.
+        **filters: Column names as keys, values to filter by. Values must be
+            str, int, or float.
+
+    Returns:
+        DataFrame of invoice data from the table. A warning is logged when
+        filters were given and no rows matched.
+    """
+    table = get_table()
+    row_filter = _row_filter(**filters)
+    # Compare against None explicitly instead of relying on the truthiness of
+    # an expression object.
+    if row_filter is None:
+        scan = table.scan()
+    else:
+        scan = table.scan(row_filter=row_filter)
+    if cols:
+        scan = scan.select(*cols)
+    df = scan.to_pandas()
+    if filters and df.empty:
+        logger.warning("No invoice rows matched filters: %s", filters)
+    return df
+
+
+def group_and_sum(
+    df: pd.DataFrame,
+    group_by: tuple[str, ...],
+    *,
+    agg_col: str,
+    agg_name: str = "total",
+) -> pd.DataFrame:
+    """Sum one column per group and return the totals as two-place Decimals.
+
+    The input frame is not modified; missing values in ``agg_col`` count as 0.
+
+    Args:
+        df: Input dataframe.
+        group_by: Column names to group by.
+        agg_col: Column to sum.
+        agg_name: Name of the summed column in the output. Defaults to "total".
+
+    Returns:
+        DataFrame with one row per group, the ``group_by`` columns, and an
+        ``agg_name`` column holding ``Decimal`` sums quantized to 0.01.
+    """
+    cents = Decimal("0.01")
+
+    def _to_cents(total) -> Decimal:
+        # Go through str() so the Decimal reflects the printed value of the sum.
+        return Decimal(str(total)).quantize(cents)
+
+    # assign() works on a copy, leaving the caller's frame untouched.
+    prepared = df.assign(**{agg_col: df[agg_col].fillna(0)})
+    totals = prepared.groupby(list(group_by), as_index=False).agg(
+        **{agg_name: (agg_col, "sum")}
+    )
+    totals[agg_name] = totals[agg_name].map(_to_cents)
+    return totals
+
+
+def aggregate_by(
+    cols: tuple[str, ...],
+    group_by: tuple[str, ...],
+    *,
+    agg_col: str,
+    agg_name: str = "total",
+    **filters: FilterValue,
+) -> pd.DataFrame:
+    """Load invoice data and return grouped sum totals.
+
+    This helper fetches invoice rows using the provided selected columns and
+    filters, then returns a grouped sum aggregation over ``agg_col``. The
+    grouping columns and ``agg_col`` are always added to the selection, so the
+    aggregation cannot fail because a caller left one of them out of ``cols``.
+
+    Args:
+        cols: Columns to select from the invoice table before aggregation.
+        group_by: Columns to group rows by in the aggregated output.
+        agg_col: Numeric column to sum within each group.
+        agg_name: Output column name for the aggregated sum. Defaults to ``"total"``.
+        **filters: Column=value equality filters applied while loading invoice data.
+            Values must be str, int, or float.
+
+    Returns:
+        DataFrame with one row per unique ``group_by`` combination and a summed
+        ``agg_name`` column quantized to two decimal places.
+
+    Example:
+        >>> df = aggregate_by(
+        ...     cols=(invoice.BALANCE_FIELD,),
+        ...     group_by=(invoice.PROJECT_ID_FIELD, invoice.CLUSTER_NAME_FIELD),
+        ...     agg_col=invoice.BALANCE_FIELD,
+        ...     agg_name="lifetime_allocation_balance",
+        ... )
+    """
+    # dict.fromkeys de-duplicates while keeping first-seen order: caller's
+    # columns first, then any missing grouping columns, then agg_col.
+    selected = tuple(dict.fromkeys((*cols, *group_by, agg_col)))
+    df = get_invoice_dataframe(selected, **filters)
+    return group_and_sum(
+        df,
+        group_by=group_by,
+        agg_col=agg_col,
+        agg_name=agg_name,
+    )
+
+
+def calculate_lifetime_costs(**filters: FilterValue) -> pd.DataFrame:
+    """Sum the invoice balance per project and cluster.
+
+    Args:
+        **filters: Column names as keys, values to filter by. Values must be
+            str, int, or float.
+
+    Returns:
+        DataFrame with the ``invoice.PROJECT_ID_FIELD`` and
+        ``invoice.CLUSTER_NAME_FIELD`` columns plus a
+        ``lifetime_allocation_balance`` column holding the summed balance.
+
+    Example:
+        >>> filters = {invoice.PROJECT_ID_FIELD: "vllm-test"}
+        >>> df = calculate_lifetime_costs(**filters)
+    """
+    project_and_cluster = (invoice.PROJECT_ID_FIELD, invoice.CLUSTER_NAME_FIELD)
+    selected_columns = tuple(_LIFETIME_COLS)
+    return aggregate_by(
+        selected_columns,
+        project_and_cluster,
+        agg_col=invoice.BALANCE_FIELD,
+        agg_name="lifetime_allocation_balance",
+        **filters,
+    )
diff --git a/process_report/tests/unit/data_tools/test_data_tools.py b/process_report/tests/unit/data_tools/test_data_tools.py
@@ -0,0 +1,151 @@
+import pandas as pd
+import pytest
+
+from process_report.data_tools import costs
+
+# Short aliases for the invoice column-name constants used throughout the tests.
+PID = costs.invoice.PROJECT_ID_FIELD
+CLUSTER = costs.invoice.CLUSTER_NAME_FIELD
+BALANCE = costs.invoice.BALANCE_FIELD
+
+
+@pytest.fixture(autouse=True)
+def clear_dataframe_cache():
+    """Empty the get_invoice_dataframe cache before and after every test."""
+    reset_cache = costs.get_invoice_dataframe.cache_clear
+    reset_cache()
+    yield
+    reset_cache()
+
+
+@pytest.fixture
+def sample_invoice_dataframe() -> pd.DataFrame:
+    """Three invoice rows: two for one project/cluster and one with no balance."""
+    project_ids = ["vllm-test", "vllm-test", "webrca-1b021a"]
+    cluster_names = ["ocp-test", "ocp-test", "ocp-prod"]
+    balances = [1.234, 2.345, None]
+    return pd.DataFrame({PID: project_ids, CLUSTER: cluster_names, BALANCE: balances})
+
+
+def test_row_filter_empty_returns_none():
+    """With no filters there is nothing to express, so the helper yields None."""
+    no_filter = costs._row_filter()
+    assert no_filter is None
+
+
+@pytest.mark.parametrize(
+    "filters",
+    [
+        {PID: "vllm-test", CLUSTER: "ocp-test"},
+        {PID: "vllm-test", CLUSTER: "ocp-prod"},
+    ],
+)
+def test_row_filter_builds_combined_and_expression(filters: dict[str, str]):
+    """Two filters produce an And whose operands are both EqualTo clauses."""
+    combined = costs._row_filter(**filters)
+    assert isinstance(combined, costs.And)
+    for operand in (combined.left, combined.right):
+        assert isinstance(operand, costs.EqualTo)
+
+
+def test_aggregate_by_rounds_and_forwards_filters(
+    monkeypatch: pytest.MonkeyPatch, sample_invoice_dataframe: pd.DataFrame
+):
+    """aggregate_by hands columns and filters to the loader and rounds the sums."""
+    loader_calls: list[tuple[object, dict[str, object]]] = []
+
+    def _recording_loader(cols=None, **filters):
+        loader_calls.append((cols, filters))
+        return sample_invoice_dataframe
+
+    monkeypatch.setattr(costs, "get_invoice_dataframe", _recording_loader)
+    project_filter = {PID: "vllm-test"}
+
+    result = costs.aggregate_by(
+        (BALANCE,),
+        (PID, CLUSTER),
+        agg_col=BALANCE,
+        agg_name="lifetime_allocation_balance",
+        **project_filter,
+    )
+
+    requested_cols, forwarded_filters = loader_calls[-1]
+    assert forwarded_filters == {PID: "vllm-test"}
+    assert requested_cols == (BALANCE, PID, CLUSTER)
+
+    totals = sorted(result["lifetime_allocation_balance"].tolist())
+    assert totals == [costs.Decimal("0.00"), costs.Decimal("3.58")]
+    for total in totals:
+        assert total.as_tuple().exponent == -2
+
+
+def test_group_and_sum_is_pure_transform(sample_invoice_dataframe: pd.DataFrame):
+    """group_and_sum returns rounded totals and leaves its input frame untouched."""
+    # Snapshot the input so the purity promised by the test name is verified.
+    original = sample_invoice_dataframe.copy(deep=True)
+
+    result = costs.group_and_sum(
+        sample_invoice_dataframe,
+        (PID, CLUSTER),
+        agg_col=BALANCE,
+        agg_name="lifetime_allocation_balance",
+    )
+
+    values = sorted(result["lifetime_allocation_balance"].tolist())
+    assert values == [costs.Decimal("0.00"), costs.Decimal("3.58")]
+    assert all(v.as_tuple().exponent == -2 for v in values)
+    # The missing balance must still be missing in the caller's frame.
+    pd.testing.assert_frame_equal(sample_invoice_dataframe, original)
+
+
+@pytest.mark.parametrize(
+    "invalid_filters",
+    [
+        {PID: "does-not-exist"},
+        {CLUSTER: "not-a-real-cluster"},
+        {PID: "does-not-exist", CLUSTER: "not-a-real-cluster"},
+    ],
+)
+def test_calculate_lifetime_costs_invalid_queries_return_empty(
+    monkeypatch: pytest.MonkeyPatch, invalid_filters: dict[str, str]
+):
+    """Filters matching no rows give an empty frame that keeps the output columns."""
+    no_rows = pd.DataFrame(columns=[PID, CLUSTER, BALANCE])
+
+    def _empty_loader(cols=None, **f):
+        return no_rows
+
+    monkeypatch.setattr(costs, "get_invoice_dataframe", _empty_loader)
+
+    result = costs.calculate_lifetime_costs(**invalid_filters)
+
+    expected_columns = [PID, CLUSTER, "lifetime_allocation_balance"]
+    assert result.empty
+    assert result.columns.tolist() == expected_columns
+
+
+class _FakeIcebergTable:
+    """Stand-in table supporting the .scan().select().to_pandas() call chain.
+
+    ``scan`` and ``select`` ignore their arguments and return the fake itself;
+    ``to_pandas`` hands back the frame given at construction.
+    """
+
+    def __init__(self, df: pd.DataFrame):
+        self._frame = df
+
+    def scan(self, row_filter=None):
+        # No filtering is applied; the same object acts as the scan.
+        return self
+
+    def select(self, *_columns):
+        # Column selection is a no-op for the fake.
+        return self
+
+    def to_pandas(self):
+        return self._frame
+
+
+def test_get_invoice_dataframe_warns_when_no_rows_match(
+    monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture
+):
+    """An empty result for a filtered query is reported with a warning."""
+    empty_table = _FakeIcebergTable(pd.DataFrame(columns=[PID, BALANCE]))
+    monkeypatch.setattr(costs, "get_table", lambda: empty_table)
+    unmatched = {PID: "does-not-exist"}
+
+    with caplog.at_level("WARNING", logger=costs.__name__):
+        result = costs.get_invoice_dataframe((PID, BALANCE), **unmatched)
+
+    assert "No invoice rows matched filters" in caplog.text
+    assert result.empty
+
+
+def test_get_invoice_dataframe_caches_repeated_query(monkeypatch: pytest.MonkeyPatch):
+    """A repeated identical query loads the table once and returns the same frame."""
+    fake_table = _FakeIcebergTable(pd.DataFrame({PID: ["vllm-test"], BALANCE: [1.0]}))
+    table_loads: list[int] = []
+
+    def _counting_get_table():
+        table_loads.append(1)
+        return fake_table
+
+    monkeypatch.setattr(costs, "get_table", _counting_get_table)
+    query_filters = {PID: "vllm-test"}
+
+    first = costs.get_invoice_dataframe((PID, BALANCE), **query_filters)
+    second = costs.get_invoice_dataframe((PID, BALANCE), **query_filters)
+
+    assert len(table_loads) == 1
+    assert first is second
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,27 @@
+[project]
+name = "nerc-invoicing"
+version = "0.1.0"
+description = "NERC invoicing report processing and Iceberg invoice data tools"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "nerc-rates>=1.0.1,<2.0.0",
+    "pandas>=3.0.0",
+    "pyarrow",
+    "pyiceberg[pyarrow]>=0.11.0",
+    "boto3>=1.42.6,<2.0",
+    "jinja2",
+    "validators",
+    "python-dateutil",
+    "pydantic-settings",
+    "pyyaml>=6.0",
+]
+
+# pre-commit is a contributor tool rather than a runtime requirement, so it is
+# kept in a development dependency group instead of the published dependencies.
+[dependency-groups]
+dev = [
+    "pre-commit>=4.5.1",
+]
+
+[build-system]
+requires = ["uv_build>=0.10.4,<0.11.0"]
+build-backend = "uv_build"
+
+[tool.uv.build-backend]
+module-name = "process_report"
+module-root = ""
diff --git a/requirements.txt b/requirements.txt