Closes #266

jimmysway · jimmysway · commit 73b5ca0fc058 · 2026-02-17T15:18:02.000-05:00
Added nerc_invoicing repo for querying invoicing Iceberg table

Added functionality to get lifetime costs grouped by project

Added pyproject.toml for future publishing

Added testing for new functionality
diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md
@@ -12,7 +12,7 @@ assignees: ''
 ## Completion Criteria
 
 ## Description
-- [ ] 
+- [ ]
 
 ## Completion dates
 Desired - YYYY-MM-DD
diff --git a/data_tools/__init__.py b/data_tools/__init__.py
@@ -0,0 +1,3 @@
+from .costs import calculate_lifetime_costs
+
+__all__ = ["calculate_lifetime_costs"]
diff --git a/data_tools/config.py b/data_tools/config.py
@@ -0,0 +1,21 @@
+import os
+from pyiceberg.table import StaticTable
+
+# Root of the Iceberg warehouse inside the invoicing bucket.
+WAREHOUSE_BASE = "s3://nerc-invoicing-iceberg/warehouse"
+# Location handed to StaticTable.from_metadata() by _get_table().
+TABLE_PATH = f"{WAREHOUSE_BASE}/nerc_invoicing_iceberg/nerc_invoicing_iceberg"
+
+
+def _b2_properties() -> dict[str, str]:
+    """Build the S3 connection properties used to open the table on B2.
+
+    Returns:
+        Mapping with the access key id, secret access key, HTTPS endpoint
+        and region.
+
+    Raises:
+        KeyError: If B2_APPLICATION_KEY_ID, B2_APPLICATION_KEY or
+            B2_S3_ENDPOINT is not set in the environment.
+    """
+    env = os.environ
+    properties: dict[str, str] = {}
+    properties["s3.access-key-id"] = env["B2_APPLICATION_KEY_ID"]
+    properties["s3.secret-access-key"] = env["B2_APPLICATION_KEY"]
+    # The scheme is prepended here, so B2_S3_ENDPOINT is presumably a bare
+    # host name -- confirm against the deployment configuration.
+    properties["s3.endpoint"] = "https://" + env["B2_S3_ENDPOINT"]
+    properties["s3.region"] = "us-east-005"
+    return properties
+
+
+def _get_table() -> StaticTable:
+    """Open the invoicing Iceberg table at TABLE_PATH as a StaticTable.
+
+    The S3 properties come from _b2_properties(), so the B2 environment
+    variables must be set before this is called.
+    """
+    s3_properties = _b2_properties()
+    table = StaticTable.from_metadata(TABLE_PATH, properties=s3_properties)
+    return table
diff --git a/data_tools/costs.py b/data_tools/costs.py
@@ -0,0 +1,118 @@
+from decimal import Decimal
+import logging
+
+import pandas as pd
+from pyiceberg.expressions import And, BooleanExpression, EqualTo
+
+import process_report.invoices.invoice as invoice
+
+from .config import _get_table
+
+logger = logging.getLogger(__name__)
+
+# Columns loaded for the aggregation done by calculate_lifetime_costs():
+# the two grouping keys plus the balance column that gets summed.
+_LIFETIME_COLS = [
+    invoice.PROJECT_ID_FIELD,
+    invoice.CLUSTER_NAME_FIELD,
+    invoice.BALANCE_FIELD,
+]
+
+
+def _row_filter(**filters: str | int | float) -> BooleanExpression | None:
+    """Translate column=value keyword filters into one PyIceberg expression.
+
+    Args:
+        **filters: Column names mapped to the value each column must equal.
+            Values must be str, int, or float.
+
+    Returns:
+        None when no filters are given; a single EqualTo for one filter;
+        otherwise the EqualTo clauses chained left to right with And, in
+        keyword order.
+    """
+    clauses = [EqualTo(column, value) for column, value in filters.items()]
+    if not clauses:
+        return None
+    combined: BooleanExpression = clauses[0]
+    for clause in clauses[1:]:
+        combined = And(combined, clause)
+    return combined
+
+
+def get_invoice_dataframe(
+    cols: list[str] | None = None, **filters: str | int | float
+) -> pd.DataFrame:
+    """Load invoice data from the Iceberg table.
+
+    Args:
+        cols: Column names to select. None (or an empty list) selects all
+            columns.
+        **filters: Column names as keys, values to filter by. Values must be
+            str, int, or float. Rows must match every filter.
+
+    Returns:
+        DataFrame of invoice data from the table. When filters were given and
+        no row matched, a warning is logged and the empty frame is returned.
+    """
+    table = _get_table()
+    row_filter = _row_filter(**filters)
+    # _row_filter() signals "no filter" with None, so test for None explicitly
+    # instead of relying on the truthiness of a PyIceberg expression object.
+    # scan() is called without the argument in that case so that its own
+    # default row filter applies.
+    if row_filter is None:
+        scan = table.scan()
+    else:
+        scan = table.scan(row_filter=row_filter)
+    if cols:
+        scan = scan.select(*cols)
+    df = scan.to_pandas()
+    if filters and df.empty:
+        logger.warning("No invoice rows matched filters: %s", filters)
+    return df
+
+
+def select_and_group(
+    cols: list[str],
+    group_by: list[str],
+    *,
+    agg_col: str,
+    agg_name: str = "total",
+    **filters: str | int | float,
+) -> pd.DataFrame:
+    """Load invoice data, group by the given columns, and sum one column.
+
+    Args:
+        cols: Column names to load from the table. The group_by columns and
+            agg_col are loaded as well, even when they are not listed here.
+        group_by: Column names to group by.
+        agg_col: Column to sum. Missing values count as 0.
+        agg_name: Name for the aggregated column in the output. Defaults to "total".
+        **filters: Column names as keys, values to filter by. Values must be str, int, or float.
+
+    Returns:
+        DataFrame with one row per group and an agg_name column holding the
+        sum of agg_col as a Decimal quantized to two decimal places (rounded
+        according to the active decimal context).
+    """
+    # Everything the aggregation touches has to be loaded: the requested
+    # columns, the grouping keys, and the column being summed. Without the
+    # last one, df[agg_col] raised KeyError whenever the caller left agg_col
+    # out of cols.
+    all_cols = list(cols)
+    for required in (*group_by, agg_col):
+        if required not in all_cols:
+            all_cols.append(required)
+    loaded = get_invoice_dataframe(all_cols, **filters)
+    # assign() returns a new frame, so the DataFrame handed back by the
+    # loader is not modified in place.
+    filled = loaded.assign(**{agg_col: loaded[agg_col].fillna(0)})
+    agg_spec = {agg_name: (agg_col, "sum")}
+    grouped_df = filled.groupby(list(group_by), as_index=False).agg(**agg_spec)
+    # Go through str() so the Decimal is built from the value's printed form
+    # rather than from the exact binary expansion of a float.
+    grouped_df[agg_name] = grouped_df[agg_name].map(
+        lambda v: Decimal(str(v)).quantize(Decimal("0.01"))
+    )
+    return grouped_df
+
+
+def calculate_lifetime_costs(**filters: str | int | float) -> pd.DataFrame:
+    """Sum the invoice balance for every (project, cluster) pair.
+
+    Args:
+        **filters: Column names as keys, values to filter by. Values must be
+            str, int, or float.
+
+    Returns:
+        DataFrame with the columns named by invoice.PROJECT_ID_FIELD and
+        invoice.CLUSTER_NAME_FIELD, plus lifetime_allocation_balance.
+    """
+    group_keys = [invoice.PROJECT_ID_FIELD, invoice.CLUSTER_NAME_FIELD]
+    return select_and_group(
+        _LIFETIME_COLS,
+        group_keys,
+        agg_col=invoice.BALANCE_FIELD,
+        agg_name="lifetime_allocation_balance",
+        **filters,
+    )
+
+
+if __name__ == "__main__":
+    # Ad-hoc entry point: print the lifetime costs with no filters applied.
+    lifetime_costs = calculate_lifetime_costs()
+    print(lifetime_costs)
diff --git a/process_report/tests/unit/data_tools/test_data_tools.py b/process_report/tests/unit/data_tools/test_data_tools.py
@@ -0,0 +1,114 @@
+import pandas as pd
+import pytest
+
+from data_tools import costs
+
+# Short aliases for the invoice column-name constants, reached through the
+# `invoice` module that data_tools.costs imports.
+PID = costs.invoice.PROJECT_ID_FIELD
+CLUSTER = costs.invoice.CLUSTER_NAME_FIELD
+BALANCE = costs.invoice.BALANCE_FIELD
+
+
+@pytest.fixture
+def sample_invoice_dataframe() -> pd.DataFrame:
+    """Three invoice rows: two for one project/cluster pair, one with no balance."""
+    project_ids = ["vllm-test", "vllm-test", "webrca-1b021a"]
+    clusters = ["ocp-test", "ocp-test", "ocp-prod"]
+    # The missing balance exercises the fillna(0) step in select_and_group().
+    balances = [1.234, 2.345, None]
+    return pd.DataFrame({PID: project_ids, CLUSTER: clusters, BALANCE: balances})
+
+
+def test_row_filter_empty_returns_none():
+    """With no filters there is no expression to build."""
+    expression = costs._row_filter()
+    assert expression is None
+
+
+@pytest.mark.parametrize(
+    "filters",
+    [{PID: "vllm-test", CLUSTER: cluster} for cluster in ("ocp-test", "ocp-prod")],
+)
+def test_row_filter_builds_combined_and_expression(filters: dict[str, str]):
+    """Two filters yield an And whose operands are both EqualTo clauses."""
+    combined = costs._row_filter(**filters)
+    assert isinstance(combined, costs.And)
+    for operand in (combined.left, combined.right):
+        assert isinstance(operand, costs.EqualTo)
+
+
+def test_select_and_group_rounds_and_forwards_filters(
+    monkeypatch: pytest.MonkeyPatch, sample_invoice_dataframe: pd.DataFrame
+):
+    """The loader receives the columns and filters; totals come back as 2-dp Decimals."""
+    loader_calls: list[tuple[object, dict[str, object]]] = []
+
+    def _recording_loader(cols=None, **filters):
+        loader_calls.append((cols, filters))
+        return sample_invoice_dataframe
+
+    monkeypatch.setattr(costs, "get_invoice_dataframe", _recording_loader)
+
+    pid_filter = {PID: "vllm-test"}
+    result = costs.select_and_group(
+        [BALANCE],
+        [PID, CLUSTER],
+        agg_col=BALANCE,
+        agg_name="lifetime_allocation_balance",
+        **pid_filter,
+    )
+
+    requested_cols, forwarded_filters = loader_calls[-1]
+    assert forwarded_filters == {PID: "vllm-test"}
+    assert requested_cols == [BALANCE, PID, CLUSTER]
+
+    totals = sorted(result["lifetime_allocation_balance"].tolist())
+    assert totals == [costs.Decimal("0.00"), costs.Decimal("3.58")]
+    # An exponent of -2 means exactly two digits after the decimal point.
+    for total in totals:
+        assert total.as_tuple().exponent == -2
+
+
+@pytest.mark.parametrize(
+    "invalid_filters",
+    [
+        {PID: "does-not-exist"},
+        {CLUSTER: "not-a-real-cluster"},
+        {PID: "does-not-exist", CLUSTER: "not-a-real-cluster"},
+    ],
+)
+def test_calculate_lifetime_costs_invalid_queries_return_empty(
+    monkeypatch: pytest.MonkeyPatch, invalid_filters: dict[str, str]
+):
+    """Filters that match nothing give an empty frame that still has the output columns."""
+
+    def _empty_loader(cols=None, **filters):
+        return pd.DataFrame(columns=[PID, CLUSTER, BALANCE])
+
+    monkeypatch.setattr(costs, "get_invoice_dataframe", _empty_loader)
+
+    result = costs.calculate_lifetime_costs(**invalid_filters)
+
+    assert result.empty
+    expected_columns = [PID, CLUSTER, "lifetime_allocation_balance"]
+    assert list(result.columns) == expected_columns
+
+
+class _FakeIcebergTable:
+    """Table stand-in supporting the .scan().select().to_pandas() call chain.
+
+    scan() and select() ignore their arguments and return the fake itself;
+    to_pandas() returns the DataFrame given at construction.
+    """
+
+    def __init__(self, df: pd.DataFrame):
+        self._df = df
+
+    def scan(self, row_filter=None) -> "_FakeIcebergTable":
+        # The filter is accepted but never applied.
+        return self
+
+    def select(self, *_columns) -> "_FakeIcebergTable":
+        # Column selection is accepted but never applied.
+        return self
+
+    def to_pandas(self) -> pd.DataFrame:
+        return self._df
+
+
+def test_get_invoice_dataframe_warns_when_no_rows_match(
+    monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture
+):
+    """An empty filtered result is returned as-is and a warning is logged."""
+    empty_frame = pd.DataFrame(columns=[PID, BALANCE])
+    fake_table = _FakeIcebergTable(empty_frame)
+    monkeypatch.setattr(costs, "_get_table", lambda: fake_table)
+    unmatched_filter = {PID: "does-not-exist"}
+
+    with caplog.at_level("WARNING", logger="data_tools.costs"):
+        result = costs.get_invoice_dataframe([PID, BALANCE], **unmatched_filter)
+
+    assert result.empty
+    assert "No invoice rows matched filters" in caplog.text
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,31 @@
+[project]
+name = "nerc-invoicing"
+version = "0.1.0"
+description = "NERC invoice processing and tools for querying the invoicing Iceberg table"
+readme = "README.md"
+requires-python = ">=3.12"
+# Runtime dependencies only; development tooling lives in [dependency-groups].
+dependencies = [
+    "nerc-rates>=1.0.1,<2.0.0",
+    "pandas>=3.0.0",
+    "pyarrow",
+    "pyiceberg[pyarrow]>=0.11.0",
+    "boto3>=1.42.6,<2.0",
+    "jinja2",
+    "validators",
+    "python-dateutil",
+    "pydantic-settings",
+    "pyyaml>=6.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["data_tools", "process_report"]
+
+[dependency-groups]
+# Development-only tooling, kept out of the installed package's requirements.
+dev = [
+    "pre-commit>=4.5.1",
+]
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,7 @@
 nerc-rates>=1.0.1,<2.0.0
 pandas
 pyarrow
+pyiceberg[pyarrow]>=0.11.0
 boto3>=1.42.6,<2.0
 Jinja2
 validators

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .costs import calculate_lifetime_costs`
	`2`	`+`
	`3`	`+__all__ = ["calculate_lifetime_costs"]`