Enable exporting monthly invoice to Iceberg

QuanMPhm · QuanMPhm · commit bfe533847942 · 2026-06-04T12:37:03.000-04:00
Added a new invoice class, `IcebergInvoice`, to export invoice data to Iceberg tables.
Before appending data, the export process runs a schema update step so that
the Iceberg table schema can evolve to match the exported data.

Added a new Iceberg integration test to validate Iceberg functionality.
Updated the E2E test to include Iceberg exporting.
Both tests use a temporary SQLite catalog.
diff --git a/process_report/invoices/iceberg_invoice.py b/process_report/invoices/iceberg_invoice.py
@@ -0,0 +1,82 @@
+import logging
+from dataclasses import dataclass, field
+
+from pyiceberg.table import Table
+from pyiceberg.catalog import Catalog, load_catalog
+import pyarrow
+
+import process_report.invoices.invoice as invoice
+from process_report.loader import loader
+from process_report.settings import invoice_settings
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def get_iceberg_catalog(config: dict, catalog_name: str) -> Catalog:
+    """Load the catalog `catalog_name`, forwarding each `config` entry as a keyword argument."""
+    catalog = load_catalog(name=catalog_name, **config)
+    return catalog
+
+
+def get_iceberg_table(catalog: Catalog, table_path) -> Table:
+    """Return the table that `catalog` loads for `table_path`."""
+    table = catalog.load_table(table_path)
+    return table
+
+
+@dataclass
+class IcebergInvoice(invoice.Invoice):
+    """Invoice that appends its data to an Iceberg table.
+
+    The catalog name, catalog config and table path default to the values
+    provided by `invoice_settings` and `loader`, and can be overridden per
+    instance through the dataclass constructor.
+    """
+
+    # Column names this invoice exports (presumably consumed by the base
+    # class's _filter_columns() -- confirm in invoice.Invoice). Unannotated,
+    # so the dataclass treats it as a class attribute, not an init field.
+    export_columns_list = [
+        invoice.INVOICE_DATE_FIELD,
+        invoice.PROJECT_FIELD,
+        invoice.PROJECT_ID_FIELD,
+        invoice.PI_FIELD,
+        invoice.CLUSTER_NAME_FIELD,
+        invoice.INVOICE_EMAIL_FIELD,
+        invoice.INVOICE_ADDRESS_FIELD,
+        invoice.INSTITUTION_FIELD,
+        invoice.INSTITUTION_ID_FIELD,
+        invoice.IS_BILLABLE_FIELD,
+        invoice.SU_HOURS_FIELD,
+        invoice.SU_TYPE_FIELD,
+        invoice.RATE_FIELD,
+        invoice.GROUP_NAME_FIELD,
+        invoice.GROUP_INSTITUTION_FIELD,
+        invoice.GROUP_BALANCE_FIELD,
+        invoice.COST_FIELD,
+        invoice.GROUP_BALANCE_USED_FIELD,
+        invoice.CREDIT_FIELD,
+        invoice.CREDIT_CODE_FIELD,
+        invoice.BALANCE_FIELD,
+    ]
+
+    # Name passed to load_catalog() when opening the catalog.
+    iceberg_catalog_name: str = invoice_settings.iceberg_catalog_name
+    # Keyword arguments for load_catalog(); read lazily from the YAML config
+    # file only when an instance is built without an explicit config.
+    iceberg_catalog_config: dict = field(
+        default_factory=lambda: loader.get_iceberg_config()
+    )
+    # Identifier handed to Catalog.load_table().
+    iceberg_table_path: str = invoice_settings.iceberg_table_path
+
+    def _prepare(self):
+        """Open the catalog, load the target table and stage all data for export."""
+        iceberg_catalog = get_iceberg_catalog(
+            self.iceberg_catalog_config, self.iceberg_catalog_name
+        )
+        self.iceberg_table = get_iceberg_table(iceberg_catalog, self.iceberg_table_path)
+        self.export_data = self.data
+
+    def export(self):
+        """Evolve the Iceberg table schema if possible, then append the invoice rows.
+
+        Overrides the base invoice export behavior.
+        """
+        self._filter_columns()
+
+        # Convert once and reuse the result for both the schema update and the
+        # append. Converting outside the `try` also keeps pandas -> Arrow
+        # conversion errors that subclass ValueError (e.g. pyarrow's
+        # ArrowInvalid) from being logged as schema-union warnings.
+        arrow_table = pyarrow.Table.from_pandas(self.export_data)
+
+        # Update table schema, only allows "possible" migrations (i.e raises on str -> Decimal)
+        # TODO (Quan) When we implement typing validation for dataframes, change this to raise errors
+        with self.iceberg_table.update_schema() as update_schema:
+            try:
+                update_schema.union_by_name(arrow_table.schema)
+            except ValueError as e:
+                # Lazy %-style logging; the emitted message text is unchanged.
+                logger.warning(
+                    "Dataframe contains columns not convertable to PyIceberg: %s", e
+                )
+
+        self.iceberg_table.append(arrow_table)
+
+    def export_s3(self, s3_bucket):
+        """Do nothing: this invoice uploads nothing to `s3_bucket`."""
+        return
diff --git a/process_report/loader.py b/process_report/loader.py
@@ -61,6 +61,12 @@ def get_remote_filepath(self, remote_filepath: str) -> str:
             return util.fetch_s3(remote_filepath)
         return remote_filepath
 
+    @functools.lru_cache
+    def get_iceberg_config(self) -> dict:
+        """Parse the YAML file at `invoice_settings.iceberg_config_path`.
+
+        The result is memoized by `functools.lru_cache`, so repeat calls on
+        the same loader return the same object without re-reading the file.
+        """
+        config_path = invoice_settings.iceberg_config_path
+        with open(config_path, "r") as config_file:
+            config = yaml.safe_load(config_file)
+        return config
+
     @functools.lru_cache
     def get_new_pi_credit_amount(self) -> Decimal:
         return invoice_settings.new_pi_credit_amount or get_rates_info().get_value_at(
diff --git a/process_report/process_report.py b/process_report/process_report.py
@@ -18,6 +18,7 @@
     MOCA_prepaid_invoice,
     prepay_credits_snapshot,
     ocp_test_invoice,
+    iceberg_invoice,
 )
 from process_report.processors import (
     coldfront_fetch_processor,
@@ -97,6 +98,7 @@ def main():
             MOCA_prepaid_invoice.MOCAPrepaidInvoice,
             prepay_credits_snapshot.PrepayCreditsSnapshot,
             ocp_test_invoice.OcpTestInvoice,
+            iceberg_invoice.IcebergInvoice,
         ],
         invoice_settings.upload_to_s3,
     )
diff --git a/process_report/settings.py b/process_report/settings.py
@@ -11,6 +11,11 @@ class Settings(BaseSettings):
     keycloak_client_id: str | None = None
     keycloak_client_secret: str | None = None
 
+    # Iceberg config
+    iceberg_catalog_name: str | None = None
+    iceberg_config_path: str | None = None
+    iceberg_table_path: str | None = None
+
     invoice_path_template: str = "Invoices/{invoice_month}/Service Invoices/"
     invoice_month: str = (datetime.datetime.today() - relativedelta(months=1)).strftime(
         "%Y-%m"
diff --git a/process_report/tests/base.py b/process_report/tests/base.py
@@ -55,8 +55,10 @@ def create_test_invoice(self, data_dict: dict):
 
 
 class BaseTestCaseWithTempDir(BaseTestCase):
-    def setUp(self):
-        self.tempdir = Path(tempfile.TemporaryDirectory(delete=False).name)
+    @classmethod
+    def setUpClass(cls):
+        cls.tempdir = Path(tempfile.TemporaryDirectory(delete=False).name)
 
-    def tearDown(self):
-        shutil.rmtree(self.tempdir)
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.tempdir)
diff --git a/process_report/tests/e2e/test_e2e_pipeline.py b/process_report/tests/e2e/test_e2e_pipeline.py
@@ -5,6 +5,9 @@
 import logging
 import subprocess
 from typing import Dict, List
+import yaml
+
+from pyiceberg import schema, catalog
 
 logger = logging.getLogger(__name__)
 
@@ -131,6 +134,28 @@ def _prepare_pipeline_execution(
     env.setdefault("CHROME_BIN_PATH", "/usr/bin/chromium")
     env["PYTHONPATH"] = str(project_root) + ":" + env.get("PYTHONPATH", "")
 
+    # Iceberg settings, init test namespace and table
+    env["iceberg_catalog_name"] = "test_catalog"
+    env["iceberg_config_path"] = workspace / "test_iceberg_config.yaml"
+    env["iceberg_table_path"] = "test_namespace.test_table"
+
+    catalog_config = {
+        "type": "sql",
+        "warehouse": f"file://{workspace}",
+        "uri": f"sqlite:///{workspace / 'test_iceberg_catalog.db'}",
+    }
+
+    with open(workspace / "test_iceberg_config.yaml", "w") as f:
+        yaml.dump(catalog_config, f)
+
+    test_catalog = catalog.load_catalog(name="test_catalog", **catalog_config)
+    test_schema = schema.Schema(
+        schema.NestedField(1, "Invoice Month", schema.StringType()),
+    )
+
+    test_catalog.create_namespace_if_not_exists("test_namespace")
+    test_catalog.create_table_if_not_exists("test_namespace.test_table", test_schema)
+
     return command, env
 
 
diff --git a/process_report/tests/integration/test_iceberg.py b/process_report/tests/integration/test_iceberg.py
@@ -0,0 +1,145 @@
+import pandas
+import pyarrow
+from pyiceberg import schema, catalog
+
+from process_report.invoices.iceberg_invoice import IcebergInvoice
+from process_report.tests.base import BaseTestCaseWithTempDir
+
+
+class TestIceberg(BaseTestCaseWithTempDir):
+    # Integration tests that run IcebergInvoice against a PyIceberg SQL
+    # catalog backed by a SQLite file in the class-level temp directory.
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        # Create a SQLite-backed SQL catalog stored under the class temp dir
+        cls.catalog_name = "catalog_foo"
+        cls.table_path = "namespace_foo.table_foo"
+
+        config_dict = {
+            "type": "sql",
+            "warehouse": str(cls.tempdir),
+            "uri": f"sqlite:///{str(cls.tempdir)}/foo.db",
+        }
+        cls.catalog_config = config_dict
+
+        # Initialize test schema that's used in setUp()
+        cls.catalog = catalog.load_catalog(name=cls.catalog_name, **config_dict)
+        cls.test_schema = schema.Schema(
+            schema.NestedField(1, "Invoice Month", schema.StringType()),
+            schema.NestedField(2, "Cost", schema.DecimalType(21, 2)),
+            schema.NestedField(3, "PI", schema.StringType()),
+        )
+
+    def setUp(self):
+        # Each test starts from a freshly created table with the base schema
+        self.catalog.create_namespace_if_not_exists("namespace_foo")
+        self.catalog.create_table_if_not_exists(self.table_path, self.test_schema)
+
+    def tearDown(self):
+        # Drop the table so rows and schema changes do not leak between tests
+        self.catalog.drop_table(self.table_path)
+
+    def test_upload_one_dataframe(self):
+        # Create test dataframe matching table schema
+        test_df = pandas.DataFrame(
+            {
+                "Invoice Month": ["2024-01", "2024-01"],
+                "Cost": [100.0, 200.0],
+                "PI": ["PI1", "PI2"],
+            },
+        ).astype({"Cost": pandas.ArrowDtype(pyarrow.decimal128(21, 2))})
+
+        # Create IcebergInvoice instance
+        inv = IcebergInvoice(
+            invoice_month="2024-01",
+            data=test_df,
+            iceberg_catalog_name=self.catalog_name,
+            iceberg_catalog_config=self.catalog_config,
+            iceberg_table_path=self.table_path,
+        )
+        inv.process()
+        inv.export()
+
+        # Verify data was uploaded, and that the Iceberg cost column can be cast to Decimal
+        table = self.catalog.load_table(self.table_path)
+        uploaded_df = table.scan().to_pandas().astype(test_df.dtypes)
+        assert uploaded_df.equals(test_df)
+
+    def test_upload_new_column(self):
+        # Create test dataframe with an extra column
+        test_df = pandas.DataFrame(
+            {
+                "Invoice Month": ["2024-02", "2024-02"],
+                "Cost": [150.0, 250.0],
+                "PI": ["PI3", "PI4"],
+                "extra_column": ["extra1", "extra2"],  # New column
+            }
+        ).astype({"Cost": pandas.ArrowDtype(pyarrow.decimal128(21, 2))})
+
+        # Create IcebergInvoice instance
+        inv = IcebergInvoice(
+            invoice_month="2024-02",
+            data=test_df,
+            iceberg_catalog_name=self.catalog_name,
+            iceberg_catalog_config=self.catalog_config,
+            iceberg_table_path=self.table_path,
+        )
+        inv.process()
+        inv.export()
+
+        # Verify data was uploaded with new column (schema evolution)
+        table = self.catalog.load_table(self.table_path)
+        uploaded_df = table.scan().to_pandas().astype(test_df.dtypes)
+        assert uploaded_df.equals(test_df)
+
+    def test_schema_evolution_with_existing_data(self):
+        # First, upload initial data without extra column
+        first_df = pandas.DataFrame(
+            {
+                "Invoice Month": ["2024-01", "2024-01"],
+                "Cost": [100.0, 200.0],
+                "PI": ["PI1", "PI2"],
+            }
+        ).astype({"Cost": pandas.ArrowDtype(pyarrow.decimal128(21, 2))})
+
+        inv = IcebergInvoice(
+            invoice_month="2024-01",
+            data=first_df,
+            iceberg_catalog_name=self.catalog_name,
+            iceberg_catalog_config=self.catalog_config,
+            iceberg_table_path=self.table_path,
+        )
+        inv.process()
+        inv.export()
+
+        # Now upload data with an extra column
+        second_df = pandas.DataFrame(
+            {
+                "Invoice Month": ["2024-02", "2024-02"],
+                "Cost": [150.0, 250.0],
+                "PI": ["PI3", "PI4"],
+                "extra_column": ["new1", "new2"],  # New column
+            }
+        ).astype({"Cost": pandas.ArrowDtype(pyarrow.decimal128(21, 2))})
+
+        inv2 = IcebergInvoice(
+            invoice_month="2024-02",
+            data=second_df,
+            iceberg_catalog_name=self.catalog_name,
+            iceberg_catalog_config=self.catalog_config,
+            iceberg_table_path=self.table_path,
+        )
+        inv2.process()
+        inv2.export()
+
+        table = self.catalog.load_table(self.table_path)
+        result_df = table.scan().to_pandas().astype(second_df.dtypes)
+
+        # Verify the table has schema evolved with the new column
+        # Old rows should have None for the new column
+        # NOTE(review): expected_df lists the second append before the first,
+        # so this assertion depends on the row order scan() returns -- confirm
+        # that order is guaranteed, or sort both frames before comparing.
+        expected_df = pandas.DataFrame(
+            {
+                "Invoice Month": ["2024-02", "2024-02", "2024-01", "2024-01"],
+                "Cost": [150.0, 250.0, 100.0, 200.0],
+                "PI": ["PI3", "PI4", "PI1", "PI2"],
+                "extra_column": ["new1", "new2", None, None],
+            }
+        ).astype({"Cost": pandas.ArrowDtype(pyarrow.decimal128(21, 2))})
+        assert result_df.equals(expected_df)
diff --git a/process_report/tests/test-requirements.txt b/process_report/tests/test-requirements.txt
@@ -1,2 +1,4 @@
 pytest
+pytest-env
 coverage
+pyiceberg[sql-sqlite]
diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,4 @@ Jinja2
 validators
 python-dateutil
 pydantic-settings
+pyiceberg[pyarrow]