Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion process_report/process_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def validate_required_env_vars(required_env_vars):
def main():
"""Remove non-billable PIs and projects"""
required_env_vars = []
if not invoice_settings.coldfront_api_filepath:
if not invoice_settings.coldfront_api_filepaths:
required_env_vars.extend(["KEYCLOAK_CLIENT_ID", "KEYCLOAK_CLIENT_SECRET"])
validate_required_env_vars(required_env_vars)

Expand Down
31 changes: 21 additions & 10 deletions process_report/processors/coldfront_fetch_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sys
import functools
import logging
import json
import yaml
from dataclasses import dataclass, field

import requests
Expand Down Expand Up @@ -32,7 +32,7 @@ class ColdfrontFetchProcessor(processor.Processor):
nonbillable_projects: pandas.DataFrame = field(
default_factory=loader.get_nonbillable_projects
)
coldfront_data_filepath: str = invoice_settings.coldfront_api_filepath
coldfront_data_filepaths: tuple[str] = invoice_settings.coldfront_api_filepaths

initializes_columns = (invoice.IS_COURSE_COLUMN,)
operates_on_columns = (
Expand Down Expand Up @@ -94,18 +94,22 @@ def _fetch_coldfront_allocation_api(self):
return r.json()

def _get_coldfront_api_data(self):
if self.coldfront_data_filepath:
logger.info(
f"Using Coldfront data from {self.coldfront_data_filepath} instead of API"
)
with open(self.coldfront_data_filepath, "r") as f:
return json.load(f)
else:
return self._fetch_coldfront_allocation_api()
api_data = []
for api_data_file in self.coldfront_data_filepaths:
logger.info(f"Using Coldfront data from {api_data_file}")
with open(api_data_file) as f:
api_data += yaml.safe_load(f)

if invoice_settings.keycloak_client_id:
logger.info("Loading Coldfront data from remote server")
api_data += self._fetch_coldfront_allocation_api()

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This now introduces the possibility of an Allocated Project ID being matched to several instances in the ColdFront data. Can you please write a test that documents the behavior in such a scenario?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From 2:30 discussion today, I'll keep the behavior as error out and write the test for it


return api_data

def _get_allocation_data(self, coldfront_api_data):
"""Returns a mapping of (project ID, cluster name) tupels to a dict of project name, PI name, and institution code."""
allocation_data = {}
duplicate_projects = []
for project_dict in coldfront_api_data:
try:
# Allow allocation to not have institute code
Expand All @@ -125,6 +129,8 @@ def _get_allocation_data(self, coldfront_api_data):
project_dict["attributes"].get(CF_ATTR_IS_COURSE, "No").lower()
== "yes"
)
if (project_id, cluster_name) in allocation_data:
duplicate_projects.append((project_id, cluster_name))
allocation_data[(project_id, cluster_name)] = {
invoice.PROJECT_FIELD: project_name,
invoice.PI_FIELD: pi_name,
Expand All @@ -135,6 +141,11 @@ def _get_allocation_data(self, coldfront_api_data):
except KeyError:
continue

if duplicate_projects:
raise ValueError(
f"Found allocations matched more than once in API data: {duplicate_projects}"
)

return allocation_data

def _validate_allocation_data(self, allocation_data):
Expand Down
2 changes: 1 addition & 1 deletion process_report/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

class Settings(BaseSettings):
# Coldfront info
coldfront_api_filepath: str | None = None
coldfront_api_filepaths: tuple[str, ...] = ()
keycloak_client_id: str | None = None
keycloak_client_secret: str | None = None

Expand Down
1 change: 1 addition & 0 deletions process_report/tests/e2e/test_data/test_PI.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
PI,First Invoice Month,Initial Credits,1st Month Used,2nd Month Used
pi4@harvard.edu,2025-06,1000.00,400.00,0.00
pi5@harvard.edu,2025-06,0.00,0.00,0.00
pi6@mit.edu,2025-06,0.00,0.00,0.00
pi4@example.edu,2025-06,0.00,0.00,0.00
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ Invoice Month,Project - Allocation,Project - Allocation ID,Manager (PI),Cluster
2024-01,P1ID,P1ID,,shift,,,,,280,OpenStack GPUA100SXM4,0.013,100
2024-01,P2ID,P2ID,,shift,,,,,280,OpenShift CPU,0.013,200
2024-01,P3ID,P3ID,,shift,,,,,280,Free CPU,0.013,300
2024-01,P4-supplement,P4-supplement,,shift,,,,,300,OpenShift CPU,0.013,400
2024-01,P9ID,P9ID,,shift,,,,,280,OpenShift CPU,0.013,3000
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- project:
pi: pi4@harvard.edu
attributes:
Allocated Project ID: P4-supplement
Allocated Project Name: P4-supplement-name
resource:
name: shift
10 changes: 9 additions & 1 deletion process_report/tests/e2e/test_e2e_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import json
from pathlib import Path
import pandas as pd
import pytest
Expand Down Expand Up @@ -112,7 +113,14 @@ def _prepare_pipeline_execution(
# Environment setup for subprocess execution
env = os.environ.copy()
env["INVOICE_MONTH"] = INVOICE_MONTH
env["COLDFRONT_API_FILEPATH"] = str(test_files["test_coldfront_api_data.json"])

# pydantic_settings parses complex types as JSON-encoded strings: https://pydantic.dev/docs/validation/latest/concepts/pydantic_settings/#parsing-environment-variable-values
env["COLDFRONT_API_FILEPATHS"] = json.dumps(
(
str(test_files["test_coldfront_api_data.json"]),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the file parsing code, you actually only call yaml.safe_load so you don't actually support JSON files anymore. Please reintroduce logic to check the file extension and call the appropriate loading mechanism for JSON or YAML.

@QuanMPhm QuanMPhm Apr 21, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I saw the e2e test pass, I realized yaml.safe_load() can load JSON files as well. Do you still want separate code paths for JSON and YAML loading?

json_data = '{"name": "Alice", "age": 30, "city": "New York"}'
parsed_data = yaml.safe_load(json_data)
# parsed_data is {'name': 'Alice', 'age': 30, 'city': 'New York'}

str(test_files["test_supplement_api_data.yaml"]),
)
)
env["FETCH_FROM_S3"] = "false"
env["UPLOAD_TO_S3"] = "false"
env["invoice_path_template"] = str(test_files["test_invoice_dir"])
Expand Down
102 changes: 100 additions & 2 deletions process_report/tests/unit/processors/test_coldfront_fetch_processor.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
from unittest import mock
import yaml

import pandas
import pytest

from process_report import settings
from process_report.tests import util as test_utils
from process_report.tests.base import BaseTestCase
from process_report.tests.base import BaseTestCaseWithTempDir


class TestColdfrontFetchProcessor(BaseTestCaseWithTempDir):
def setUp(self):
# To trigger fetch from Coldfront
settings.invoice_settings.keycloak_client_id = "foo"
return super().setUp()

class TestColdfrontFetchProcessor(BaseTestCase):
def _get_test_invoice(
self,
allocation_project_id,
Expand Down Expand Up @@ -260,3 +268,93 @@ def test_missing_project_cluster_tuples(self, mock_get_allocation_data):
assert str(cm.value) == (
f"Projects {expected_missing} not found in Coldfront and are billable! Please check the project names"
)

@mock.patch(
"process_report.processors.coldfront_fetch_processor.ColdfrontFetchProcessor._fetch_coldfront_allocation_api",
)
def test_supplement_api_data_used_when_coldfront_missing(
self, mock_get_allocation_data
):
"""Supplement API rows are applied to invoice in _get_allocation_data()."""
mock_get_allocation_data.return_value = self._get_mock_allocation_data(
["P1"],
["PI1"],
["IC1"],
["stack"],
)

# Supplemental data should follow same structure as Coldfront data,
# only missing "Is Course?" and "Institution-Specific Code" fields
supplemental_df = self._get_mock_allocation_data(
["P2"],
["PI2"],
["Delete Institude Code"],
["stack"],
)
del supplemental_df[0]["attributes"]["Institution-Specific Code"]

supplemental_filepath = self.tempdir / "supplement.yaml"
with open(supplemental_filepath, "w") as f:
yaml.safe_dump(supplemental_df, f)

test_invoice = self._get_test_invoice(
["P1", "P2"], cluster_name=["stack", "stack"]
)

expected_invoice = self._get_test_invoice(
["P1", "P2"],
["P1-name", "P2-name"],
["PI1", "PI2"],
["IC1", "N/A"],
["stack", "stack"],
[False, False],
)

test_coldfront_fetch_proc = test_utils.new_coldfront_fetch_processor(
data=test_invoice, coldfront_data_filepaths=[supplemental_filepath]
)
test_coldfront_fetch_proc.process()
output_invoice = test_coldfront_fetch_proc.data

assert output_invoice.equals(expected_invoice)

@mock.patch(
"process_report.processors.coldfront_fetch_processor.ColdfrontFetchProcessor._fetch_coldfront_allocation_api",
)
def test_duplicate_allocation_cluster_in_api_data(self, mock_get_allocation_data):
"""Test that a ValueError is raised when API data contains duplicate (allocation, cluster) pairs."""
mock_data = [
{
"resource": {"name": "stack"},
"project": {"pi": "PI1"},
"attributes": {
"Allocated Project ID": "P1",
"Allocated Project Name": "P1-name",
"Institution-Specific Code": "IC1",
},
},
{
"resource": {"name": "stack"},
"project": {"pi": "PI1"},
"attributes": {
"Allocated Project ID": "P1",
"Allocated Project Name": "P1-name",
"Institution-Specific Code": "IC1",
},
},
]
mock_get_allocation_data.return_value = mock_data

test_invoice = self._get_test_invoice(["P1"], cluster_name=["stack"])

test_coldfront_fetch_proc = test_utils.new_coldfront_fetch_processor(
data=test_invoice
)

with pytest.raises(ValueError) as cm:
test_coldfront_fetch_proc.process()

assert (
str(cm.value)
== "Found allocations matched more than once in API data: [('P1', 'stack')]"
)
8 changes: 6 additions & 2 deletions process_report/tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def new_coldfront_fetch_processor(
invoice_month="0000-00",
data=None,
nonbillable_projects=None,
coldfront_data_filepath=None,
coldfront_data_filepaths=(),
):
if data is None:
data = pandas.DataFrame()
Expand All @@ -71,7 +71,11 @@ def new_coldfront_fetch_processor(
columns=["Project Name", "Cluster", "Is Timed", "Is Billable Override"]
)
return coldfront_fetch_processor.ColdfrontFetchProcessor(
invoice_month, data, name, nonbillable_projects, coldfront_data_filepath
invoice_month,
data,
name,
nonbillable_projects,
coldfront_data_filepaths,
)


Expand Down
Loading