Skip to content

Commit e68a019

Browse files
committed
Handle init columns in a separate processor
During 2026-03 invoicing, a bug was found where the columns initialized by the New-PI credit processor (i.e `PI Balance` column), was being accessed by the PI-SU processor before it was initialized, causing an KeyError. All invoice columns (internal and external exported) will now be initialized by a new processor `InitColumnsProcessor` that runs before any other processor, to remove concern about the ordering of column initialization. The e2e test data has been updated to surface the bug that was found. It did not failed during the PR that introduced the bug [1] because the test data didn't have the right conditions to trigger the PI-SU processor Refactored unit tests to accomodate the new processor by adding a new base test class. [1] #279
1 parent 4029aa1 commit e68a019

16 files changed

Lines changed: 270 additions & 182 deletions

process_report/process_report.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
ocp_test_invoice,
2222
)
2323
from process_report.processors import (
24+
init_columns_processor,
2425
coldfront_fetch_processor,
2526
validate_pi_alias_processor,
2627
add_institution_processor,
@@ -69,6 +70,7 @@ def main():
6970
invoice_month,
7071
merged_dataframe,
7172
[
73+
init_columns_processor.PISUCreditProcessor,
7274
validate_cluster_name_processor.ValidateClusterNameProcessor,
7375
coldfront_fetch_processor.ColdfrontFetchProcessor,
7476
validate_pi_alias_processor.ValidatePIAliasProcessor,

process_report/processors/bu_subsidy_processor.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from dataclasses import dataclass, field
2-
from decimal import Decimal
32

43
from process_report.loader import loader
54
from process_report.invoices import invoice
@@ -21,7 +20,6 @@ def get_project(row):
2120
return project_alloc[: project_alloc.rfind("-")]
2221

2322
self.data[invoice.PROJECT_NAME_FIELD] = self.data.apply(get_project, axis=1)
24-
self.data[invoice.SUBSIDY_FIELD] = Decimal(0)
2523

2624
def _process(self):
2725
self.data = self._apply_subsidy(self.data, self.subsidy_amount)

process_report/processors/coldfront_fetch_processor.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ def _validate_allocation_data(self, allocation_data):
143143
)
144144

145145
def _apply_allocation_data(self, allocation_data):
146-
self.data[invoice.IS_COURSE_FIELD] = False
147146
for project_cluster_tuple, data in allocation_data.items():
148147
project_id, cluster_name = project_cluster_tuple
149148
mask = (self.data[invoice.PROJECT_ID_FIELD] == project_id) & (

process_report/processors/discount_processor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def apply_discount_on_project(remaining_discount_amount, project_i, project):
5151
remaining_project_balance = project[pi_balance_field]
5252
applied_discount = min(remaining_project_balance, remaining_discount_amount)
5353

54-
if invoice.at[project_i, discount_field] is None:
54+
if pandas.isna(invoice.at[project_i, discount_field]):
5555
invoice.at[project_i, discount_field] = applied_discount
5656
else:
5757
invoice.at[project_i, discount_field] += applied_discount
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import logging
2+
from decimal import Decimal
3+
from dataclasses import dataclass
4+
5+
from process_report.invoices import invoice
6+
from process_report.processors import discount_processor
7+
8+
9+
logger = logging.getLogger(__name__)
10+
logging.basicConfig(level=logging.INFO)
11+
12+
INVOICE_FIELD_LIST = [
13+
invoice.INVOICE_DATE_FIELD,
14+
invoice.PROJECT_FIELD,
15+
invoice.PROJECT_ID_FIELD,
16+
invoice.PI_FIELD,
17+
invoice.INVOICE_EMAIL_FIELD,
18+
invoice.INVOICE_ADDRESS_FIELD,
19+
invoice.INSTITUTION_FIELD,
20+
invoice.INSTITUTION_ID_FIELD,
21+
invoice.GROUP_NAME_FIELD,
22+
invoice.GROUP_INSTITUTION_FIELD,
23+
invoice.GROUP_BALANCE_FIELD,
24+
invoice.GROUP_BALANCE_USED_FIELD,
25+
invoice.SU_HOURS_FIELD,
26+
invoice.SU_TYPE_FIELD,
27+
invoice.SU_CHARGE_FIELD,
28+
invoice.LENOVO_CHARGE_FIELD,
29+
invoice.RATE_FIELD,
30+
invoice.COST_FIELD,
31+
invoice.CREDIT_FIELD,
32+
invoice.CREDIT_CODE_FIELD,
33+
invoice.SUBSIDY_FIELD,
34+
invoice.BALANCE_FIELD,
35+
# Internally used fields
36+
invoice.IS_BILLABLE_FIELD,
37+
invoice.MISSING_PI_FIELD,
38+
invoice.PI_BALANCE_FIELD,
39+
invoice.PROJECT_NAME_FIELD,
40+
invoice.GROUP_MANAGED_FIELD,
41+
invoice.CLUSTER_NAME_FIELD,
42+
invoice.IS_COURSE_FIELD,
43+
]
44+
45+
# Fields without defaults have None as default value
46+
FIELD_DEFAULT_MAPPING = {
47+
invoice.BALANCE_FIELD: invoice.COST_FIELD,
48+
invoice.PI_BALANCE_FIELD: invoice.COST_FIELD,
49+
invoice.SUBSIDY_FIELD: Decimal(0),
50+
invoice.IS_COURSE_FIELD: False, # All rows assumed to be non-course, unless marked otherwise by ColdfrontFetchProcessor
51+
}
52+
53+
54+
@dataclass
55+
class PISUCreditProcessor(discount_processor.DiscountProcessor):
56+
"""
57+
Initializes all columns that will be used in later Processors, with appropriate types and default values.
58+
59+
Fields without defaults specified in FIELD_DEFAULT_MAPPING will have None as default value
60+
Pre-existing columns in input dataframe will only be casted, not overwritten
61+
"""
62+
63+
def _process(self):
64+
for field in INVOICE_FIELD_LIST:
65+
if field not in self.data.columns:
66+
default_value = FIELD_DEFAULT_MAPPING.get(field, None)
67+
68+
# If default value is name of another column, copy values from that column and cast to correct type
69+
if default_value in INVOICE_FIELD_LIST:
70+
default_value = self.data[default_value]
71+
72+
self.data[field] = default_value

process_report/processors/new_pi_credit_processor.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,7 @@ def _apply_credits_new_pi(
176176
credit_eligible_projects[invoice.PI_FIELD] == pi
177177
]
178178

179-
if pi_age > 1:
180-
for i, row in pi_projects.iterrows():
181-
data.at[i, invoice.BALANCE_FIELD] = row[invoice.COST_FIELD]
182-
else:
179+
if pi_age <= 1:
183180
if pi_age == 0:
184181
old_pi_df = self._upsert_pi_entry(
185182
old_pi_df,
@@ -226,10 +223,6 @@ def _apply_credits_new_pi(
226223
return (data, old_pi_df)
227224

228225
def _prepare(self):
229-
self.data[invoice.CREDIT_FIELD] = None
230-
self.data[invoice.CREDIT_CODE_FIELD] = None
231-
self.data[invoice.PI_BALANCE_FIELD] = self.data[invoice.COST_FIELD]
232-
self.data[invoice.BALANCE_FIELD] = self.data[invoice.COST_FIELD]
233226
self.old_pi_df = self._load_old_pis(self.old_pi_filepath)
234227

235228
def _process(self):

process_report/processors/prepayment_processor.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,6 @@ def _load_prepay_debits(prepay_debits_filepath):
5656
return prepay_debits
5757

5858
def _prepare(self):
59-
self.data[invoice.GROUP_NAME_FIELD] = None
60-
self.data[invoice.GROUP_INSTITUTION_FIELD] = None
61-
self.data[invoice.GROUP_MANAGED_FIELD] = None
62-
self.data[invoice.GROUP_BALANCE_FIELD] = None
63-
self.data[invoice.GROUP_BALANCE_USED_FIELD] = None
64-
6559
self.prepay_debits = self._load_prepay_debits(self.prepay_debits_filepath)
6660
self.group_info_dict = self._get_prepay_group_dict()
6761
if self.upload_to_s3:

process_report/tests/base.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,49 @@
33
from pathlib import Path
44
from unittest import TestCase
55

6+
import pandas
67

7-
class BaseTestCaseWithTempDir(TestCase):
8+
9+
INVOICE_FIELD_LIST = [
10+
"Invoice Month",
11+
"Project - Allocation",
12+
"Project - Allocation ID",
13+
"Manager (PI)",
14+
"Invoice Email",
15+
"Invoice Address",
16+
"Institution",
17+
"Institution - Specific Code",
18+
"Prepaid Group Name",
19+
"Prepaid Group Institution",
20+
"Prepaid Group Balance",
21+
"Prepaid Group Used",
22+
"SU Hours (GBhr or SUhr)",
23+
"SU Type",
24+
"SU Charge",
25+
"Charge",
26+
"Rate",
27+
"Cost",
28+
"Credit",
29+
"Credit Code",
30+
"Subsidy",
31+
"Balance",
32+
# Internally used fields
33+
"Is Billable",
34+
"Missing PI",
35+
"PI Balance",
36+
"Project",
37+
"MGHPCC Managed",
38+
"Cluster Name",
39+
"Is Course",
40+
]
41+
42+
43+
class BaseTesCase(TestCase):
44+
def create_test_invoice(self, data_dict: dict):
45+
return pandas.DataFrame(data_dict, columns=INVOICE_FIELD_LIST)
46+
47+
48+
class BaseTestCaseWithTempDir(BaseTesCase):
849
def setUp(self):
950
self.tempdir = Path(tempfile.TemporaryDirectory(delete=False).name)
1051

process_report/tests/e2e/test_data/test_invoices/test_NERC OpenShift 2025-04.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@ Invoice Month,Project - Allocation,Project - Allocation ID,Manager (PI),Cluster
22
2024-01,P1ID,P1ID,,shift,,,,,280,OpenShift CPU,0.013,100.0
33
2024-01,P1ID,P1ID,,shift,,,,,280,OpenStack GPUA100SXM4,0.013,100
44
2024-01,P2ID,P2ID,,shift,,,,,280,OpenShift CPU,0.013,200
5-
2024-01,P3ID,P3ID,,shift,,,,,280,OpenShift CPU,0.013,300
5+
2024-01,P3ID,P3ID,,shift,,,,,280,Free CPU,0.013,300
66
2024-01,P9ID,P9ID,,shift,,,,,280,OpenShift CPU,0.013,3000
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
- username: PI9
2-
- username: PI10
2+
- username: pi2@harvard.edu
33
non_billed_su_types:
4-
- name: SU1
4+
- name: Free CPU

0 commit comments

Comments
 (0)