11from dataclasses import dataclass
2+ from typing import Any , Callable
3+ from decimal import Decimal
24import pandas
5+ import pyarrow
6+ import logging
37
48import process_report .util as util
59
610
11+ logger = logging .getLogger (__name__ )
12+
13+
14+ @dataclass
15+ class InvoiceColumn :
16+ name : str
17+ dtype : Any
18+ default_value : Any | None = None
19+ default_initializer : Callable [[pandas .DataFrame ], pandas .Series ] | None = None
20+
21+
# Field type definitions
# Arrow-backed decimal with precision 21 and scale 2
BALANCE_FIELD_TYPE = pandas.ArrowDtype(pyarrow.decimal128(21, 2))
# Arrow-backed decimal with precision 21 and scale 13
RATE_FIELD_TYPE = pandas.ArrowDtype(pyarrow.decimal128(21, 13))
INTEGER_FIELD_TYPE = pandas.ArrowDtype(pyarrow.int64())
STRING_FIELD_TYPE = pandas.StringDtype()
BOOL_FIELD_TYPE = pandas.BooleanDtype()
28+
29+
730### PI file field names
831PI_PI_FIELD = "PI"
932PI_FIRST_MONTH = "First Invoice Month"
6588IS_COURSE_FIELD = "Is Course"
6689###
6790
### Initialized Column objects
# Each object pairs a field-name constant with the dtype the column should be
# cast to and, where given, the default used when the column has to be created.
INVOICE_DATE_COLUMN = InvoiceColumn(name=INVOICE_DATE_FIELD, dtype=STRING_FIELD_TYPE)
PROJECT_COLUMN = InvoiceColumn(name=PROJECT_FIELD, dtype=STRING_FIELD_TYPE)
PROJECT_ID_COLUMN = InvoiceColumn(name=PROJECT_ID_FIELD, dtype=STRING_FIELD_TYPE)
PI_COLUMN = InvoiceColumn(name=PI_FIELD, dtype=STRING_FIELD_TYPE)
INVOICE_EMAIL_COLUMN = InvoiceColumn(name=INVOICE_EMAIL_FIELD, dtype=STRING_FIELD_TYPE)
INVOICE_ADDRESS_COLUMN = InvoiceColumn(
    name=INVOICE_ADDRESS_FIELD, dtype=STRING_FIELD_TYPE
)
INSTITUTION_COLUMN = InvoiceColumn(name=INSTITUTION_FIELD, dtype=STRING_FIELD_TYPE)
INSTITUTION_ID_COLUMN = InvoiceColumn(
    name=INSTITUTION_ID_FIELD, dtype=STRING_FIELD_TYPE
)
GROUP_NAME_COLUMN = InvoiceColumn(name=GROUP_NAME_FIELD, dtype=STRING_FIELD_TYPE)
GROUP_INSTITUTION_COLUMN = InvoiceColumn(
    name=GROUP_INSTITUTION_FIELD, dtype=STRING_FIELD_TYPE
)
GROUP_BALANCE_COLUMN = InvoiceColumn(name=GROUP_BALANCE_FIELD, dtype=BALANCE_FIELD_TYPE)
GROUP_BALANCE_USED_COLUMN = InvoiceColumn(
    name=GROUP_BALANCE_USED_FIELD, dtype=BALANCE_FIELD_TYPE
)
SU_HOURS_COLUMN = InvoiceColumn(name=SU_HOURS_FIELD, dtype=INTEGER_FIELD_TYPE)
SU_TYPE_COLUMN = InvoiceColumn(name=SU_TYPE_FIELD, dtype=STRING_FIELD_TYPE)
SU_CHARGE_COLUMN = InvoiceColumn(name=SU_CHARGE_FIELD, dtype=BALANCE_FIELD_TYPE)
LENOVO_CHARGE_COLUMN = InvoiceColumn(name=LENOVO_CHARGE_FIELD, dtype=BALANCE_FIELD_TYPE)
# Using decimal to suppress scientific notation in export
RATE_COLUMN = InvoiceColumn(name=RATE_FIELD, dtype=RATE_FIELD_TYPE)
COST_COLUMN = InvoiceColumn(name=COST_FIELD, dtype=BALANCE_FIELD_TYPE)
CREDIT_COLUMN = InvoiceColumn(name=CREDIT_FIELD, dtype=BALANCE_FIELD_TYPE)
CREDIT_CODE_COLUMN = InvoiceColumn(name=CREDIT_CODE_FIELD, dtype=STRING_FIELD_TYPE)
SUBSIDY_COLUMN = InvoiceColumn(
    name=SUBSIDY_FIELD, dtype=BALANCE_FIELD_TYPE, default_value=Decimal(0)
)
# Both balance columns are initialized from the cost column of the dataframe
BALANCE_COLUMN = InvoiceColumn(
    name=BALANCE_FIELD,
    dtype=BALANCE_FIELD_TYPE,
    default_initializer=lambda df: df[COST_FIELD],
)
PI_BALANCE_COLUMN = InvoiceColumn(
    name=PI_BALANCE_FIELD,
    dtype=BALANCE_FIELD_TYPE,
    default_initializer=lambda df: df[COST_FIELD],
)

# Internally used fields
IS_BILLABLE_COLUMN = InvoiceColumn(name=IS_BILLABLE_FIELD, dtype=BOOL_FIELD_TYPE)
MISSING_PI_COLUMN = InvoiceColumn(name=MISSING_PI_FIELD, dtype=BOOL_FIELD_TYPE)
PROJECT_NAME_COLUMN = InvoiceColumn(name=PROJECT_NAME_FIELD, dtype=STRING_FIELD_TYPE)
GROUP_MANAGED_COLUMN = InvoiceColumn(name=GROUP_MANAGED_FIELD, dtype=BOOL_FIELD_TYPE)
CLUSTER_NAME_COLUMN = InvoiceColumn(name=CLUSTER_NAME_FIELD, dtype=STRING_FIELD_TYPE)
IS_COURSE_COLUMN = InvoiceColumn(
    name=IS_COURSE_FIELD, dtype=BOOL_FIELD_TYPE, default_value=False
)
###
146+
68147
69148@dataclass
70149class Invoice :
71150 export_columns_list = list ()
72151 exported_columns_map = dict ()
152+ initializes_columns = tuple ()
153+ operates_on_columns = tuple ()
73154
74155 invoice_month : str
75156 data : pandas .DataFrame
76157 name : str = ""
77158 export_data = None
78159
79160 def process (self ):
161+ self ._init_columns ()
80162 self ._prepare ()
81163 self ._process ()
82164 self ._prepare_export ()
@@ -93,6 +175,24 @@ def output_s3_key(self) -> str:
93175 def output_s3_archive_key (self ):
94176 return f"Invoices/{ self .invoice_month } /Archive/{ self .name } { self .invoice_month } { util .get_iso8601_time ()} .csv"
95177
178+ def _init_columns (self ):
179+ """Initializes columns specified in `initializes_columns` and cast them to appropriate types
180+
181+ If column already exists, only do casting
182+ If no default value is given, column initialized to None
183+ """
184+ for field in self .initializes_columns :
185+ if field .name not in self .data .columns :
186+ field_default = field .default_value
187+ if field .default_initializer :
188+ field_default = field .default_initializer (self .data )
189+ self .data [field .name ] = field_default
190+ elif self .data .dtypes [field .name ] != field .dtype :
191+ logger .warning (
192+ f"Column { field .name } has dtype { self .data .dtypes [field .name ]} instead of expected { field .dtype } ."
193+ )
194+ self .data = self .data .astype ({field .name : field .dtype })
195+
96196 def _prepare (self ):
97197 """Prepares the data for processing.
98198
0 commit comments