
Commit ca78e5c

Feat: make D expenditures from NetFile V2 API (#334)

1 parent 322efde

26 files changed: +28154 additions, −424 deletions

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -5,3 +5,4 @@ inputs
 .local
 **/__pycache__
 build/candidates.xlsx
+.vscode/

download/README.md

Lines changed: 5 additions & 1 deletion

@@ -1,3 +1,7 @@
 # Query NetFile V2 API to Load Disclosure-Backend DB
 
-Run main.py to download raw JSON files and create csv files. To download, the key for the service account used to access Google Drive has to be placed in the file .local/SERVICE_ACCOUNT_KEY_JSON.json.
+Run main.py to download redacted JSON files from Google Drive and create csv files. To download, the key for the service account used to access Google Drive has to be placed in the file .local/SERVICE_ACCOUNT_KEY_JSON.json.
+
+# Run tests
+
+Tests are in the folder **tests**. To run them all, simply do `pytest tests`.
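The commit does not show `gdrive_datastore/gdrive.py`, so how the key file is consumed is not visible on this page. As a minimal sketch, assuming the downloader uses the standard google-auth and google-api-python-client libraries (the scope and client construction below are assumptions, not code from this commit):

```python
# Hypothetical sketch: how a service-account key such as
# .local/SERVICE_ACCOUNT_KEY_JSON.json is typically turned into a Drive client.
from google.oauth2 import service_account
from googleapiclient.discovery import build

KEY_PATH = '.local/SERVICE_ACCOUNT_KEY_JSON.json'

credentials = service_account.Credentials.from_service_account_file(
    KEY_PATH,
    scopes=['https://www.googleapis.com/auth/drive.readonly'],  # assumed scope
)
drive = build('drive', 'v3', credentials=credentials)  # assumed client setup
```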

download/conftest.py

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+''' Pytest config '''
+
+pytest_plugins = [
+    # Autoload all fixtures in every test because they are kept in a separate file from the tests themselves
+    "tests.fixtures.data_fixtures"
+]
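The fixture module itself lives elsewhere in the commit. As an illustration only, a fixture registered this way in `tests/fixtures/data_fixtures.py` might look like the following (the fixture name and sample-data path are hypothetical):

```python
# tests/fixtures/data_fixtures.py -- hypothetical example, not the committed file
import json

import pytest

@pytest.fixture
def elections_json():
    """ A small JSON payload shaped like the NetFile V2 elections response """
    with open('tests/data/elections_sample.json', encoding='utf8') as f:  # assumed path
        return json.loads(f.read())
```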

download/main.py

Lines changed: 16 additions & 33 deletions

@@ -1,22 +1,16 @@
 """ main, to run everything """
-from collections import Counter
-from datetime import datetime
 import json
-import pandas as pd
-from sqlalchemy import create_engine
 from model.a_contributions import A_Contributions
 from model.committee import Committees
-from model.election import Elections
+# Next line ignored because Pylint reports it cannot find election in model
+from model.election import Elections  # pylint: disable=import-error,no-name-in-module
 from model.filing import Filings
 from model.transaction import Transactions
 
 from gdrive_datastore.gdrive import pull_data
 
-def get_last_status(status_list):
-    """
-    Return a tuple of index, status_item
-    for max value of status_item['startDate']
-    """
+DATA_DIR_PATH = '.local/downloads'
+OUTPUT_DIR = '.local'
 
 def unique_statuses(filers):
     """ What are the unique values for status? """
@@ -27,39 +21,35 @@ def unique_statuses(filers):
 
 def main():
     """ Do everything """
-    data_dir_path = '.local/downloads'
-
     # pull data from gdrive and put it in .local/downloads
     pull_data(subfolder='main', default_folder='OpenDisclosure')
 
-    #engine = create_engine('postgresql+psycopg2://localhost/disclosure-backend-v2', echo=True)
-
-    with open(f'{data_dir_path}/elections.json', encoding='utf8') as f:
+    with open(f'{DATA_DIR_PATH}/elections.json', encoding='utf8') as f:
         elections_json = json.loads(f.read())
 
     elections = Elections(elections_json)
 
-    with open(f'{data_dir_path}/filers.json', encoding='utf8') as f:
+    with open(f'{DATA_DIR_PATH}/filers.json', encoding='utf8') as f:
         filers = json.loads(f.read())
 
-    committees = Committees.from_filers(filers, elections.df)
+    committees = Committees(filers, elections)
 
     # A-Contribs:
     # join filers + filings + elections + transactions
    # transactions.filing_nid -> filings.filing_nid
    # filings.filer_nid -> committees.filer_nid
    # committees.Ballot_Measure_Election -> elections.Ballot_Measure_Election
    # where trans['transaction']['calTransactionType'] == 'F460A'
-    with open(f'{data_dir_path}/filings.json', encoding='utf8') as f:
-        filings = Filings(json.loads(f.read())).df
+    with open(f'{DATA_DIR_PATH}/filings.json', encoding='utf8') as f:
+        filings = Filings(json.loads(f.read()))
 
-    with open(f'{data_dir_path}/transactions.json', encoding='utf8') as f:
+    with open(f'{DATA_DIR_PATH}/transactions.json', encoding='utf8') as f:
         records = json.loads(f.read())
-        transactions = Transactions(records).df
+        transactions = Transactions(records)
 
-    a_contributions = A_Contributions(transactions, filings, committees.df)
+    a_contributions = A_Contributions(transactions, filings, committees)
     a_contribs_df = a_contributions.df
-    if not a_contribs_df.empty:
+    if not a_contribs_df.is_empty():
         print(a_contribs_df.drop(columns=[
             'BakRef_TID',
             'Bal_Name',
@@ -82,16 +72,9 @@ def main():
             'XRef_Match',
         ]).sample(n=20))
 
-    elections.df.to_csv('.local/elections.csv', index=False)
-    committees.df.to_csv('.local/committees.csv', index=False)
-    a_contributions.df.to_csv('.local/a_contributions.csv', index=False)
-
-    '''
-    with engine.connect() as conn:
-        elections.to_sql(conn)
-        committees.to_sql(conn)
-        a_contributions.to_sql(conn)
-    '''
+    elections.df.write_csv(f'{OUTPUT_DIR}/elections.csv')
+    committees.df.write_csv(f'{OUTPUT_DIR}/committees.csv')
+    a_contributions.df.write_csv(f'{OUTPUT_DIR}/a_contributions.csv')
 
 if __name__ == '__main__':
     main()
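The substantive change here is the swap from pandas to polars: `to_csv(..., index=False)` becomes `write_csv(...)` (polars frames carry no index), emptiness is checked with the `is_empty()` method rather than pandas' `.empty` attribute, and the models now hand back polars frames. A minimal, self-contained illustration of that pattern (the sample record is made up):

```python
import polars as pl

# A made-up record shaped loosely like one output row
records = [{'Filer_ID': '123456789', 'Tran_Amt1': 100.0}]

lazy = pl.LazyFrame(records, schema={'Filer_ID': pl.Utf8, 'Tran_Amt1': pl.Float64})
df = lazy.collect()          # materialize the lazy query into a DataFrame

if not df.is_empty():        # is_empty() is a method call, unlike pandas' .empty
    df.write_csv('.local/example.csv')  # no index=False: polars has no index
```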

download/model/a_contributions.py

Lines changed: 14 additions & 192 deletions

@@ -2,203 +2,25 @@
 Schedule A, Contributions
 Hopefully this can be joined with other Schedule classes into a single Transaction class
 """
-import pandas as pd
-from sqlalchemy.types import BOOLEAN, DATE, DOUBLE_PRECISION, INTEGER, TIME, VARCHAR
-from .base import BaseModel
+from .committee import Committees
+from .filing import Filings
+from .transaction import Transactions
+from .schedule import ScheduleBase
 
-class A_Contributions(BaseModel):
+class A_Contributions(ScheduleBase):
     """
     Each record represents Schedule A - Contributions from form 460
     """
     def __init__(
         self,
-        transactions:pd.DataFrame,
-        filings:pd.DataFrame,
-        committees:pd.DataFrame
+        transactions:Transactions,
+        filings:Filings,
+        committees:Committees
     ):
-        f460a_trans = transactions.loc[transactions['cal_tran_type'] == 'F460A'].drop(
-            columns=['cal_tran_type']
+        self._form_id = 'F460A'
+        super().__init__(
+            self._form_id,
+            transactions,
+            filings,
+            committees
         )
-
-        unique_committees = committees.groupby(['Filer_ID'], as_index=False).first()[
-            ['filer_nid','Filer_ID','Filer_NamL','_Committee_Type']
-        ]
-
-        committee_filings = unique_committees.merge(filings, on='filer_nid', how='left').drop(
-            columns=['filer_nid']
-        ).rename(
-            columns={
-                'RptNum': 'Report_Num',
-                '_Committee_Type': 'Committee_Type'
-            }
-        )
-
-        committees_sans_filings = committee_filings[committee_filings['filing_nid'].isna()]
-
-        f460a = committee_filings.merge(f460a_trans,
-            how='inner',
-            on='filing_nid'
-        ).drop(
-            columns=['filing_nid']
-        )
-
-        f460a[['Form_Type','tblCover_Offic_Dscr','tblCover_Office_Cd']] = ['00:00:00', '', '']
-
-        super().__init__(f460a)
-
-        self._dtypes = {
-            'Filer_ID': 'string',
-            'Filer_NamL': 'string',
-            'Report_Num': 'Int64',
-            'Committee_Type': 'string',
-            'Rpt_Date': 'string',
-            'From_Date': 'string',
-            'Thru_Date': 'string',
-            'Elect_Date': 'string',
-            'tblCover_Office_Cd': 'string',
-            'tblCover_Offic_Dscr': 'string',
-            'Rec_Type': 'string',
-            'Form_Type': 'string',
-            'Tran_ID': 'string',
-            'Entity_Cd': 'string',
-            'Tran_NamL': 'string',
-            'Tran_NamF': 'string',
-            'Tran_NamT': 'string',
-            'Tran_NamS': 'string',
-            'Tran_Adr1': 'string',
-            'Tran_Adr2': 'string',
-            'Tran_City': 'string',
-            'Tran_State': 'string',
-            'Tran_Zip4': 'string',
-            'Tran_Emp': 'string',
-            'Tran_Occ': 'string',
-            'Tran_Self': bool,
-            'Tran_Type': 'string',
-            'Tran_Date': 'string',
-            'Tran_Date1': 'string',
-            'Tran_Amt1': float,
-            'Tran_Amt2': float,
-            'Tran_Dscr': 'string',
-            'Cmte_ID': 'string',
-            'Tres_NamL': 'string',
-            'Tres_NamF': 'string',
-            'Tres_NamT': 'string',
-            'Tres_NamS': 'string',
-            'Tres_Adr1': 'string',
-            'Tres_Adr2': 'string',
-            'Tres_City': 'string',
-            'Tres_State': 'string',
-            'Tres_Zip': 'string',
-            'Intr_NamL': 'string',
-            'Intr_NamF': 'string',
-            'Intr_NamT': 'string',
-            'Intr_NamS': 'string',
-            'Intr_Adr1': 'string',
-            'Intr_Adr2': 'string',
-            'Intr_City': 'string',
-            'Intr_State': 'string',
-            'Intr_Zip4': 'string',
-            'Intr_Emp': 'string',
-            'Intr_Occ': 'string',
-            'Intr_Self': bool,
-            'Cand_NamL': 'string',
-            'Cand_NamF': 'string',
-            'Cand_NamT': 'string',
-            'Cand_NamS': 'string',
-            'tblDetlTran_Office_Cd': 'string',
-            'tblDetlTran_Offic_Dscr': 'string',
-            'Juris_Cd': 'string',
-            'Juris_Dscr': 'string',
-            'Dist_No': 'string',
-            'Off_S_H_Cd': 'string',
-            'Bal_Name': 'string',
-            'Bal_Num': 'string',
-            'Bal_Juris': 'string',
-            'Sup_Opp_Cd': 'string',
-            'Memo_Code': 'string',
-            'Memo_RefNo': 'string',
-            'BakRef_TID': 'string',
-            'XRef_SchNm': 'string',
-            'XRef_Match': 'string',
-            'Loan_Rate': 'string',
-            'Int_CmteId': 'Int64'
-        }
-        self._sql_dtypes = {
-            'Filer_ID': VARCHAR(9),
-            'Filer_NamL': VARCHAR(183),
-            'Report_Num': INTEGER,
-            'Committee_Type': VARCHAR(64),
-            'Rpt_Date': DATE,
-            'From_Date': DATE,
-            'Thru_Date': DATE,
-            'Elect_Date': DATE,
-            'tblCover_Office_Cd': VARCHAR(64),
-            'tblCover_Offic_Dscr': VARCHAR(64),
-            'Rec_Type': VARCHAR(4),
-            'Form_Type': TIME,
-            'Tran_ID': VARCHAR(12),
-            'Entity_Cd': VARCHAR(3),
-            'Tran_NamL': VARCHAR(199),
-            'Tran_NamF': VARCHAR(38),
-            'Tran_NamT': VARCHAR(6),
-            'Tran_NamS': VARCHAR(5),
-            'Tran_Adr1': VARCHAR(64),
-            'Tran_Adr2': VARCHAR(64),
-            'Tran_City': VARCHAR(50),
-            'Tran_State': VARCHAR(4),
-            'Tran_Zip4': VARCHAR(10),
-            'Tran_Emp': VARCHAR(92),
-            'Tran_Occ': VARCHAR(60),
-            'Tran_Self': BOOLEAN,
-            'Tran_Type': VARCHAR(4),
-            'Tran_Date': DATE,
-            'Tran_Date1': DATE,
-            'Tran_Amt1': DOUBLE_PRECISION,
-            'Tran_Amt2': DOUBLE_PRECISION,
-            'Tran_Dscr': VARCHAR(56),
-            'Cmte_ID': VARCHAR(9),
-            'Tres_NamL': VARCHAR(4),
-            'Tres_NamF': VARCHAR(4),
-            'Tres_NamT': VARCHAR(64),
-            'Tres_NamS': VARCHAR(64),
-            'Tres_Adr1': VARCHAR(64),
-            'Tres_Adr2': VARCHAR(64),
-            'Tres_City': VARCHAR(7),
-            'Tres_State': VARCHAR(4),
-            'Tres_Zip': INTEGER,
-            'Intr_NamL': VARCHAR(74),
-            'Intr_NamF': VARCHAR(6),
-            'Intr_NamT': VARCHAR(64),
-            'Intr_NamS': VARCHAR(64),
-            'Intr_Adr1': VARCHAR(64),
-            'Intr_Adr2': VARCHAR(64),
-            'Intr_City': VARCHAR(13),
-            'Intr_State': VARCHAR(4),
-            'Intr_Zip4': VARCHAR(10),
-            'Intr_Emp': VARCHAR(15),
-            'Intr_Occ': VARCHAR(8),
-            'Intr_Self': BOOLEAN,
-            'Cand_NamL': VARCHAR(64),
-            'Cand_NamF': VARCHAR(64),
-            'Cand_NamT': VARCHAR(64),
-            'Cand_NamS': VARCHAR(64),
-            'tblDetlTran_Office_Cd': VARCHAR(4),
-            'tblDetlTran_Offic_Dscr': VARCHAR(19),
-            'Juris_Cd': VARCHAR(4),
-            'Juris_Dscr': VARCHAR(64),
-            'Dist_No': VARCHAR(64),
-            'Off_S_H_Cd': VARCHAR(64),
-            'Bal_Name': VARCHAR(64),
-            'Bal_Num': VARCHAR(4),
-            'Bal_Juris': VARCHAR(64),
-            'Sup_Opp_Cd': VARCHAR(64),
-            'Memo_Code': VARCHAR(64),
-            'Memo_RefNo': VARCHAR(11),
-            'BakRef_TID': VARCHAR(64),
-            'XRef_SchNm': VARCHAR(64),
-            'XRef_Match': VARCHAR(64),
-            'Loan_Rate': VARCHAR(64),
-            'Int_CmteId': INTEGER
-        }
-        self._sql_cols = list(self._sql_dtypes.keys())
-        self._sql_table_name = 'A-Contributions'
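`ScheduleBase` lives in `model/schedule.py`, one of the 26 changed files but not shown on this page. Judging from the `super().__init__(self._form_id, transactions, filings, committees)` call and the join plan spelled out in main.py's comments, a sketch of what it plausibly does (the body below is an assumption, not the committed code):

```python
# Hypothetical sketch of model/schedule.py -- not the committed code
import polars as pl

from .base import BaseModel

class ScheduleBase(BaseModel):
    """ Shared logic for Form 460 schedule classes: filter by form id, then join """
    def __init__(self, form_id, transactions, filings, committees):
        # Mirror the join chain from main.py's comments:
        # transactions.filing_nid -> filings.filing_nid,
        # filings.filer_nid -> committees.filer_nid
        joined = (
            transactions.lazy
            .filter(pl.col('cal_tran_type') == form_id)  # e.g. 'F460A'
            .join(filings.lazy, on='filing_nid', how='inner')
            .join(committees.lazy, on='filer_nid', how='inner')
        )
        super().__init__(joined.collect().to_dicts())
```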

download/model/base.py

Lines changed: 14 additions & 18 deletions

@@ -1,12 +1,14 @@
 """ This is the base model, upon all others shall be based """
-import pandas as pd
+import polars as pl
 
 class BaseModel:
     """ Base model other models inherit from """
     def __init__(self, data):
         self._data = data
         self._df = None
+        self._lazy = None
         self._dtypes = []
+        self._pl_dtypes = []
         self._sql_dtypes = []
         self._sql_cols = []
         self._sql_table_name = ''
@@ -16,25 +18,19 @@ def data(self):
         """ Just return the data """
         return self._data
 
+    @property
+    def lazy(self):
+        ''' Return a Polars LazyFrame '''
+        if self._lazy is None:
+            self._lazy = pl.LazyFrame(self._data, schema=self._dtypes)
+
+        return self._lazy
+
     @property
     def df(self):
-        """ Get a dataframe of the data """
-        if self._df is None or self._df.empty:
-            self._df = pd.DataFrame(self._data).astype(self._dtypes)
+        ''' Return a Polars dataframe '''
+        if self._df is None:
+            self._df = self.lazy.collect()
 
         return self._df
 
-    def to_sql(self, connection, **kwargs):
-        """ Write to a postgresql table """
-        options = {
-            'index_label': 'id',
-            'if_exists': 'replace'
-        }
-        options.update(kwargs)
-
-        self.df[self._sql_cols].to_sql(
-            self._sql_table_name,
-            connection,
-            dtype=self._sql_dtypes,
-            **options
-        )
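With this change every model is lazy by default: `lazy` builds a cached `pl.LazyFrame` from the raw records and the subclass's schema, and `df` collects it once and caches the result. A small usage sketch, with a made-up subclass and schema:

```python
import polars as pl

from model.base import BaseModel

class Toy(BaseModel):
    """ Illustrative subclass: provide a schema, inherit lazy/df from BaseModel """
    def __init__(self, data):
        super().__init__(data)
        self._dtypes = {'name': pl.Utf8, 'total': pl.Float64}  # made-up schema

toy = Toy([{'name': 'Measure Q', 'total': 12.5}])
print(toy.lazy)  # just a query plan; nothing is computed yet
print(toy.df)    # first access collects and caches the DataFrame
```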
