
Commit ca78e5c

Feat: make D expenditures from NetFile V2 API (#334)

1 parent 322efde

26 files changed: +28154 additions, −424 deletions

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -5,3 +5,4 @@ inputs
 .local
 **/__pycache__
 build/candidates.xlsx
+.vscode/

download/README.md

Lines changed: 5 additions & 1 deletion

@@ -1,3 +1,7 @@
 # Query NetFile V2 API to Load Disclosure-Backend DB
 
-Run main.py to download raw JSON files and create csv files. To download, the key for the service account used to access Google Drive has to be placed in the file .local/SERVICE_ACCOUNT_KEY_JSON.json.
+Run main.py to download redacted JSON files from Google Drive and create csv files. To download, the key for the service account used to access Google Drive has to be placed in the file .local/SERVICE_ACCOUNT_KEY_JSON.json.
+
+# Run tests
+
+Tests are in the folder **tests**. To run them all, simply do `pytest tests`.
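The commit does not show `gdrive_datastore/gdrive.py`, so how the key file is consumed is not visible on this page. As a minimal sketch, assuming the downloader uses the standard google-auth and google-api-python-client libraries (the scope and client construction below are assumptions, not code from this commit):

```python
# Hypothetical sketch: how a service-account key such as
# .local/SERVICE_ACCOUNT_KEY_JSON.json is typically turned into a Drive client.
from google.oauth2 import service_account
from googleapiclient.discovery import build

KEY_PATH = '.local/SERVICE_ACCOUNT_KEY_JSON.json'

credentials = service_account.Credentials.from_service_account_file(
    KEY_PATH,
    scopes=['https://www.googleapis.com/auth/drive.readonly'],  # assumed scope
)
drive = build('drive', 'v3', credentials=credentials)  # assumed client setup
```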

download/conftest.py

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+''' Pytest config '''
+
+pytest_plugins = [
+    # Autoload all fixtures in every test because they are kept in a separate file from the tests themselves
+    "tests.fixtures.data_fixtures"
+]
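The fixture module itself lives elsewhere in the commit. As an illustration only, a fixture registered this way in `tests/fixtures/data_fixtures.py` might look like the following (the fixture name and sample-data path are hypothetical):

```python
# tests/fixtures/data_fixtures.py -- hypothetical example, not the committed file
import json

import pytest

@pytest.fixture
def elections_json():
    """ A small JSON payload shaped like the NetFile V2 elections response """
    with open('tests/data/elections_sample.json', encoding='utf8') as f:  # assumed path
        return json.loads(f.read())
```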

download/main.py

Lines changed: 16 additions & 33 deletions

@@ -1,22 +1,16 @@
 """ main, to run everything """
-from collections import Counter
-from datetime import datetime
 import json
-import pandas as pd
-from sqlalchemy import create_engine
 from model.a_contributions import A_Contributions
 from model.committee import Committees
-from model.election import Elections
+# Next line ignored because Pylint reports it cannot find election in model
+from model.election import Elections  # pylint: disable=import-error,no-name-in-module
 from model.filing import Filings
 from model.transaction import Transactions
 
 from gdrive_datastore.gdrive import pull_data
 
-def get_last_status(status_list):
-    """
-    Return a tuple of index, status_item
-    for max value of status_item['startDate']
-    """
+DATA_DIR_PATH = '.local/downloads'
+OUTPUT_DIR = '.local'
 
 def unique_statuses(filers):
     """ What are the unique values for status? """
@@ -27,39 +21,35 @@ def unique_statuses(filers):
 
 def main():
     """ Do everything """
-    data_dir_path = '.local/downloads'
-
     # pull data from gdrive and put it in .local/downloads
     pull_data(subfolder='main', default_folder='OpenDisclosure')
 
-    #engine = create_engine('postgresql+psycopg2://localhost/disclosure-backend-v2', echo=True)
-
-    with open(f'{data_dir_path}/elections.json', encoding='utf8') as f:
+    with open(f'{DATA_DIR_PATH}/elections.json', encoding='utf8') as f:
         elections_json = json.loads(f.read())
 
     elections = Elections(elections_json)
 
-    with open(f'{data_dir_path}/filers.json', encoding='utf8') as f:
+    with open(f'{DATA_DIR_PATH}/filers.json', encoding='utf8') as f:
         filers = json.loads(f.read())
 
-    committees = Committees.from_filers(filers, elections.df)
+    committees = Committees(filers, elections)
 
     # A-Contribs:
     # join filers + filings + elections + transactions
    # transactions.filing_nid -> filings.filing_nid
    # filings.filer_nid -> committees.filer_nid
    # committees.Ballot_Measure_Election -> elections.Ballot_Measure_Election
    # where trans['transaction']['calTransactionType'] == 'F460A'
-    with open(f'{data_dir_path}/filings.json', encoding='utf8') as f:
-        filings = Filings(json.loads(f.read())).df
+    with open(f'{DATA_DIR_PATH}/filings.json', encoding='utf8') as f:
+        filings = Filings(json.loads(f.read()))
 
-    with open(f'{data_dir_path}/transactions.json', encoding='utf8') as f:
+    with open(f'{DATA_DIR_PATH}/transactions.json', encoding='utf8') as f:
         records = json.loads(f.read())
-        transactions = Transactions(records).df
+        transactions = Transactions(records)
 
-    a_contributions = A_Contributions(transactions, filings, committees.df)
+    a_contributions = A_Contributions(transactions, filings, committees)
     a_contribs_df = a_contributions.df
-    if not a_contribs_df.empty:
+    if not a_contribs_df.is_empty():
         print(a_contribs_df.drop(columns=[
             'BakRef_TID',
             'Bal_Name',
@@ -82,16 +72,9 @@ def main():
             'XRef_Match',
         ]).sample(n=20))
 
-    elections.df.to_csv('.local/elections.csv', index=False)
-    committees.df.to_csv('.local/committees.csv', index=False)
-    a_contributions.df.to_csv('.local/a_contributions.csv', index=False)
-
-    '''
-    with engine.connect() as conn:
-        elections.to_sql(conn)
-        committees.to_sql(conn)
-        a_contributions.to_sql(conn)
-    '''
+    elections.df.write_csv(f'{OUTPUT_DIR}/elections.csv')
+    committees.df.write_csv(f'{OUTPUT_DIR}/committees.csv')
+    a_contributions.df.write_csv(f'{OUTPUT_DIR}/a_contributions.csv')
 
 if __name__ == '__main__':
     main()
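The substantive change here is the swap from pandas to polars: `to_csv(..., index=False)` becomes `write_csv(...)` (polars frames carry no index), emptiness is checked with the `is_empty()` method rather than pandas' `.empty` attribute, and the models now hand back polars frames. A minimal, self-contained illustration of that pattern (the sample record is made up):

```python
import polars as pl

# A made-up record shaped loosely like one output row
records = [{'Filer_ID': '123456789', 'Tran_Amt1': 100.0}]

lazy = pl.LazyFrame(records, schema={'Filer_ID': pl.Utf8, 'Tran_Amt1': pl.Float64})
df = lazy.collect()          # materialize the lazy query into a DataFrame

if not df.is_empty():        # is_empty() is a method call, unlike pandas' .empty
    df.write_csv('.local/example.csv')  # no index=False: polars has no index
```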

download/model/a_contributions.py

Lines changed: 14 additions & 192 deletions

@@ -2,203 +2,25 @@
 Schedule A, Contributions
 Hopefully this can be joined with other Schedule classes into a single Transaction class
 """
-import pandas as pd
-from sqlalchemy.types import BOOLEAN, DATE, DOUBLE_PRECISION, INTEGER, TIME, VARCHAR
-from .base import BaseModel
+from .committee import Committees
+from .filing import Filings
+from .transaction import Transactions
+from .schedule import ScheduleBase
 
-class A_Contributions(BaseModel):
+class A_Contributions(ScheduleBase):
     """
     Each record represents Schedule A - Contributions from form 460
     """
     def __init__(
         self,
-        transactions:pd.DataFrame,
-        filings:pd.DataFrame,
-        committees:pd.DataFrame
+        transactions:Transactions,
+        filings:Filings,
+        committees:Committees
     ):
-        f460a_trans = transactions.loc[transactions['cal_tran_type'] == 'F460A'].drop(
-            columns=['cal_tran_type']
+        self._form_id = 'F460A'
+        super().__init__(
+            self._form_id,
+            transactions,
+            filings,
+            committees
         )
-
-        unique_committees = committees.groupby(['Filer_ID'], as_index=False).first()[
-            ['filer_nid','Filer_ID','Filer_NamL','_Committee_Type']
-        ]
-
-        committee_filings = unique_committees.merge(filings, on='filer_nid', how='left').drop(
-            columns=['filer_nid']
-        ).rename(
-            columns={
-                'RptNum': 'Report_Num',
-                '_Committee_Type': 'Committee_Type'
-            }
-        )
-
-        committees_sans_filings = committee_filings[committee_filings['filing_nid'].isna()]
-
-        f460a = committee_filings.merge(f460a_trans,
-            how='inner',
-            on='filing_nid'
-        ).drop(
-            columns=['filing_nid']
-        )
-
-        f460a[['Form_Type','tblCover_Offic_Dscr','tblCover_Office_Cd']] = ['00:00:00', '', '']
-
-        super().__init__(f460a)
-
-        self._dtypes = {
-            'Filer_ID': 'string',
-            'Filer_NamL': 'string',
-            'Report_Num': 'Int64',
-            'Committee_Type': 'string',
-            'Rpt_Date': 'string',
-            'From_Date': 'string',
-            'Thru_Date': 'string',
-            'Elect_Date': 'string',
-            'tblCover_Office_Cd': 'string',
-            'tblCover_Offic_Dscr': 'string',
-            'Rec_Type': 'string',
-            'Form_Type': 'string',
-            'Tran_ID': 'string',
-            'Entity_Cd': 'string',
-            'Tran_NamL': 'string',
-            'Tran_NamF': 'string',
-            'Tran_NamT': 'string',
-            'Tran_NamS': 'string',
-            'Tran_Adr1': 'string',
-            'Tran_Adr2': 'string',
-            'Tran_City': 'string',
-            'Tran_State': 'string',
-            'Tran_Zip4': 'string',
-            'Tran_Emp': 'string',
-            'Tran_Occ': 'string',
-            'Tran_Self': bool,
-            'Tran_Type': 'string',
-            'Tran_Date': 'string',
-            'Tran_Date1': 'string',
-            'Tran_Amt1': float,
-            'Tran_Amt2': float,
-            'Tran_Dscr': 'string',
-            'Cmte_ID': 'string',
-            'Tres_NamL': 'string',
-            'Tres_NamF': 'string',
-            'Tres_NamT': 'string',
-            'Tres_NamS': 'string',
-            'Tres_Adr1': 'string',
-            'Tres_Adr2': 'string',
-            'Tres_City': 'string',
-            'Tres_State': 'string',
-            'Tres_Zip': 'string',
-            'Intr_NamL': 'string',
-            'Intr_NamF': 'string',
-            'Intr_NamT': 'string',
-            'Intr_NamS': 'string',
-            'Intr_Adr1': 'string',
-            'Intr_Adr2': 'string',
-            'Intr_City': 'string',
-            'Intr_State': 'string',
-            'Intr_Zip4': 'string',
-            'Intr_Emp': 'string',
-            'Intr_Occ': 'string',
-            'Intr_Self': bool,
-            'Cand_NamL': 'string',
-            'Cand_NamF': 'string',
-            'Cand_NamT': 'string',
-            'Cand_NamS': 'string',
-            'tblDetlTran_Office_Cd': 'string',
-            'tblDetlTran_Offic_Dscr': 'string',
-            'Juris_Cd': 'string',
-            'Juris_Dscr': 'string',
-            'Dist_No': 'string',
-            'Off_S_H_Cd': 'string',
-            'Bal_Name': 'string',
-            'Bal_Num': 'string',
-            'Bal_Juris': 'string',
-            'Sup_Opp_Cd': 'string',
-            'Memo_Code': 'string',
-            'Memo_RefNo': 'string',
-            'BakRef_TID': 'string',
-            'XRef_SchNm': 'string',
-            'XRef_Match': 'string',
-            'Loan_Rate': 'string',
-            'Int_CmteId': 'Int64'
-        }
-        self._sql_dtypes = {
-            'Filer_ID': VARCHAR(9),
-            'Filer_NamL': VARCHAR(183),
-            'Report_Num': INTEGER,
-            'Committee_Type': VARCHAR(64),
-            'Rpt_Date': DATE,
-            'From_Date': DATE,
-            'Thru_Date': DATE,
-            'Elect_Date': DATE,
-            'tblCover_Office_Cd': VARCHAR(64),
-            'tblCover_Offic_Dscr': VARCHAR(64),
-            'Rec_Type': VARCHAR(4),
-            'Form_Type': TIME,
-            'Tran_ID': VARCHAR(12),
-            'Entity_Cd': VARCHAR(3),
-            'Tran_NamL': VARCHAR(199),
-            'Tran_NamF': VARCHAR(38),
-            'Tran_NamT': VARCHAR(6),
-            'Tran_NamS': VARCHAR(5),
-            'Tran_Adr1': VARCHAR(64),
-            'Tran_Adr2': VARCHAR(64),
-            'Tran_City': VARCHAR(50),
-            'Tran_State': VARCHAR(4),
-            'Tran_Zip4': VARCHAR(10),
-            'Tran_Emp': VARCHAR(92),
-            'Tran_Occ': VARCHAR(60),
-            'Tran_Self': BOOLEAN,
-            'Tran_Type': VARCHAR(4),
-            'Tran_Date': DATE,
-            'Tran_Date1': DATE,
-            'Tran_Amt1': DOUBLE_PRECISION,
-            'Tran_Amt2': DOUBLE_PRECISION,
-            'Tran_Dscr': VARCHAR(56),
-            'Cmte_ID': VARCHAR(9),
-            'Tres_NamL': VARCHAR(4),
-            'Tres_NamF': VARCHAR(4),
-            'Tres_NamT': VARCHAR(64),
-            'Tres_NamS': VARCHAR(64),
-            'Tres_Adr1': VARCHAR(64),
-            'Tres_Adr2': VARCHAR(64),
-            'Tres_City': VARCHAR(7),
-            'Tres_State': VARCHAR(4),
-            'Tres_Zip': INTEGER,
-            'Intr_NamL': VARCHAR(74),
-            'Intr_NamF': VARCHAR(6),
-            'Intr_NamT': VARCHAR(64),
-            'Intr_NamS': VARCHAR(64),
-            'Intr_Adr1': VARCHAR(64),
-            'Intr_Adr2': VARCHAR(64),
-            'Intr_City': VARCHAR(13),
-            'Intr_State': VARCHAR(4),
-            'Intr_Zip4': VARCHAR(10),
-            'Intr_Emp': VARCHAR(15),
-            'Intr_Occ': VARCHAR(8),
-            'Intr_Self': BOOLEAN,
-            'Cand_NamL': VARCHAR(64),
-            'Cand_NamF': VARCHAR(64),
-            'Cand_NamT': VARCHAR(64),
-            'Cand_NamS': VARCHAR(64),
-            'tblDetlTran_Office_Cd': VARCHAR(4),
-            'tblDetlTran_Offic_Dscr': VARCHAR(19),
-            'Juris_Cd': VARCHAR(4),
-            'Juris_Dscr': VARCHAR(64),
-            'Dist_No': VARCHAR(64),
-            'Off_S_H_Cd': VARCHAR(64),
-            'Bal_Name': VARCHAR(64),
-            'Bal_Num': VARCHAR(4),
-            'Bal_Juris': VARCHAR(64),
-            'Sup_Opp_Cd': VARCHAR(64),
-            'Memo_Code': VARCHAR(64),
-            'Memo_RefNo': VARCHAR(11),
-            'BakRef_TID': VARCHAR(64),
-            'XRef_SchNm': VARCHAR(64),
-            'XRef_Match': VARCHAR(64),
-            'Loan_Rate': VARCHAR(64),
-            'Int_CmteId': INTEGER
-        }
-        self._sql_cols = list(self._sql_dtypes.keys())
-        self._sql_table_name = 'A-Contributions'
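`ScheduleBase` lives in `model/schedule.py`, one of the 26 changed files but not shown on this page. Judging from the `super().__init__(self._form_id, transactions, filings, committees)` call and the join plan spelled out in main.py's comments, a sketch of what it plausibly does (the body below is an assumption, not the committed code):

```python
# Hypothetical sketch of model/schedule.py -- not the committed code
import polars as pl

from .base import BaseModel

class ScheduleBase(BaseModel):
    """ Shared logic for Form 460 schedule classes: filter by form id, then join """
    def __init__(self, form_id, transactions, filings, committees):
        # Mirror the join chain from main.py's comments:
        # transactions.filing_nid -> filings.filing_nid,
        # filings.filer_nid -> committees.filer_nid
        joined = (
            transactions.lazy
            .filter(pl.col('cal_tran_type') == form_id)  # e.g. 'F460A'
            .join(filings.lazy, on='filing_nid', how='inner')
            .join(committees.lazy, on='filer_nid', how='inner')
        )
        super().__init__(joined.collect().to_dicts())
```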

download/model/base.py

Lines changed: 14 additions & 18 deletions

@@ -1,12 +1,14 @@
 """ This is the base model, upon all others shall be based """
-import pandas as pd
+import polars as pl
 
 class BaseModel:
     """ Base model other models inherit from """
     def __init__(self, data):
         self._data = data
         self._df = None
+        self._lazy = None
         self._dtypes = []
+        self._pl_dtypes = []
         self._sql_dtypes = []
         self._sql_cols = []
         self._sql_table_name = ''
@@ -16,25 +18,19 @@ def data(self):
         """ Just return the data """
         return self._data
 
+    @property
+    def lazy(self):
+        ''' Return a Polars LazyFrame '''
+        if self._lazy is None:
+            self._lazy = pl.LazyFrame(self._data, schema=self._dtypes)
+
+        return self._lazy
+
     @property
     def df(self):
-        """ Get a dataframe of the data """
-        if self._df is None or self._df.empty:
-            self._df = pd.DataFrame(self._data).astype(self._dtypes)
+        ''' Return a Polars dataframe '''
+        if self._df is None:
+            self._df = self.lazy.collect()
 
         return self._df
 
-    def to_sql(self, connection, **kwargs):
-        """ Write to a postgresql table """
-        options = {
-            'index_label': 'id',
-            'if_exists': 'replace'
-        }
-        options.update(kwargs)
-
-        self.df[self._sql_cols].to_sql(
-            self._sql_table_name,
-            connection,
-            dtype=self._sql_dtypes,
-            **options
-        )
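With this change every model is lazy by default: `lazy` builds a cached `pl.LazyFrame` from the raw records and the subclass's schema, and `df` collects it once and caches the result. A small usage sketch, with a made-up subclass and schema:

```python
import polars as pl

from model.base import BaseModel

class Toy(BaseModel):
    """ Illustrative subclass: provide a schema, inherit lazy/df from BaseModel """
    def __init__(self, data):
        super().__init__(data)
        self._dtypes = {'name': pl.Utf8, 'total': pl.Float64}  # made-up schema

toy = Toy([{'name': 'Measure Q', 'total': 12.5}])
print(toy.lazy)  # just a query plan; nothing is computed yet
print(toy.df)    # first access collects and caches the DataFrame
```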
