diff --git a/.github/workflows/pull-request-checks.yaml b/.github/workflows/pull-request-checks.yaml index 04c83ecf09..cdc653cf41 100644 --- a/.github/workflows/pull-request-checks.yaml +++ b/.github/workflows/pull-request-checks.yaml @@ -54,14 +54,6 @@ jobs: if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/test-spark-integration-load-transactions-fabs-fpds.yaml - Run-Spark-Integration-Load-Transactions-Lookup-Tests: - name: Run Spark Integration Load Transactions Lookup Tests - needs: - - Run-Code-Style-Checks - - Build-Broker-Docker-Image - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/test-spark-integration-load-transactions-lookup.yaml - Run-Spark-Integration-Load-To-From-Delta-Tests: name: Run Spark Integration Load To From Delta Tests needs: diff --git a/.github/workflows/test-spark-integration-load-transactions-fabs-fpds.yaml b/.github/workflows/test-spark-integration-load-transactions-fabs-fpds.yaml index cfef1cb856..d99c9beb44 100644 --- a/.github/workflows/test-spark-integration-load-transactions-fabs-fpds.yaml +++ b/.github/workflows/test-spark-integration-load-transactions-fabs-fpds.yaml @@ -62,7 +62,7 @@ jobs: with: cov-report-name: 'spark-load-transactions-fabs-fpds-tests' include-glob: 'test_*.py *_test.py' - keyword: 'test_load_transactions_in_delta_fabs_fpds.py' + keyword: 'test_load_transactions.py' marker: 'spark' num-processes: 0 working-directory: ./usaspending-api diff --git a/.github/workflows/test-spark-integration-load-transactions-lookup.yaml b/.github/workflows/test-spark-integration-load-transactions-lookup.yaml deleted file mode 100644 index 329c37d612..0000000000 --- a/.github/workflows/test-spark-integration-load-transactions-lookup.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: Spark Integration Tests - test_load_transactions_in_delta_lookups.py - -env: - BROKER_DB_HOST: localhost - BROKER_DB_PORT: 5432 - BROKER_DB_USER: admin - BROKER_DB_PASSWORD: root - BROKER_DB_NAME: data_broker - 
DJANGO_SETTINGS_MODULE: usaspending_api.settings - ES_SCHEME: http - ES_HOST: localhost - ES_PORT: 9200 - MINIO_HOST: localhost - USASPENDING_DB_HOST: localhost - USASPENDING_DB_PORT: 5432 - USASPENDING_DB_USER: usaspending - USASPENDING_DB_PASSWORD: usaspender - USASPENDING_DB_NAME: data_store_api - IS_LOCAL: true - -on: - workflow_call: - -defaults: - run: - working-directory: ./usaspending-api - -jobs: - Setup-Broker-Branch: - uses: ./.github/workflows/determine-broker-branch.yaml - - Run: - name: Run - runs-on: ${{ vars.RUNNER_VERSION }} - steps: - - name: Checkout Source Repository - uses: actions/checkout@v4 - with: - path: usaspending-api - - - name: Checkout Broker Backend Repository - uses: actions/checkout@v4 - with: - repository: fedspendingtransparency/data-act-broker-backend - path: data-act-broker-backend - ref: ${{ needs.Setup-Broker-Branch.outputs.branch }} - - - name: Init Python Environment - uses: ./usaspending-api/.github/actions/init-python-environment - with: - working-directory: ./usaspending-api - - - name: Init Test Environment - uses: ./usaspending-api/.github/actions/init-test-environment - with: - is-integration-test: true - is-spark-test: true - working-directory: ./usaspending-api - - - name: Run Test Cases - uses: ./usaspending-api/.github/actions/run-pytest - with: - cov-report-name: 'spark-load-transactions-lookup-tests' - include-glob: 'test_*.py *_test.py' - keyword: 'test_load_transactions_in_delta_lookups.py' - marker: 'spark' - num-processes: 0 - working-directory: ./usaspending-api diff --git a/.github/workflows/test-spark-integration-other.yaml b/.github/workflows/test-spark-integration-other.yaml index 9545878495..364654c26c 100644 --- a/.github/workflows/test-spark-integration-other.yaml +++ b/.github/workflows/test-spark-integration-other.yaml @@ -62,6 +62,6 @@ jobs: with: cov-report-name: 'spark-other-tests' include-glob: 'test_*.py *_test.py' - keyword: '(not test_load_to_from_delta.py and not 
test_load_transactions_in_delta_lookups.py and not test_load_transactions_in_delta_fabs_fpds.py)' + keyword: '(not test_load_to_from_delta.py and not test_load_transactions.py)' marker: 'spark' working-directory: ./usaspending-api diff --git a/usaspending_api/awards/delta_models/awards.py b/usaspending_api/awards/delta_models/awards.py index 51c105c089..1b5119cf21 100644 --- a/usaspending_api/awards/delta_models/awards.py +++ b/usaspending_api/awards/delta_models/awards.py @@ -15,7 +15,7 @@ "fpds_parent_agency_id": "STRING", "funding_agency_id": "INTEGER", "generated_unique_award_id": "STRING NOT NULL", - "id": "LONG NOT NULL", + "id": "LONG", "is_fpds": "BOOLEAN NOT NULL", "last_modified_date": "DATE", "latest_transaction_id": "LONG", diff --git a/usaspending_api/common/helpers/spark_helpers.py b/usaspending_api/common/helpers/spark_helpers.py index 91421aa671..30a15b33c8 100644 --- a/usaspending_api/common/helpers/spark_helpers.py +++ b/usaspending_api/common/helpers/spark_helpers.py @@ -31,7 +31,10 @@ from usaspending_api.common.helpers.aws_helpers import is_aws, get_aws_credentials from usaspending_api.config import CONFIG from usaspending_api.config.utils import parse_pg_uri, parse_http_url -from usaspending_api.transactions.delta_models import DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS, PUBLISHED_FABS_COLUMNS +from usaspending_api.transactions.delta_models import ( + DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS, + PUBLISHED_FABS_DELTA_COLUMNS, +) from usaspending_api.transactions.delta_models.transaction_fabs import ( TRANSACTION_FABS_COLUMN_INFO, TRANSACTION_FABS_COLUMNS, @@ -575,7 +578,7 @@ def load_dict_to_delta_table(spark, s3_data_bucket, table_schema, table_name, da table_to_col_names_dict["awards"] = list(AWARDS_COLUMNS) table_to_col_names_dict["financial_accounts_by_awards"] = list(FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS) table_to_col_names_dict["detached_award_procurement"] = list(DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS) - 
table_to_col_names_dict["published_fabs"] = list(PUBLISHED_FABS_COLUMNS) + table_to_col_names_dict["published_fabs"] = list(PUBLISHED_FABS_DELTA_COLUMNS) table_to_col_info_dict = {} for tbl_name, col_info in zip( @@ -586,15 +589,16 @@ def load_dict_to_delta_table(spark, s3_data_bucket, table_schema, table_name, da table_to_col_info_dict[tbl_name][col.dest_name] = col # Make sure the table has been created first - call_command( - "create_delta_table", - "--destination-table", - table_name, - "--alt-db", - table_schema, - "--spark-s3-bucket", - s3_data_bucket, - ) + if not spark.catalog.tableExists(table_name, table_schema): + call_command( + "create_delta_table", + "--destination-table", + table_name, + "--alt-db", + table_schema, + "--spark-s3-bucket", + s3_data_bucket, + ) if data: insert_sql = f"INSERT {'OVERWRITE' if overwrite else 'INTO'} {table_schema}.{table_name} VALUES\n" diff --git a/usaspending_api/common/spark/configs.py b/usaspending_api/common/spark/configs.py index 4113a4afb3..d50fa94d21 100644 --- a/usaspending_api/common/spark/configs.py +++ b/usaspending_api/common/spark/configs.py @@ -41,9 +41,9 @@ # process is started from, even if started under the hood of a Py4J JavaGateway). With a "standalone" (not # YARN or Mesos or Kubernetes) cluster manager, only client mode is supported. "spark.submit.deployMode": "client", - # Default of 1g (1GiB) for Driver. Increase here if the Java process is crashing with memory errors - "spark.driver.memory": "1g", - "spark.executor.memory": "1g", + # Default of 4g (4GiB) for Driver. 
Increase here if the Java process is crashing with memory errors + "spark.driver.memory": "4g", + "spark.executor.memory": "4g", "spark.ui.enabled": "false", # Does the same as setting SPARK_TESTING=true env var "spark.jars.packages": ",".join(SPARK_SESSION_JARS), } diff --git a/usaspending_api/etl/management/commands/load_awards_in_delta.py b/usaspending_api/etl/management/commands/load_awards_in_delta.py new file mode 100644 index 0000000000..8602bd2488 --- /dev/null +++ b/usaspending_api/etl/management/commands/load_awards_in_delta.py @@ -0,0 +1,305 @@ +import logging +from contextlib import contextmanager +from datetime import datetime, timezone + +from django.core.management import BaseCommand +from pyspark.sql import SparkSession +from pyspark.sql.types import ArrayType, StringType + +from usaspending_api.awards.delta_models.awards import AWARDS_COLUMNS +from usaspending_api.broker.helpers.get_business_categories import ( + get_business_categories_fabs, + get_business_categories_fpds, +) +from usaspending_api.broker.helpers.last_load_date import ( + get_earliest_load_date, + update_last_load_date, +) +from usaspending_api.common.helpers.spark_helpers import ( + configure_spark_session, + get_active_spark_session, +) +from usaspending_api.config import CONFIG + + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = """ + This command reads transaction data from source / bronze tables in delta and creates the delta silver tables + for awards. 
+ """ + + spark_s3_bucket: str + spark: SparkSession + + def add_arguments(self, parser): + parser.add_argument( + "--spark-s3-bucket", + type=str, + required=False, + default=CONFIG.SPARK_S3_BUCKET, + help="The destination bucket in S3 for creating the tables.", + ) + + def handle(self, *args, **options): + with self.prepare_spark(): + self.spark_s3_bucket = options["spark_s3_bucket"] + + # Capture earliest last load date of the source tables to update the "last_load_date" after completion + next_last_load = get_earliest_load_date( + ("source_procurement_transaction", "source_assistance_transaction"), datetime.utcfromtimestamp(0) + ) + + # Do this check now to avoid uncaught errors later when running queries + # Use 'int' because that is what will be targeted for deletes/updates/etc. + table_exists = self.spark._jsparkSession.catalog().tableExists(f"int.awards") + if not table_exists: + raise Exception(f"Table: int.awards does not exist.") + + logger.info(f"Running delete SQL for awards ETL") + self.spark.sql(self.delete_records_sql()) + + logger.info(f"Running UPSERT SQL for awards ETL") + self.update_awards() + update_last_load_date("awards", next_last_load) + + @contextmanager + def prepare_spark(self): + extra_conf = { + # Config for additional packages needed + # "spark.jars.packages": "org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1", + # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore + "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", + "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", + # See comment below about old date and time values cannot parsed without these + "spark.sql.parquet.datetimeRebaseModeInWrite": "LEGACY", # for dates at/before 1900 + "spark.sql.parquet.int96RebaseModeInWrite": "LEGACY", # for timestamps at/before 1900 + "spark.sql.jsonGenerator.ignoreNullFields": "false", # keep nulls in our json + } + + # Create the Spark Session + self.spark = get_active_spark_session() + spark_created_by_command = False + if not self.spark: + spark_created_by_command = True + self.spark = configure_spark_session(**extra_conf, spark_context=self.spark) # type: SparkSession + + # Create UDFs for Business Categories + self.spark.udf.register( + name="get_business_categories_fabs", f=get_business_categories_fabs, returnType=ArrayType(StringType()) + ) + self.spark.udf.register( + name="get_business_categories_fpds", f=get_business_categories_fpds, returnType=ArrayType(StringType()) + ) + + yield # Going to wait for the Django command to complete then stop the spark session if needed + + if spark_created_by_command: + self.spark.stop() + + def delete_records_sql(self): + id_col = "generated_unique_award_id" + # TODO could do an outer join here to find awards that do not join to transaction fpds or transaction fabs + subquery = """ + SELECT awards.generated_unique_award_id AS id_to_remove + FROM int.awards + LEFT JOIN int.transaction_normalized on awards.transaction_unique_id = transaction_normalized.transaction_unique_id + WHERE awards.generated_unique_award_id IS NOT NULL AND transaction_normalized.transaction_unique_id IS NULL + """ + + sql = f""" + MERGE INTO int.awards + USING ( + {subquery} + ) AS deleted_records + ON awards.{id_col} = deleted_records.id_to_remove + WHEN MATCHED + THEN DELETE + """ + return sql + + def 
update_awards(self): + load_datetime = datetime.now(timezone.utc) + + set_insert_special_columns = ["total_subaward_amount", "create_date", "update_date"] + subquery_ignored_columns = set_insert_special_columns + ["id", "subaward_count"] + + # Use a UNION in award_ids_to_update, not UNION ALL because there could be duplicates among the award ids + # between the query parts or in int.award_ids_delete_modified. + subquery = f""" + WITH + transaction_earliest AS ( + SELECT * FROM ( + SELECT + tn.award_id AS id, + tn.unique_award_key, + tn.id AS earliest_transaction_id, + tn.action_date AS date_signed, + tn.description, + tn.period_of_performance_start_date, + ROW_NUMBER() OVER ( + PARTITION BY tn.unique_award_key + /* NOTE: In Postgres, the default sorting order sorts NULLs as larger than all other values. + However, in Spark, the default sorting order sorts NULLs as smaller than all other + values. In the Postgres transaction loader the default sorting behavior was used, so to + be consistent with the behavior of the previous loader, we need to reverse the default + Spark NULL sorting behavior for any field that can be NULL. 
*/ + ORDER BY tn.unique_award_key, tn.action_date ASC NULLS LAST, tn.modification_number ASC NULLS LAST, + tn.transaction_unique_id ASC + ) AS rank + FROM int.transaction_normalized AS tn + ) + WHERE rank = 1 + ), + transaction_latest AS ( + SELECT * FROM ( + SELECT + -- General update columns (id at top, rest alphabetically by alias/name) + tn.award_id AS id, + tn.unique_award_key, + tn.awarding_agency_id, + CASE + WHEN tn.type IN ('A', 'B', 'C', 'D') THEN 'contract' + WHEN tn.type IN ('02', '03', '04', '05') THEN 'grant' + WHEN tn.type IN ('06', '10') THEN 'direct payment' + WHEN tn.type IN ('07', '08') THEN 'loans' + WHEN tn.type = '09' THEN 'insurance' + WHEN tn.type = '11' THEN 'other' + WHEN tn.type LIKE 'IDV%%' THEN 'idv' + ELSE NULL + END AS category, + tn.action_date AS certified_date, + CASE + WHEN month(tn.action_date) > 9 THEN year(tn.action_date) + 1 + ELSE year(tn.action_date) + END AS fiscal_year, + tn.funding_agency_id, + tn.unique_award_key AS generated_unique_award_id, + tn.is_fpds, + tn.last_modified_date, + tn.id AS latest_transaction_id, + tn.period_of_performance_current_end_date, + tn.transaction_unique_id, + tn.type, + tn.type_description, + -- FPDS Columns + fpds.agency_id AS fpds_agency_id, + fpds.referenced_idv_agency_iden AS fpds_parent_agency_id, + fpds.parent_award_id AS parent_award_piid, + fpds.piid, + -- FABS Columns + fabs.fain, + fabs.uri, + -- Other + 'DBR' AS data_source, + -- Windowing Function + ROW_NUMBER() OVER ( + PARTITION BY tn.unique_award_key + -- See note in transaction_earliest about NULL ordering. 
+ ORDER BY tn.unique_award_key, tn.action_date DESC NULLS FIRST, + tn.modification_number DESC NULLS FIRST, tn.transaction_unique_id DESC + ) as rank + FROM int.transaction_normalized AS tn + LEFT JOIN int.transaction_fpds AS fpds ON fpds.detached_award_proc_unique = tn.transaction_unique_id + LEFT JOIN int.transaction_fabs AS fabs ON fabs.afa_generated_unique = tn.transaction_unique_id + ) + WHERE rank = 1 + ), + -- For executive compensation information, we want the latest transaction for each award + -- for which there is at least an officer_1_name. + transaction_ec AS ( + SELECT * FROM ( + SELECT + tn.award_id AS id, + tn.unique_award_key, + COALESCE(fpds.officer_1_amount, fabs.officer_1_amount) AS officer_1_amount, + COALESCE(fpds.officer_1_name, fabs.officer_1_name) AS officer_1_name, + COALESCE(fpds.officer_2_amount, fabs.officer_2_amount) AS officer_2_amount, + COALESCE(fpds.officer_2_name, fabs.officer_2_name) AS officer_2_name, + COALESCE(fpds.officer_3_amount, fabs.officer_3_amount) AS officer_3_amount, + COALESCE(fpds.officer_3_name, fabs.officer_3_name) AS officer_3_name, + COALESCE(fpds.officer_4_amount, fabs.officer_4_amount) AS officer_4_amount, + COALESCE(fpds.officer_4_name, fabs.officer_4_name) AS officer_4_name, + COALESCE(fpds.officer_5_amount, fabs.officer_5_amount) AS officer_5_amount, + COALESCE(fpds.officer_5_name, fabs.officer_5_name) AS officer_5_name, + ROW_NUMBER() OVER ( + PARTITION BY tn.unique_award_key + -- See note in transaction_earliest about NULL ordering. 
+ ORDER BY tn.unique_award_key, tn.action_date DESC NULLS FIRST, + tn.modification_number DESC NULLS FIRST, tn.transaction_unique_id DESC + ) as rank + FROM int.transaction_normalized AS tn + LEFT JOIN int.transaction_fpds AS fpds ON fpds.detached_award_proc_unique = tn.transaction_unique_id + LEFT JOIN int.transaction_fabs AS fabs ON fabs.afa_generated_unique = tn.transaction_unique_id + WHERE fpds.officer_1_name IS NOT NULL OR fabs.officer_1_name IS NOT NULL + ) + WHERE rank = 1 + ), + transaction_totals AS ( + SELECT + -- Transaction Normalized Fields + tn.unique_award_key, + SUM(tn.federal_action_obligation) AS total_obligation, + SUM(tn.original_loan_subsidy_cost) AS total_subsidy_cost, + SUM(tn.funding_amount) AS total_funding_amount, + SUM(tn.face_value_loan_guarantee) AS total_loan_value, + SUM(tn.non_federal_funding_amount) AS non_federal_funding_amount, + SUM(tn.indirect_federal_sharing) AS total_indirect_federal_sharing, + -- Transaction FPDS Fields + SUM(CAST(fpds.base_and_all_options_value AS NUMERIC(23, 2))) AS base_and_all_options_value, + SUM(CAST(fpds.base_exercised_options_val AS NUMERIC(23, 2))) AS base_exercised_options_val, + COUNT(tn.transaction_unique_id) AS transaction_count + FROM int.transaction_normalized AS tn + LEFT JOIN int.transaction_fpds AS fpds ON tn.transaction_unique_id = fpds.detached_award_proc_unique + GROUP BY tn.unique_award_key + ) + SELECT + latest.id, + latest.unique_award_key, + 0 AS subaward_count, -- for consistency with Postgres table + {", ".join([col_name for col_name in AWARDS_COLUMNS if col_name not in subquery_ignored_columns])} + FROM transaction_latest AS latest + INNER JOIN transaction_earliest AS earliest ON latest.unique_award_key = earliest.unique_award_key + INNER JOIN transaction_totals AS totals on latest.unique_award_key = totals.unique_award_key + -- Not every award will have a record in transaction_ec, so need to do a LEFT JOIN on it. 
+ LEFT JOIN transaction_ec AS ec ON latest.unique_award_key = ec.unique_award_key + """ + + # On set, create_date will not be changed and update_date will be set below. The subaward columns will not + # be changed, and id is used to match. All other column values will come from the subquery. + set_cols = [ + f"int.awards.{col_name} = source_subquery.{col_name}" + for col_name in AWARDS_COLUMNS + if col_name not in set_insert_special_columns + ] + set_cols.append(f"""int.awards.update_date = '{load_datetime.isoformat(" ")}'""") + + # Move insert_special_columns to the end of the list of column names for ease of handling + # during record insert + insert_col_name_list = [col_name for col_name in AWARDS_COLUMNS if col_name not in set_insert_special_columns] + insert_col_name_list.extend(set_insert_special_columns) + insert_col_names = ", ".join([col_name for col_name in insert_col_name_list]) + + # On insert, all values except for those in insert_special_columns will come from the subquery + insert_value_list = insert_col_name_list[: -len(set_insert_special_columns)] + insert_value_list.extend(["NULL"]) + insert_value_list.extend([f"""'{load_datetime.isoformat(" ")}'"""] * 2) + insert_values = ", ".join([value for value in insert_value_list]) + + sql = f""" + MERGE INTO int.awards + USING ( + {subquery} + ) AS source_subquery + ON awards.id = source_subquery.id + WHEN MATCHED + THEN UPDATE SET + {", ".join(set_cols)} + WHEN NOT MATCHED + THEN INSERT + ({insert_col_names}) + VALUES ({insert_values}) + """ + self.spark.sql(sql) diff --git a/usaspending_api/etl/management/commands/load_table_to_delta.py b/usaspending_api/etl/management/commands/load_table_to_delta.py index dd031c7115..efe4bdd332 100644 --- a/usaspending_api/etl/management/commands/load_table_to_delta.py +++ b/usaspending_api/etl/management/commands/load_table_to_delta.py @@ -1,6 +1,7 @@ import logging from django.core.management import BaseCommand +from pyspark.sql import functions as sf from 
usaspending_api.awards.delta_models import ( AWARDS_COLUMNS, @@ -40,7 +41,7 @@ transaction_normalized_sql_string, TRANSACTION_SEARCH_POSTGRES_COLUMNS, transaction_search_create_sql_string, - PUBLISHED_FABS_COLUMNS, + PUBLISHED_FABS_DELTA_COLUMNS, published_fabs_create_sql_string, ) from usaspending_api.transactions.models import SourceAssistanceTransaction @@ -75,6 +76,7 @@ "custom_schema": "", "column_names": list(AWARDS_COLUMNS), "tsvectors": None, + "add_hash_field": False, }, "detached_award_procurement": { "model": SourceProcurementTransaction, @@ -92,6 +94,7 @@ "custom_schema": "", "column_names": list(DETACHED_AWARD_PROCUREMENT_DELTA_COLUMNS), "tsvectors": None, + "add_hash_field": True, }, "financial_accounts_by_awards": { "model": FinancialAccountsByAwards, @@ -109,6 +112,7 @@ "custom_schema": "award_id LONG", "column_names": list(FINANCIAL_ACCOUNTS_BY_AWARDS_COLUMNS), "tsvectors": None, + "add_hash_field": False, }, "transaction_fabs": { "model": TransactionFABS, @@ -126,6 +130,7 @@ "custom_schema": "", "column_names": TRANSACTION_FABS_VIEW_COLUMNS, "tsvectors": None, + "add_hash_field": True, }, "published_fabs": { "model": SourceAssistanceTransaction, @@ -141,8 +146,9 @@ "delta_table_create_sql": published_fabs_create_sql_string, "source_schema": None, "custom_schema": "", - "column_names": list(PUBLISHED_FABS_COLUMNS), + "column_names": list(PUBLISHED_FABS_DELTA_COLUMNS), "tsvectors": None, + "add_hash_field": True, }, "transaction_fpds": { "model": TransactionFPDS, @@ -160,6 +166,7 @@ "custom_schema": "", "column_names": TRANSACTION_FPDS_VIEW_COLUMNS, "tsvectors": None, + "add_hash_field": True, }, "transaction_normalized": { "model": TransactionNormalized, @@ -177,6 +184,7 @@ "custom_schema": "", "column_names": list(TRANSACTION_NORMALIZED_COLUMNS), "tsvectors": None, + "add_hash_field": True, }, # Tables loaded in from the Broker "subaward": { @@ -195,6 +203,7 @@ "custom_schema": "", "column_names": list(BROKER_SUBAWARDS_COLUMNS), "tsvectors": None, 
+ "add_hash_field": False, }, "zips": { "model": None, @@ -212,6 +221,7 @@ "custom_schema": "", "column_names": list(ZIPS_COLUMNS), "tsvectors": None, + "add_hash_field": False, }, # Additional definitions for use in testing; # These are copies of Views / Materialized Views / Tables from Postgres to Spark to aid in @@ -233,6 +243,7 @@ "STRING, federal_accounts STRING, cfdas ARRAY, tas_components ARRAY", "column_names": list(AWARD_SEARCH_COLUMNS), "tsvectors": None, + "add_hash_field": False, }, "recipient_lookup_testing": { "model": RecipientLookup, @@ -250,6 +261,7 @@ "custom_schema": "recipient_hash STRING", "column_names": list(RECIPIENT_LOOKUP_COLUMNS), "tsvectors": None, + "add_hash_field": False, }, "recipient_profile_testing": { "model": RecipientProfile, @@ -267,6 +279,7 @@ "custom_schema": "recipient_hash STRING", "column_names": list(RECIPIENT_PROFILE_DELTA_COLUMNS), "tsvectors": None, + "add_hash_field": False, }, "sam_recipient_testing": { "model": DUNS, @@ -284,6 +297,7 @@ "custom_schema": "broker_duns_id STRING, business_types_codes ARRAY", "column_names": list(SAM_RECIPIENT_COLUMNS), "tsvectors": None, + "add_hash_field": False, }, "transaction_search_testing": { "model": TransactionSearch, @@ -301,6 +315,7 @@ "custom_schema": "recipient_hash STRING, federal_accounts STRING, parent_recipient_hash STRING", "column_names": list(TRANSACTION_SEARCH_POSTGRES_COLUMNS), "tsvectors": None, + "add_hash_field": False, }, } @@ -366,6 +381,7 @@ def handle(self, *args, **options): partition_column_type = table_spec["partition_column_type"] is_partition_column_unique = table_spec["is_partition_column_unique"] custom_schema = table_spec["custom_schema"] + add_hash_field = table_spec["add_hash_field"] # Set the database that will be interacted with for all Delta Lake table Spark-based activity logger.info(f"Using Spark Database: {destination_database}") @@ -414,6 +430,9 @@ def handle(self, *args, **options): properties=get_jdbc_connection_properties(), ) + if 
add_hash_field: + df = df.withColumn("hash", sf.xxhash64("*")) + # Make sure that the column order defined in the Delta table schema matches # that of the Spark dataframe used to pull from the Postgres table. While not # always needed, this should help to prevent any future mismatch between the two. diff --git a/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py b/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py new file mode 100644 index 0000000000..9dbb3d64ae --- /dev/null +++ b/usaspending_api/etl/management/commands/load_transaction_fabs_in_delta.py @@ -0,0 +1,32 @@ +import logging + +from django.core.management import BaseCommand +from usaspending_api.config import CONFIG +from usaspending_api.etl.transaction_delta_loaders.context_managers import prepare_spark +from usaspending_api.etl.transaction_delta_loaders.loaders import FABSDeltaTransactionLoader + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = """ + This command reads transaction data from source / bronze tables in delta and creates the delta silver tables. 
+ """ + + spark_s3_bucket: str + + @staticmethod + def add_arguments(parser): + parser.add_argument( + "--spark-s3-bucket", + type=str, + required=False, + default=CONFIG.SPARK_S3_BUCKET, + help="The destination bucket in S3 for creating the tables.", + ) + + @staticmethod + def handle(*args, **options): + with prepare_spark() as spark: + loader = FABSDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) + loader.load_transactions() diff --git a/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py b/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py new file mode 100644 index 0000000000..783ab20ca1 --- /dev/null +++ b/usaspending_api/etl/management/commands/load_transaction_fpds_in_delta.py @@ -0,0 +1,32 @@ +import logging + +from django.core.management import BaseCommand + +from usaspending_api.config import CONFIG +from usaspending_api.etl.transaction_delta_loaders.context_managers import prepare_spark +from usaspending_api.etl.transaction_delta_loaders.loaders import FPDSDeltaTransactionLoader + + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = """ + This command reads transaction fpds data from source / bronze tables in delta and creates the delta silver tables. 
+ """ + + @staticmethod + def add_arguments(parser): + parser.add_argument( + "--spark-s3-bucket", + type=str, + required=False, + default=CONFIG.SPARK_S3_BUCKET, + help="The destination bucket in S3 for creating the tables.", + ) + + @staticmethod + def handle(*args, **options): + with prepare_spark() as spark: + loader = FPDSDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) + loader.load_transactions() diff --git a/usaspending_api/etl/management/commands/load_transaction_normalized.py b/usaspending_api/etl/management/commands/load_transaction_normalized.py new file mode 100644 index 0000000000..dd89e851af --- /dev/null +++ b/usaspending_api/etl/management/commands/load_transaction_normalized.py @@ -0,0 +1,36 @@ +import logging + +from django.core.management import BaseCommand + +from usaspending_api.config import CONFIG +from usaspending_api.etl.transaction_delta_loaders.context_managers import prepare_spark +from usaspending_api.etl.transaction_delta_loaders.loaders import ( + FABSNormalizedDeltaTransactionLoader, + FPDSNormalizedDeltaTransactionLoader, +) + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = """ + This command reads transaction data from source / bronze tables in delta and creates the delta silver tables. 
+ """ + + @staticmethod + def add_arguments(parser): + parser.add_argument( + "--spark-s3-bucket", + type=str, + required=False, + default=CONFIG.SPARK_S3_BUCKET, + help="The destination bucket in S3 for creating the tables.", + ) + + @staticmethod + def handle(*args, **options): + with prepare_spark() as spark: + fabs_loader = FABSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) + fpds_loader = FPDSNormalizedDeltaTransactionLoader(spark=spark, spark_s3_bucket=options["spark_s3_bucket"]) + fabs_loader.load_transactions() + fpds_loader.load_transactions() diff --git a/usaspending_api/etl/management/commands/load_transactions_in_delta.md b/usaspending_api/etl/management/commands/load_transactions_in_delta.md deleted file mode 100644 index 07720c9a0b..0000000000 --- a/usaspending_api/etl/management/commands/load_transactions_in_delta.md +++ /dev/null @@ -1,62 +0,0 @@ -# Overview -This command is used to initialize and update Transaction related delta tables. -## Key Tables being updated -### `int.transaction_id_lookup` -This table maps raw transaction data (`raw.published_fabs` and `raw.detached_award_procurement`) to intermediary transaction data (`int.transaction_fabs`, `int.transaction_fpds`, and `int.transaction_normalized`). One entry exists in this table for each transaction represented by a `int.transaction_normalized` record and **either** a `int.transaction_fabs` or `int.transaction_fpds` record. - -#### Fields -- `transaction_id` - Generated by this loader. -- `transaction_unique_id` - This corresponds to `afa_generated_unique` for fabs transactions and `detached_award_proc_unique` for fpds transactions. -- `published_fabs_id` - Unique identifier for `transaction_fabs` data. -- `detached_award_procurement_id` - Unique identifier for `transaction_fpds` data. - -### `int.award_id_lookup` -This table maps award data to transaction data. 
Because an award can have many transactions, a single award may appear multiple times in this table. A record exists for each transaction, which maps it to its award. - -#### Fields -- `award_id` - Generated by this loader. This is an id unique to an award, so this column allows for duplicates. -- `transaction_unique_id` - -- `published_fabs_id` - Unique identifier for `transaction_fabs` data -- `detached_award_procurement_id` - Unique identifier for `transaction_fpds` data -- `generated_unique_award_id` - This value comes from the `unique_award_key` from incoming raw transactions - -### `int.transaction_fabs` -Represents FABS transactions. Has one corresponding `int.transaction_normalized` record. -### `int.transaction_fpds` -Represents FPDS transactions. Has one corresponding `int.transaction_normalized` record. -### `int.transaction_normalized` -Represents data elements of a transaction not specific to FABS or FPDS. This has either a corresponding `int.transaction_fabs` or `int.transaction_fpds` record. -### `int.awards` -Represents data corresponding to an Award, which comprises of one or more Transactions. Transactions are grouped together as an award by their `unique_award_key`. Fields on awards come from either: -- The most recent Transaction for the Award -- The earliest Transaction for the Award -- Aggregate values of all Transactions for the Award - -# Usage - -This command will perform different actions depending on the argument provided with the `--etl-level` flag. - -## Initial Run -Before this script can be used routinely, it must first be run with the `--etl-level` flag set to `initial_run`. This will perform a few setup actions. - -1. Create the the `transaction_id_lookup` table. -2. Use the existing `raw.transaction_normalized` table to prepopulate the `transaction_id_lookup` table. This will ensure that existing `transaction_id`s created by our system will be retained. If the `raw.transaction_normalized` table doesn't exist, this skep is skipped. 
-3. Set the `transaction_id_seq` sequence to the maximum `id` in the `transaction_id_lookup` table. If there are no records in the table, the sequence will be set to 1. -4. Create the `award_id_lookup` table. -5. Use the existing `raw.awards` table to prepopulate the `award_id_lookup` table. This will ensure any existing `id`s created by our system will be retained. If the `raw.awards` table does not exist, this step is skipped. -6. Set the `award_id_seq` to the maximum `id` value in the `award_id_lookup` table. If there are no records in the table, the sequence will be set to 1. -7. Create each of the following intermediary tables and attempt to backfill them with data from their raw counterpart. If the `--no-initial-copy` flag is provided, the backfill will be skipped. - - `int.transaction_fabs` - - `int.transaction_fpds` - - `int.transaction_normalized` - -### Note on `--no-initial-copy` -This flag should not be used in production. The intermediary tables should be backfilled during the initial run so that update and create dates are preserved from the old, raw versions of the tables. -This flag may be used during testing to compare the data from the raw tables to the intermediary tables. - -## Daily Usage - -During the nightly pipeline, this command should be run multiple times using different `--etl-level` flag arguments. Below is a description of the steps required: -1. `transaction_id_lookup` and `award_id_lookup` can be run in parallel. These will update the lookup tables with the latest data from the raw tables: `raw.published_fabs` and `raw.detached_award_procurement`. -2. `transaction_fabs`, `transaction_fpds`, and `transaction_normalized` can be run in parallel. -3. `awards` must be run by itself. 
\ No newline at end of file diff --git a/usaspending_api/etl/management/commands/load_transactions_in_delta.py b/usaspending_api/etl/management/commands/load_transactions_in_delta.py deleted file mode 100644 index b7c90a00f5..0000000000 --- a/usaspending_api/etl/management/commands/load_transactions_in_delta.py +++ /dev/null @@ -1,1575 +0,0 @@ -import copy -import logging -import re -from contextlib import contextmanager -from datetime import datetime, timezone - -from django.core.management import BaseCommand, call_command -from django.db import connection -from pyspark.sql import SparkSession -from pyspark.sql.types import ArrayType, StringType -from pyspark.sql.utils import AnalysisException - -from usaspending_api.awards.delta_models.awards import AWARDS_COLUMNS -from usaspending_api.broker.helpers.build_business_categories_boolean_dict import fpds_boolean_columns -from usaspending_api.broker.helpers.get_business_categories import ( - get_business_categories_fabs, - get_business_categories_fpds, -) -from usaspending_api.broker.helpers.last_load_date import ( - get_earliest_load_date, - get_last_load_date, - update_last_load_date, -) -from usaspending_api.common.data_classes import TransactionColumn -from usaspending_api.common.etl.spark import create_ref_temp_views -from usaspending_api.common.helpers.spark_helpers import ( - configure_spark_session, - get_active_spark_session, -) -from usaspending_api.config import CONFIG -from usaspending_api.transactions.delta_models.transaction_fabs import ( - FABS_TO_NORMALIZED_COLUMN_INFO, - TRANSACTION_FABS_COLUMN_INFO, - TRANSACTION_FABS_COLUMNS, -) -from usaspending_api.transactions.delta_models.transaction_fpds import ( - DAP_TO_NORMALIZED_COLUMN_INFO, - TRANSACTION_FPDS_COLUMN_INFO, - TRANSACTION_FPDS_COLUMNS, -) -from usaspending_api.transactions.delta_models.transaction_normalized import TRANSACTION_NORMALIZED_COLUMNS - -logger = logging.getLogger(__name__) - - -class Command(BaseCommand): - help = """ - This 
command reads transaction data from source / bronze tables in delta and creates the delta silver tables - specified via the "etl_level" argument. Each "etl_level" uses an exclusive value for "last_load_date" from the - "external_data_load_date" table in Postgres to determine the subset of transactions to load. For a full - pipeline run the "award_id_lookup" and "transaction_id_lookup" levels should be run first in order to populate the - lookup tables. These lookup tables are used to keep track of PK values across the different silver tables. - - *****NOTE*****: Before running this command for the first time on a usual basis, it should be run with the - "etl_level" set to "initial_run" to set up the needed lookup tables and populate the needed sequences and - "last_load_date" values for the lookup tables. - """ - - etl_level: str - last_etl_date: str - spark_s3_bucket: str - no_initial_copy: bool - spark: SparkSession - # See comments in delete_records_sql, transaction_id_lookup ETL level, for more info about logic in the - # query below. 
- award_id_lookup_delete_subquery: str = """ - -- Adding CTEs to pre-filter award_id_lookup table for significant speedups when joining - WITH - aidlu_fpds AS ( - SELECT * FROM int.award_id_lookup - WHERE is_fpds = TRUE - ), - aidlu_fabs AS ( - SELECT * FROM int.award_id_lookup - WHERE is_fpds = FALSE - ) - SELECT aidlu.transaction_unique_id AS id_to_remove - FROM aidlu_fpds AS aidlu LEFT JOIN raw.detached_award_procurement AS dap ON ( - aidlu.transaction_unique_id = ucase(dap.detached_award_proc_unique) - ) - WHERE dap.detached_award_proc_unique IS NULL - UNION ALL - SELECT aidlu.transaction_unique_id AS id_to_remove - FROM aidlu_fabs AS aidlu LEFT JOIN raw.published_fabs AS pfabs ON ( - aidlu.transaction_unique_id = ucase(pfabs.afa_generated_unique) - ) - WHERE pfabs.afa_generated_unique IS NULL - """ - - def add_arguments(self, parser): - parser.add_argument( - "--etl-level", - type=str, - required=True, - help="The silver delta table that should be updated from the bronze delta data.", - choices=[ - "award_id_lookup", - "awards", - "initial_run", - "transaction_fabs", - "transaction_fpds", - "transaction_id_lookup", - "transaction_normalized", - ], - ) - parser.add_argument( - "--spark-s3-bucket", - type=str, - required=False, - default=CONFIG.SPARK_S3_BUCKET, - help="The destination bucket in S3 for creating the tables.", - ) - parser.add_argument( - "--no-initial-copy", - action="store_true", - required=False, - help="Whether to skip copying tables from the 'raw' database to the 'int' database during initial_run.", - ) - - def handle(self, *args, **options): - with self.prepare_spark(): - self.etl_level = options["etl_level"] - self.spark_s3_bucket = options["spark_s3_bucket"] - self.no_initial_copy = options["no_initial_copy"] - - # Capture earliest last load date of the source tables to update the "last_load_date" after completion - next_last_load = get_earliest_load_date( - ("source_procurement_transaction", "source_assistance_transaction"), 
datetime.utcfromtimestamp(0) - ) - - if self.etl_level == "initial_run": - logger.info("Running initial setup") - self.initial_run(next_last_load) - return - - # Do this check now to avoid uncaught errors later when running queries - # Use 'int' because that is what will be targeted for deletes/updates/etc. - table_exists = self.spark._jsparkSession.catalog().tableExists(f"int.{self.etl_level}") - if not table_exists: - raise Exception(f"Table: int.{self.etl_level} does not exist.") - - if self.etl_level == "award_id_lookup": - logger.info(f"Running pre-delete SQL for '{self.etl_level}' ETL") - possibly_modified_award_ids = self.award_id_lookup_pre_delete() - - logger.info(f"Running delete SQL for '{self.etl_level}' ETL") - self.spark.sql(self.delete_records_sql()) - - if self.etl_level == "award_id_lookup": - logger.info(f"Running post-delete SQL for '{self.etl_level}' ETL") - self.award_id_lookup_post_delete(possibly_modified_award_ids) - - last_etl_date = get_last_load_date(self.etl_level) - if last_etl_date is None: - # Table has not been loaded yet. To avoid checking for None in all the locations where - # last_etl_date is used, set it to a long time ago. 
- last_etl_date = datetime.utcfromtimestamp(0) - self.last_etl_date = str(last_etl_date) - - logger.info(f"Running UPSERT SQL for '{self.etl_level}' ETL") - if self.etl_level == "transaction_id_lookup": - self.update_transaction_lookup_ids() - elif self.etl_level == "award_id_lookup": - self.update_award_lookup_ids() - elif self.etl_level in ("transaction_fabs", "transaction_fpds"): - self.spark.sql(self.transaction_fabs_fpds_merge_into_sql()) - elif self.etl_level == "transaction_normalized": - create_ref_temp_views(self.spark) - self.spark.sql(self.transaction_normalized_merge_into_sql("fabs")) - self.spark.sql(self.transaction_normalized_merge_into_sql("fpds")) - elif self.etl_level == "awards": - self.update_awards() - - update_last_load_date(self.etl_level, next_last_load) - - @contextmanager - def prepare_spark(self): - extra_conf = { - # Config for additional packages needed - # "spark.jars.packages": "org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1", - # Config for Delta Lake tables and SQL. 
Need these to keep Dela table metadata in the metastore - "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", - "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", - # See comment below about old date and time values cannot parsed without these - "spark.sql.parquet.datetimeRebaseModeInWrite": "LEGACY", # for dates at/before 1900 - "spark.sql.parquet.int96RebaseModeInWrite": "LEGACY", # for timestamps at/before 1900 - "spark.sql.jsonGenerator.ignoreNullFields": "false", # keep nulls in our json - } - - # Create the Spark Session - self.spark = get_active_spark_session() - spark_created_by_command = False - if not self.spark: - spark_created_by_command = True - self.spark = configure_spark_session(**extra_conf, spark_context=self.spark) # type: SparkSession - - # Create UDFs for Business Categories - self.spark.udf.register( - name="get_business_categories_fabs", f=get_business_categories_fabs, returnType=ArrayType(StringType()) - ) - self.spark.udf.register( - name="get_business_categories_fpds", f=get_business_categories_fpds, returnType=ArrayType(StringType()) - ) - - yield # Going to wait for the Django command to complete then stop the spark session if needed - - if spark_created_by_command: - self.spark.stop() - - def award_id_lookup_pre_delete(self): - """ - Return a list of the award ids corresponding to the transaction_unique_ids that are about to be deleted. - """ - sql = f""" - WITH txns_to_delete AS ( - {self.award_id_lookup_delete_subquery} - ) - SELECT DISTINCT(award_id) AS award_id - FROM int.award_id_lookup AS aidlu INNER JOIN txns_to_delete AS to_del ON ( - aidlu.transaction_unique_id = to_del.id_to_remove - ) - """ - - # TODO: The values returned here are put into a list in an 'IN' clause in award_id_lookup_post_delete. - # However, there is a limit on the number of values one can manually put into an 'IN' clause (i.e., not - # returned by a SELECT subquery inside the 'IN'). 
Thus, this code should return a dataframe directly, - # create a temporary view from the dataframe in award_id_lookup_post_delete, and use that temporary - # view to either do a subquery in the 'IN' clause or to JOIN against. - possibly_modified_award_ids = [str(row["award_id"]) for row in self.spark.sql(sql).collect()] - return possibly_modified_award_ids - - def delete_records_sql(self): - if self.etl_level == "transaction_id_lookup": - id_col = "transaction_id" - subquery = """ - -- Adding CTEs to pre-filter transaction_id_lookup table for significant speedups when joining - WITH - tidlu_fpds AS ( - SELECT * FROM int.transaction_id_lookup - WHERE is_fpds = TRUE - ), - tidlu_fabs AS ( - SELECT * FROM int.transaction_id_lookup - WHERE is_fpds = FALSE - ) - SELECT transaction_id AS id_to_remove - /* Joining on tidlu.transaction_unique_id = ucase(dap.detached_award_proc_unique) for consistency with - fabs records, even though fpds records *shouldn't* update the same way as fabs records might (see - comment below). - Using fpds pre-filtered table to avoid having this part of the query think that everything - in transaction_id_lookup that corresponds to a fabs transaction needs to be deleted. */ - FROM tidlu_fpds AS tidlu LEFT JOIN raw.detached_award_procurement AS dap ON ( - tidlu.transaction_unique_id = ucase(dap.detached_award_proc_unique) - ) - WHERE dap.detached_award_proc_unique IS NULL - UNION ALL - SELECT transaction_id AS id_to_remove - /* Need to join on tidlu.transaction_unique_id = ucase(pfabs.afa_generated_unique) rather than on - tidlu.published_fabs_id = pfabs.published_fabs_id because a newer record with a different - published_fabs_id could come in with the same afa_generated_unique as a prior record, as an update - to the transaction. When this happens, the older record should also be deleted from the - raw.published_fabs table, but we don't actually want to delete the record in the lookup table - because that transaction is still valid. 
- Same logic as above as to why we are using the fabs pre-filtered table to avoid - deleting all of the fpds records. */ - FROM tidlu_fabs AS tidlu LEFT JOIN raw.published_fabs AS pfabs ON ( - tidlu.transaction_unique_id = ucase(pfabs.afa_generated_unique) - ) - WHERE pfabs.afa_generated_unique IS NULL - """ - elif self.etl_level == "award_id_lookup": - id_col = "transaction_unique_id" - subquery = self.award_id_lookup_delete_subquery - elif self.etl_level in ("transaction_fabs", "transaction_fpds", "transaction_normalized"): - id_col = "id" if self.etl_level == "transaction_normalized" else "transaction_id" - subquery = f""" - SELECT {self.etl_level}.{id_col} AS id_to_remove - FROM int.{self.etl_level} LEFT JOIN int.transaction_id_lookup ON ( - {self.etl_level}.{id_col} = transaction_id_lookup.transaction_id - ) - WHERE {self.etl_level}.{id_col} IS NOT NULL AND transaction_id_lookup.transaction_id IS NULL - """ - elif self.etl_level == "awards": - id_col = "id" - subquery = """ - SELECT awards.id AS id_to_remove - FROM int.awards LEFT JOIN int.award_id_lookup ON awards.id = award_id_lookup.award_id - WHERE awards.id IS NOT NULL AND award_id_lookup.award_id IS NULL - """ - - sql = f""" - MERGE INTO int.{self.etl_level} - USING ( - {subquery} - ) AS deleted_records - ON {self.etl_level}.{id_col} = deleted_records.id_to_remove - WHEN MATCHED - THEN DELETE - """ - - return sql - - def award_id_lookup_post_delete(self, possibly_modified_award_ids): - """ - Now that deletion from the award_id_lookup table is done, we need to figure out which awards in - possibly_modified_award_ids remain. - """ - - # Of those possibly_modified_award_ids, find those that remain after deleting transactions. Those are - # the award_ids which have had some, but not all, transactions deleted from them. - # This function will always append to int.award_ids_delete_modified because award_id_lookup ETL - # level could be run more than once before awards ETL level is run. 
- # Avoid SQL error if possibly_modified_award_ids is empty - if possibly_modified_award_ids: - # TODO: see award_id_lookup_pre_delete - self.spark.sql( - f""" - INSERT INTO int.award_ids_delete_modified - SELECT award_id - FROM int.award_id_lookup - WHERE award_id IN ({", ".join(possibly_modified_award_ids)}) - """ - ) - - def update_awards(self): - load_datetime = datetime.now(timezone.utc) - - set_insert_special_columns = ["total_subaward_amount", "create_date", "update_date"] - subquery_ignored_columns = set_insert_special_columns + ["id", "subaward_count"] - - # Use a UNION in award_ids_to_update, not UNION ALL because there could be duplicates among the award ids - # between the query parts or in int.award_ids_delete_modified. - subquery = f""" - WITH - award_ids_to_update AS ( - SELECT DISTINCT(award_id) - FROM int.award_id_lookup - WHERE transaction_unique_id IN (SELECT transaction_unique_id - FROM int.transaction_normalized - WHERE update_date >= '{self.last_etl_date}') - UNION - SELECT award_id FROM int.award_ids_delete_modified - ), - transaction_earliest AS ( - SELECT * FROM ( - SELECT - tn.award_id AS id, - tn.id AS earliest_transaction_id, - tn.action_date AS date_signed, - tn.description, - tn.period_of_performance_start_date, - ROW_NUMBER() OVER ( - PARTITION BY tn.award_id - /* NOTE: In Postgres, the default sorting order sorts NULLs as larger than all other values. - However, in Spark, the default sorting order sorts NULLs as smaller than all other - values. In the Postgres transaction loader the default sorting behavior was used, so to - be consistent with the behavior of the previous loader, we need to reverse the default - Spark NULL sorting behavior for any field that can be NULL. 
*/ - ORDER BY tn.award_id, tn.action_date ASC NULLS LAST, tn.modification_number ASC NULLS LAST, - tn.transaction_unique_id ASC - ) AS rank - FROM int.transaction_normalized AS tn - WHERE tn.award_id IN (SELECT * FROM award_ids_to_update) - ) - WHERE rank = 1 - ), - transaction_latest AS ( - SELECT * FROM ( - SELECT - -- General update columns (id at top, rest alphabetically by alias/name) - tn.award_id AS id, - tn.awarding_agency_id, - CASE - WHEN tn.type IN ('A', 'B', 'C', 'D') THEN 'contract' - WHEN tn.type IN ('02', '03', '04', '05') THEN 'grant' - WHEN tn.type IN ('06', '10') THEN 'direct payment' - WHEN tn.type IN ('07', '08') THEN 'loans' - WHEN tn.type = '09' THEN 'insurance' - WHEN tn.type = '11' THEN 'other' - WHEN tn.type LIKE 'IDV%%' THEN 'idv' - ELSE NULL - END AS category, - tn.action_date AS certified_date, - CASE - WHEN month(tn.action_date) > 9 THEN year(tn.action_date) + 1 - ELSE year(tn.action_date) - END AS fiscal_year, - tn.funding_agency_id, - tn.unique_award_key AS generated_unique_award_id, - tn.is_fpds, - tn.last_modified_date, - tn.id AS latest_transaction_id, - tn.period_of_performance_current_end_date, - tn.transaction_unique_id, - tn.type, - tn.type_description, - -- FPDS Columns - fpds.agency_id AS fpds_agency_id, - fpds.referenced_idv_agency_iden AS fpds_parent_agency_id, - fpds.parent_award_id AS parent_award_piid, - fpds.piid, - -- FABS Columns - fabs.fain, - fabs.uri, - -- Other - 'DBR' AS data_source, - -- Windowing Function - ROW_NUMBER() OVER ( - PARTITION BY tn.award_id - -- See note in transaction_earliest about NULL ordering. 
- ORDER BY tn.award_id, tn.action_date DESC NULLS FIRST, - tn.modification_number DESC NULLS FIRST, tn.transaction_unique_id DESC - ) as rank - FROM int.transaction_normalized AS tn - LEFT JOIN int.transaction_fpds AS fpds ON fpds.transaction_id = tn.id - LEFT JOIN int.transaction_fabs AS fabs ON fabs.transaction_id = tn.id - WHERE tn.award_id IN (SELECT * FROM award_ids_to_update) - ) - WHERE rank = 1 - ), - -- For executive compensation information, we want the latest transaction for each award - -- for which there is at least an officer_1_name. - transaction_ec AS ( - SELECT * FROM ( - SELECT - tn.award_id AS id, - COALESCE(fpds.officer_1_amount, fabs.officer_1_amount) AS officer_1_amount, - COALESCE(fpds.officer_1_name, fabs.officer_1_name) AS officer_1_name, - COALESCE(fpds.officer_2_amount, fabs.officer_2_amount) AS officer_2_amount, - COALESCE(fpds.officer_2_name, fabs.officer_2_name) AS officer_2_name, - COALESCE(fpds.officer_3_amount, fabs.officer_3_amount) AS officer_3_amount, - COALESCE(fpds.officer_3_name, fabs.officer_3_name) AS officer_3_name, - COALESCE(fpds.officer_4_amount, fabs.officer_4_amount) AS officer_4_amount, - COALESCE(fpds.officer_4_name, fabs.officer_4_name) AS officer_4_name, - COALESCE(fpds.officer_5_amount, fabs.officer_5_amount) AS officer_5_amount, - COALESCE(fpds.officer_5_name, fabs.officer_5_name) AS officer_5_name, - ROW_NUMBER() OVER ( - PARTITION BY tn.award_id - -- See note in transaction_earliest about NULL ordering. 
- ORDER BY tn.award_id, tn.action_date DESC NULLS FIRST, - tn.modification_number DESC NULLS FIRST, tn.transaction_unique_id DESC - ) as rank - FROM int.transaction_normalized AS tn - LEFT JOIN int.transaction_fpds AS fpds ON fpds.transaction_id = tn.id - LEFT JOIN int.transaction_fabs AS fabs ON fabs.transaction_id = tn.id - WHERE - tn.award_id IN (SELECT * FROM award_ids_to_update) - AND (fpds.officer_1_name IS NOT NULL OR fabs.officer_1_name IS NOT NULL) - ) - WHERE rank = 1 - ), - transaction_totals AS ( - SELECT - -- Transaction Normalized Fields - tn.award_id AS id, - SUM(tn.federal_action_obligation) AS total_obligation, - SUM(tn.original_loan_subsidy_cost) AS total_subsidy_cost, - SUM(tn.funding_amount) AS total_funding_amount, - SUM(tn.face_value_loan_guarantee) AS total_loan_value, - SUM(tn.non_federal_funding_amount) AS non_federal_funding_amount, - SUM(tn.indirect_federal_sharing) AS total_indirect_federal_sharing, - -- Transaction FPDS Fields - SUM(CAST(fpds.base_and_all_options_value AS NUMERIC(23, 2))) AS base_and_all_options_value, - SUM(CAST(fpds.base_exercised_options_val AS NUMERIC(23, 2))) AS base_exercised_options_val, - COUNT(tn.id) AS transaction_count - FROM int.transaction_normalized AS tn - LEFT JOIN int.transaction_fpds AS fpds ON tn.id = fpds.transaction_id - WHERE tn.award_id IN (SELECT * FROM award_ids_to_update) - GROUP BY tn.award_id - ) - SELECT - latest.id, - 0 AS subaward_count, -- for consistency with Postgres table - {", ".join([col_name for col_name in AWARDS_COLUMNS if col_name not in subquery_ignored_columns])} - FROM transaction_latest AS latest - INNER JOIN transaction_earliest AS earliest ON latest.id = earliest.id - INNER JOIN transaction_totals AS totals on latest.id = totals.id - -- Not every award will have a record in transaction_ec, so need to do a LEFT JOIN on it. - LEFT JOIN transaction_ec AS ec ON latest.id = ec.id - """ - - # On set, create_date will not be changed and update_date will be set below. 
The subaward columns will not - # be changed, and id is used to match. All other column values will come from the subquery. - set_cols = [ - f"int.awards.{col_name} = source_subquery.{col_name}" - for col_name in AWARDS_COLUMNS - if col_name not in set_insert_special_columns - ] - set_cols.append(f"""int.awards.update_date = '{load_datetime.isoformat(" ")}'""") - - # Move insert_special_columns to the end of the list of column names for ease of handling - # during record insert - insert_col_name_list = [col_name for col_name in AWARDS_COLUMNS if col_name not in set_insert_special_columns] - insert_col_name_list.extend(set_insert_special_columns) - insert_col_names = ", ".join([col_name for col_name in insert_col_name_list]) - - # On insert, all values except for those in insert_special_columns will come from the subquery - insert_value_list = insert_col_name_list[:-3] - insert_value_list.extend(["NULL"]) - insert_value_list.extend([f"""'{load_datetime.isoformat(" ")}'"""] * 2) - insert_values = ", ".join([value for value in insert_value_list]) - - sql = f""" - MERGE INTO int.awards - USING ( - {subquery} - ) AS source_subquery - ON awards.id = source_subquery.id - WHEN MATCHED - THEN UPDATE SET - {", ".join(set_cols)} - WHEN NOT MATCHED - THEN INSERT - ({insert_col_names}) - VALUES ({insert_values}) - """ - - self.spark.sql(sql) - - # Now that the award table update is done, we can empty the award_ids_delete_modified table. - # Note that an external (unmanaged) table can't be TRUNCATED; use blanket DELETE instead. - self.spark.sql("DELETE FROM int.award_ids_delete_modified") - - def source_subquery_sql(self, transaction_type=None): - def build_date_format_sql(col: TransactionColumn, is_casted_to_date: bool = True) -> str: - """Builder function to wrap a column in date-parsing logic. - - It will either parse it in mmddYYYY format with - or / as a required separator, or in YYYYmmdd format - with or without either of - or / as a separator. 
- Args: - is_casted_to_date (bool): if true, the parsed result will be cast to DATE to provide a DATE datatype, - otherwise it remains a STRING in YYYY-mm-dd format - """ - # Each of these regexps allows for an optional timestamp portion, separated from the date by some character, - # and the timestamp allows for an optional UTC offset. In any case, the timestamp is ignored, though. - regexp_mmddYYYY = ( - r"(\\d{2})(?[-/])(\\d{2})(\\k)(\\d{4})(.\\d{2}:\\d{2}:\\d{2}([+-]\\d{2}:\\d{2})?)?" - ) - regexp_YYYYmmdd = ( - r"(\\d{4})(?[-/]?)(\\d{2})(\\k)(\\d{2})(.\\d{2}:\\d{2}:\\d{2}([+-]\\d{2}:\\d{2})?)?" - ) - - mmddYYYY_fmt = f""" - (regexp_extract({bronze_table_name}.{col.source}, '{regexp_mmddYYYY}', 5) - || '-' || - regexp_extract({bronze_table_name}.{col.source}, '{regexp_mmddYYYY}', 1) - || '-' || - regexp_extract({bronze_table_name}.{col.source}, '{regexp_mmddYYYY}', 3)) - """ - YYYYmmdd_fmt = f""" - (regexp_extract({bronze_table_name}.{col.source}, '{regexp_YYYYmmdd}', 1) - || '-' || - regexp_extract({bronze_table_name}.{col.source}, '{regexp_YYYYmmdd}', 3) - || '-' || - regexp_extract({bronze_table_name}.{col.source}, '{regexp_YYYYmmdd}', 5)) - """ - - if is_casted_to_date: - mmddYYYY_fmt = f"""CAST({mmddYYYY_fmt} - AS DATE) - """ - YYYYmmdd_fmt = f"""CAST({YYYYmmdd_fmt} - AS DATE) - """ - - sql_snippet = f""" - CASE WHEN regexp({bronze_table_name}.{col.source}, '{regexp_mmddYYYY}') - THEN {mmddYYYY_fmt} - ELSE {YYYYmmdd_fmt} - END - """ - - return sql_snippet - - def handle_column(col: TransactionColumn, bronze_table_name, is_result_aliased=True): - """ - Args: - is_result_aliased (bool) if true, aliases the parsing result with the given ``col``'s ``dest_name`` - """ - if col.handling == "cast": - retval = f"CAST({bronze_table_name}.{col.source} AS {col.delta_type})" - elif col.handling == "literal": - # Use col.source directly as the value - retval = f"{col.source}" - elif col.handling == "parse_string_datetime_to_date": - # These are string fields that 
actually hold DATES/TIMESTAMPS and need to be cast as dates. - # However, they may not be properly parsed when calling CAST(... AS DATE). - retval = build_date_format_sql(col, is_casted_to_date=True) - elif col.handling == "string_datetime_remove_timestamp": - # These are string fields that actually hold DATES/TIMESTAMPS, but need the non-DATE part discarded, - # even though they remain as strings - retval = build_date_format_sql(col, is_casted_to_date=False) - elif col.delta_type.upper() == "STRING": - # Capitalize and remove leading & trailing whitespace from all string values - retval = f"ucase(trim({bronze_table_name}.{col.source}))" - elif col.delta_type.upper() == "BOOLEAN" and not col.handling == "leave_null": - # Unless specified, convert any nulls to false for boolean columns - retval = f"COALESCE({bronze_table_name}.{col.source}, FALSE)" - else: - retval = f"{bronze_table_name}.{col.source}" - - # Handle scalar transformations if the column requires it - if col.scalar_transformation is not None: - retval = col.scalar_transformation.format(input=retval) - - retval = f"{retval}{' AS ' + col.dest_name if is_result_aliased else ''}" - return retval - - def select_columns_transaction_fabs_fpds(bronze_table_name): - if self.etl_level == "transaction_fabs": - col_info = copy.copy(TRANSACTION_FABS_COLUMN_INFO) - elif self.etl_level == "transaction_fpds": - col_info = copy.copy(TRANSACTION_FPDS_COLUMN_INFO) - else: - raise RuntimeError( - f"Function called with invalid 'etl_level': {self.etl_level}. " - "Only for use with 'transaction_fabs' or 'transaction_fpds' etl_level." 
- ) - - select_cols = [] - for col in filter(lambda x: x.dest_name not in ["transaction_id"], col_info): - select_cols.append(handle_column(col, bronze_table_name)) - - return select_cols - - def select_columns_transaction_normalized(bronze_table_name): - action_date_col = next( - filter( - lambda c: c.dest_name == "action_date" and c.source == "action_date", - FABS_TO_NORMALIZED_COLUMN_INFO if transaction_type == "fabs" else DAP_TO_NORMALIZED_COLUMN_INFO, - ) - ) - parse_action_date_sql_snippet = handle_column(action_date_col, bronze_table_name, is_result_aliased=False) - select_cols = [ - "award_id_lookup.award_id", - "awarding_agency.id AS awarding_agency_id", - f"""CASE WHEN month({parse_action_date_sql_snippet}) > 9 - THEN year({parse_action_date_sql_snippet}) + 1 - ELSE year({parse_action_date_sql_snippet}) - END AS fiscal_year""", - "funding_agency.id AS funding_agency_id", - ] - - if transaction_type == "fabs": - select_cols.extend( - [ - # business_categories - f"get_business_categories_fabs({bronze_table_name}.business_types) AS business_categories", - # funding_amount - # In theory, this should be equal to - # CAST(COALESCE({bronze_table_name}.federal_action_obligation, 0) - # + COALESCE({bronze_table_name}.non_federal_funding_amount, 0) - # AS NUMERIC(23, 2)) - # However, for some historical records, this isn't true. 
- f""" - CAST({bronze_table_name}.total_funding_amount AS NUMERIC(23, 2)) AS funding_amount - """, - ] - ) - map_col_info = copy.copy(FABS_TO_NORMALIZED_COLUMN_INFO) - else: - fpds_business_category_columns = copy.copy(fpds_boolean_columns) - # Add a couple of non-boolean columns that are needed in the business category logic - fpds_business_category_columns.extend(["contracting_officers_deter", "domestic_or_foreign_entity"]) - named_struct_text = ", ".join( - [f"'{col}', {bronze_table_name}.{col}" for col in fpds_business_category_columns] - ) - - select_cols.extend( - [ - # business_categories - f"get_business_categories_fpds(named_struct({named_struct_text})) AS business_categories", - # type - f""" - CASE WHEN {bronze_table_name}.pulled_from <> 'IDV' THEN {bronze_table_name}.contract_award_type - WHEN {bronze_table_name}.idv_type = 'B' AND {bronze_table_name}.type_of_idc IS NOT NULL - THEN 'IDV_B_' || {bronze_table_name}.type_of_idc - WHEN {bronze_table_name}.idv_type = 'B' - AND {bronze_table_name}.type_of_idc_description = 'INDEFINITE DELIVERY / REQUIREMENTS' - THEN 'IDV_B_A' - WHEN {bronze_table_name}.idv_type = 'B' - AND {bronze_table_name}.type_of_idc_description = - 'INDEFINITE DELIVERY / INDEFINITE QUANTITY' - THEN 'IDV_B_B' - WHEN {bronze_table_name}.idv_type = 'B' - AND {bronze_table_name}.type_of_idc_description = - 'INDEFINITE DELIVERY / DEFINITE QUANTITY' - THEN 'IDV_B_C' - ELSE 'IDV_' || {bronze_table_name}.idv_type - END AS type - """, - # type_description - f""" - CASE WHEN {bronze_table_name}.pulled_from <> 'IDV' - THEN {bronze_table_name}.contract_award_type_desc - WHEN {bronze_table_name}.idv_type = 'B' - AND {bronze_table_name}.type_of_idc_description IS NOT NULL - AND ucase({bronze_table_name}.type_of_idc_description) <> 'NAN' - THEN {bronze_table_name}.type_of_idc_description - WHEN {bronze_table_name}.idv_type = 'B' - THEN 'INDEFINITE DELIVERY CONTRACT' - ELSE {bronze_table_name}.idv_type_description - END AS type_description - """, - ] - 
) - map_col_info = copy.copy(DAP_TO_NORMALIZED_COLUMN_INFO) - - for col in map_col_info: - select_cols.append(handle_column(col, bronze_table_name)) - - return select_cols - - if self.etl_level == "transaction_fabs": - bronze_table_name = "raw.published_fabs" - unique_id = "afa_generated_unique" - id_col_name = "transaction_id" - select_columns = select_columns_transaction_fabs_fpds(bronze_table_name) - additional_joins = "" - elif self.etl_level == "transaction_fpds": - bronze_table_name = "raw.detached_award_procurement" - unique_id = "detached_award_proc_unique" - id_col_name = "transaction_id" - select_columns = select_columns_transaction_fabs_fpds(bronze_table_name) - additional_joins = "" - elif self.etl_level == "transaction_normalized": - if transaction_type == "fabs": - bronze_table_name = "raw.published_fabs" - unique_id = "afa_generated_unique" - elif transaction_type == "fpds": - bronze_table_name = "raw.detached_award_procurement" - unique_id = "detached_award_proc_unique" - else: - raise ValueError( - f"Invalid value for 'transaction_type': {transaction_type}; must select either: 'fabs' or 'fpds'" - ) - - id_col_name = "id" - select_columns = select_columns_transaction_normalized(bronze_table_name) - additional_joins = f""" - INNER JOIN int.award_id_lookup AS award_id_lookup ON ( - ucase({bronze_table_name}.{unique_id}) = award_id_lookup.transaction_unique_id - ) - LEFT OUTER JOIN global_temp.subtier_agency AS funding_subtier_agency ON ( - funding_subtier_agency.subtier_code = {bronze_table_name}.funding_sub_tier_agency_co - ) - LEFT OUTER JOIN global_temp.agency AS funding_agency ON ( - funding_agency.subtier_agency_id = funding_subtier_agency.subtier_agency_id - ) - LEFT OUTER JOIN global_temp.subtier_agency AS awarding_subtier_agency ON ( - awarding_subtier_agency.subtier_code = {bronze_table_name}.awarding_sub_tier_agency_c - ) - LEFT OUTER JOIN global_temp.agency AS awarding_agency ON ( - awarding_agency.subtier_agency_id = 
awarding_subtier_agency.subtier_agency_id - ) - """ - else: - raise RuntimeError( - f"Function called with invalid 'etl_level': {self.etl_level}. " - "Only for use with 'transaction_fabs', 'transaction_fpds', or 'transaction_normalized' " - "etl_level." - ) - - # Since the select columns may have complicated logic, put them on separate lines for debugging. - # However, strings inside {} expressions in f-strings can't contain backslashes, so will join them first - # before inserting into overall sql statement. - select_columns_str = ",\n ".join(select_columns) - sql = f""" - SELECT - transaction_id_lookup.transaction_id AS {id_col_name}, - {select_columns_str} - FROM {bronze_table_name} - INNER JOIN int.transaction_id_lookup ON ( - ucase({bronze_table_name}.{unique_id}) = transaction_id_lookup.transaction_unique_id - ) - {additional_joins} - WHERE {bronze_table_name}.updated_at >= '{self.last_etl_date}' - """ - - return sql - - def transaction_fabs_fpds_merge_into_sql(self): - if self.etl_level == "transaction_fabs": - col_info = copy.copy(TRANSACTION_FABS_COLUMN_INFO) - elif self.etl_level == "transaction_fpds": - col_info = copy.copy(TRANSACTION_FPDS_COLUMN_INFO) - else: - raise RuntimeError( - f"Function called with invalid 'etl_level': {self.etl_level}. " - "Only for use with 'transaction_fabs' or 'transaction_fpds' etl_level." 
- ) - - set_cols = [f"silver_table.{col.dest_name} = source_subquery.{col.dest_name}" for col in col_info] - silver_table_cols = ", ".join([col.dest_name for col in col_info]) - - sql = f""" - MERGE INTO int.{self.etl_level} AS silver_table - USING ( - {self.source_subquery_sql()} - ) AS source_subquery - ON silver_table.transaction_id = source_subquery.transaction_id - WHEN MATCHED - THEN UPDATE SET - {", ".join(set_cols)} - WHEN NOT MATCHED - THEN INSERT - ({silver_table_cols}) - VALUES ({silver_table_cols}) - """ - - return sql - - def transaction_normalized_merge_into_sql(self, transaction_type): - if transaction_type != "fabs" and transaction_type != "fpds": - raise ValueError( - f"Invalid value for 'transaction_type': {transaction_type}. Must select either: 'fabs' or 'fpds'" - ) - - load_datetime = datetime.now(timezone.utc) - special_columns = ["create_date", "update_date"] - - # On set, create_date will not be changed and update_date will be set below. All other column - # values will come from the subquery. 
- set_cols = [ - f"int.transaction_normalized.{col_name} = source_subquery.{col_name}" - for col_name in TRANSACTION_NORMALIZED_COLUMNS - if col_name not in (special_columns + ["id"]) - ] - set_cols.append(f"""int.transaction_normalized.update_date = '{load_datetime.isoformat(" ")}'""") - - # Move create_date and update_date to the end of the list of column names for ease of handling - # during record insert - insert_col_name_list = [ - col_name for col_name in TRANSACTION_NORMALIZED_COLUMNS if col_name not in special_columns - ] - insert_col_name_list.extend(special_columns) - insert_col_names = ", ".join([col_name for col_name in insert_col_name_list]) - - # On insert, all values except for create_date and update_date will come from the subquery - insert_value_list = insert_col_name_list[:-2] - insert_value_list.extend([f"""'{load_datetime.isoformat(" ")}'"""] * 2) - insert_values = ", ".join([value for value in insert_value_list]) - - sql = f""" - MERGE INTO int.transaction_normalized - USING ( - {self.source_subquery_sql(transaction_type)} - ) AS source_subquery - ON transaction_normalized.id = source_subquery.id - WHEN MATCHED - THEN UPDATE SET - {", ".join(set_cols)} - WHEN NOT MATCHED - THEN INSERT - ({insert_col_names}) - VALUES ({insert_values}) - """ - - return sql - - def update_transaction_lookup_ids(self): - logger.info("Getting the next transaction_id from transaction_id_seq") - with connection.cursor() as cursor: - cursor.execute("SELECT nextval('transaction_id_seq')") - # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id - previous_max_id = cursor.fetchone()[0] - - logger.info("Creating new 'transaction_id_lookup' records for new transactions") - self.spark.sql( - f""" - WITH - dap_filtered AS ( - SELECT detached_award_proc_unique - FROM raw.detached_award_procurement - WHERE updated_at >= '{self.last_etl_date}' - ), - pfabs_filtered AS ( - SELECT afa_generated_unique - FROM raw.published_fabs - 
WHERE updated_at >= '{self.last_etl_date}' - ), - -- Adding CTEs to pre-filter transaction_id_lookup table for significant speedups when joining - tidlu_fpds AS ( - SELECT * FROM int.transaction_id_lookup - WHERE is_fpds = TRUE - ), - tidlu_fabs AS ( - SELECT * FROM int.transaction_id_lookup - WHERE is_fpds = FALSE - ) - INSERT INTO int.transaction_id_lookup - SELECT - {previous_max_id} + ROW_NUMBER() OVER ( - ORDER BY all_new_transactions.transaction_unique_id - ) AS transaction_id, - all_new_transactions.is_fpds, - all_new_transactions.transaction_unique_id - FROM ( - ( - SELECT - TRUE AS is_fpds, - -- The transaction loader code will convert this to upper case, so use that version here. - ucase(dap.detached_award_proc_unique) AS transaction_unique_id - FROM - dap_filtered AS dap LEFT JOIN tidlu_fpds AS tidlu ON ( - ucase(dap.detached_award_proc_unique) = tidlu.transaction_unique_id - ) - WHERE tidlu.transaction_unique_id IS NULL - ) - UNION ALL - ( - SELECT - FALSE AS is_fpds, - -- The transaction loader code will convert this to upper case, so use that version here. - ucase(pfabs.afa_generated_unique) AS transaction_unique_id - FROM - pfabs_filtered AS pfabs LEFT JOIN tidlu_fabs AS tidlu ON ( - ucase(pfabs.afa_generated_unique) = tidlu.transaction_unique_id - ) - WHERE tidlu.transaction_unique_id IS NULL - ) - ) AS all_new_transactions - """ - ) - - logger.info("Updating transaction_id_seq to the new maximum id value seen so far") - poss_max_id = self.spark.sql("SELECT MAX(transaction_id) AS max_id FROM int.transaction_id_lookup").collect()[ - 0 - ]["max_id"] - if poss_max_id is None: - # Since initial_run will always start the id sequence from at least 1, and we take the max of - # poss_max_id and previous_max_id below, this can be set to 0 here. 
- poss_max_id = 0 - with connection.cursor() as cursor: - # Set is_called flag to false so that the next call to nextval() will return the specified value, and - # avoid the possibility of gaps in the transaction_id sequence - # https://www.postgresql.org/docs/13/functions-sequence.html - # If load_transactions_to_delta is called with --etl-level of transaction_id_lookup, and records are - # deleted which happen to correspond to transactions at the end of the transaction_id_lookup table, - # but no records are inserted, then poss_max_id will be less than previous_max_id above. Just assigning - # the current value of transaction_id_seq to poss_max_id would cause problems in a subsequent call - # with inserts, as it would assign the new transactions the same ids as the previously deleted ones. - # To avoid this possibility, set the current value of transaction_id_seq to the maximum of poss_max_id - # and previous_max_id. - cursor.execute(f"SELECT setval('transaction_id_seq', {max(poss_max_id, previous_max_id)}, false)") - - def update_award_lookup_ids(self): - logger.info("Getting the next award_id from award_id_seq") - with connection.cursor() as cursor: - cursor.execute("SELECT nextval('award_id_seq')") - # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id - previous_max_id = cursor.fetchone()[0] - - logger.info("Creating new 'award_id_lookup' records for new awards") - self.spark.sql( - f""" - WITH - dap_filtered AS ( - SELECT detached_award_proc_unique, unique_award_key - FROM raw.detached_award_procurement - WHERE updated_at >= '{self.last_etl_date}' - ), - pfabs_filtered AS ( - SELECT afa_generated_unique, unique_award_key - FROM raw.published_fabs - WHERE updated_at >= '{self.last_etl_date}' - ), - -- Adding CTEs to pre-filter award_id_lookup table for significant speedups when joining - aidlu_fpds AS ( - SELECT * FROM int.award_id_lookup - WHERE is_fpds = TRUE - ), - aidlu_fpds_map AS ( - SELECT award_id, 
generated_unique_award_id FROM aidlu_fpds - GROUP BY award_id, generated_unique_award_id - ), - aidlu_fabs AS ( - SELECT * FROM int.award_id_lookup - WHERE is_fpds = FALSE - ), - aidlu_fabs_map AS ( - SELECT award_id, generated_unique_award_id FROM aidlu_fabs - GROUP BY award_id, generated_unique_award_id - ) - INSERT INTO int.award_id_lookup - SELECT - COALESCE( - all_new_awards.existing_award_id, - {previous_max_id} + DENSE_RANK(all_new_awards.unique_award_key) OVER ( - ORDER BY all_new_awards.unique_award_key - ) - ) AS award_id, - all_new_awards.is_fpds, - all_new_awards.transaction_unique_id, - all_new_awards.unique_award_key AS generated_unique_award_id - FROM ( - ( - SELECT - TRUE AS is_fpds, - -- The transaction loader code will convert these to upper case, so use those versions here. - ucase(dap.detached_award_proc_unique) AS transaction_unique_id, - ucase(dap.unique_award_key) AS unique_award_key, - award_aidlu.award_id AS existing_award_id - FROM - dap_filtered AS dap - LEFT JOIN aidlu_fpds AS trans_aidlu ON ( - ucase(dap.detached_award_proc_unique) = trans_aidlu.transaction_unique_id - ) - LEFT JOIN aidlu_fpds_map AS award_aidlu ON ( - ucase(dap.unique_award_key) = award_aidlu.generated_unique_award_id - ) - WHERE trans_aidlu.transaction_unique_id IS NULL - ) - UNION ALL - ( - SELECT - FALSE AS is_fpds, - -- The transaction loader code will convert these to upper case, so use those versions here. 
- ucase(pfabs.afa_generated_unique) AS transaction_unique_id, - ucase(pfabs.unique_award_key) AS unique_award_key, - award_aidlu.award_id AS existing_award_id - FROM - pfabs_filtered AS pfabs - LEFT JOIN aidlu_fabs AS trans_aidlu ON ( - ucase(pfabs.afa_generated_unique) = trans_aidlu.transaction_unique_id - ) - LEFT JOIN aidlu_fabs_map AS award_aidlu ON ( - ucase(pfabs.unique_award_key) = award_aidlu.generated_unique_award_id - ) - WHERE trans_aidlu.transaction_unique_id IS NULL - ) - ) AS all_new_awards - """ - ) - - logger.info("Updating award_id_seq to the new maximum id value seen so far") - poss_max_id = self.spark.sql("SELECT MAX(award_id) AS max_id FROM int.award_id_lookup").collect()[0]["max_id"] - if poss_max_id is None: - # Since initial_run will always start the id sequence from at least 1, and we take the max of - # poss_max_id and previous_max_id below, this can be set to 0 here. - poss_max_id = 0 - with connection.cursor() as cursor: - # Set is_called flag to false so that the next call to nextval() will return the specified value, and - # avoid the possibility of gaps in the transaction_id sequence - # https://www.postgresql.org/docs/13/functions-sequence.html - # If load_transactions_to_delta is called with --etl-level of award_id_lookup, and records are deleted - # which happen to correspond to transactions at the end of the award_id_lookup table, but no records - # are inserted, then poss_max_id will be less than previous_max_id above. Just assigning the current - # value of award_id_seq to poss_max_id would cause problems in a subsequent call with inserts, as it - # would assign the new awards the same ids as the previously deleted ones. To avoid this possibility, - # set the current value of award_id_seq to the maximum of poss_max_id and previous_max_id. 
- cursor.execute(f"SELECT setval('award_id_seq', {max(poss_max_id, previous_max_id)}, false)") - - def initial_run(self, next_last_load): - """ - Procedure to create & set up transaction_id_lookup and award_id_lookup tables and create other tables in - int database that will be populated by subsequent calls. - """ - - # Creating 2 context managers to be able to handle error if either temp table is not created correctly. - @contextmanager - def prepare_orphaned_transaction_temp_table(): - # Since the table to track the orphaned transactions is only needed for this function, just using a - # managed table in the temp database. - self.spark.sql("CREATE DATABASE IF NOT EXISTS temp") - self.spark.sql( - f""" - CREATE OR REPLACE TABLE temp.orphaned_transaction_info ( - transaction_id LONG NOT NULL, - transaction_unique_id STRING NOT NULL, - is_fpds BOOLEAN NOT NULL, - unique_award_key STRING NOT NULL - ) - USING DELTA - LOCATION 's3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/temp/orphaned_transaction_info' - """ - ) - - # Need a try...finally here to properly handle the case where an inner context manager raises an error - # during its __enter__ phase. - try: - yield - finally: - self.spark.sql("DROP TABLE IF EXISTS temp.orphaned_transaction_info") - - @contextmanager - def prepare_orphaned_award_temp_table(): - # We actually need another temporary table to handle orphaned awards - self.spark.sql( - f""" - CREATE OR REPLACE TABLE temp.orphaned_award_info ( - award_id LONG NOT NULL - ) - USING DELTA - LOCATION 's3a://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/temp/orphaned_award_info' - """ - ) - - # Using another try...finally here just in case another context manager is used. 
- try: - yield - finally: - self.spark.sql("DROP TABLE IF EXISTS temp.orphaned_award_info") - - delta_lake_s3_path = CONFIG.DELTA_LAKE_S3_PATH - destination_database = "int" - - # transaction_id_lookup - destination_table = "transaction_id_lookup" - set_last_load_date = True - - logger.info(f"Creating database {destination_database}, if not already existing.") - self.spark.sql(f"CREATE DATABASE IF NOT EXISTS {destination_database}") - - logger.info(f"Creating {destination_table} table") - self.spark.sql( - f""" - CREATE OR REPLACE TABLE {destination_database}.{destination_table} ( - transaction_id LONG NOT NULL, - -- The is_fpds flag is needed in this table to allow the transaction_id_lookup ETL level to choose - -- the correct rows for deleting. - is_fpds BOOLEAN NOT NULL, - transaction_unique_id STRING NOT NULL - ) - USING DELTA - LOCATION 's3a://{self.spark_s3_bucket}/{delta_lake_s3_path}/{destination_database}/{destination_table}' - """ - ) - - # Although there SHOULDN'T be any "orphaned" transactions (transactions that are missing records - # in one of the source tables) by the time this code is ultimately run in production, putting in - # code to avoid copying orphaned transactions to the int tables, just in case. - # Due to the size of the dataset, need to keep information about the orphaned transactions in a table. - # If we tried to insert the data directly into a SQL statement, it could break the Spark driver. - with prepare_orphaned_transaction_temp_table(), prepare_orphaned_award_temp_table(): - # To avoid re-testing for raw.transaction_normalized, use a variable to keep track. Initially - # assume that the table does exist. 
- raw_transaction_normalized_exists = True - - # Test to see if raw.transaction_normalized exists - try: - self.spark.sql("SELECT 1 FROM raw.transaction_normalized") - except AnalysisException as e: - if re.match( - r"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`transaction_normalized` cannot be found\..*$", - str(e), - re.MULTILINE, - ): - # In this case, we just don't populate transaction_id_lookup - logger.warning( - "Skipping population of transaction_id_lookup table; no raw.transaction_normalized table." - ) - raw_transaction_normalized_exists = False - # Without a raw.transaction_normalized table, can't get a maximum id from it, either. - max_id = None - else: - # Don't try to handle anything else - raise e - else: - self._insert_orphaned_transactions() - - # Extend the orphaned transactions to any transactions found in raw.transaction_normalized that - # don't have a corresponding entry in raw.transaction_fabs|fpds. Beyond the records found above, - # this will find problematic records that are duplicated (have the same transaction_unique_id) in - # raw.transaction_normalized, but only have single records with that transaction_unique_id in - # raw.transaction_fabs|fpds. - - # First, check that raw.transaction_fabs|fpds exist - try: - self.spark.sql("SELECT 1 FROM raw.transaction_fabs") - except AnalysisException as e: - if re.match( - r"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`transaction_fabs` cannot be found\..*$", - str(e), - re.MULTILINE, - ): - # In this case, we just skip extending the orphaned transactions with this table - logger.warning( - "Skipping extension of orphaned_transaction_info table using raw.transaction_fabs table." 
- ) - - fabs_join = "" - fabs_transaction_id_where = "" - fabs_is_fpds_where = "" - else: - # Don't try to handle anything else - raise e - else: - fabs_join = """ - LEFT JOIN raw.transaction_fabs AS fabs ON ( - tn.id = fabs.transaction_id - ) - """ - fabs_transaction_id_where = "fabs.transaction_id IS NULL" - fabs_is_fpds_where = "is_fpds = FALSE" - - try: - self.spark.sql("SELECT 1 FROM raw.transaction_fpds") - except AnalysisException as e: - if re.match( - r"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`transaction_fpds` cannot be found\..*$", - str(e), - re.MULTILINE, - ): - # In this case, we just skip extending the orphaned transactions with this table - logger.warning( - "Skipping extension of orphaned_transaction_info table using raw.transaction_fpds table." - ) - - fpds_join = "" - fpds_transaction_id_where = "" - fpds_is_fpds_where = "" - else: - # Don't try to handle anything else - raise e - else: - fpds_join = """ - LEFT JOIN raw.transaction_fpds AS fpds ON ( - tn.id = fpds.transaction_id - ) - """ - fpds_transaction_id_where = "fpds.transaction_id IS NULL" - fpds_is_fpds_where = "is_fpds = TRUE" - - # As long as one of raw.transaction_fabs|fpds exists, extend temp.orphaned_transaction_info table - if fabs_join or fpds_join: - if fabs_join and fpds_join: - # If both raw.transaction_fabs and raw.transaction_fpds exist, don't need *_is_fpds_where - # in WHERE clause - where_str = "".join(("WHERE ", fabs_transaction_id_where, " AND ", fpds_transaction_id_where)) - elif fabs_join: - # raw.transaction_fabs exists, but not raw.transaction_fpds - where_str = "".join(("WHERE ", fabs_transaction_id_where, " AND ", fabs_is_fpds_where)) - else: - # raw.transaction_fpds exists, but not raw.transaction_fabs - where_str = "".join(("WHERE ", fpds_transaction_id_where, " AND ", fpds_is_fpds_where)) - - logger.info( - "Finding additional orphaned transactions in raw.transaction_normalized (those with missing " - "records in raw.transaction_fabs or 
raw.transaction_fpds)" - ) - self.spark.sql( - f""" - INSERT INTO temp.orphaned_transaction_info - SELECT - tn.id AS transaction_id, tn.transaction_unique_id, tn.is_fpds, tn.unique_award_key - FROM raw.transaction_normalized AS tn - {fabs_join} - {fpds_join} - {where_str} - """ - ) - else: - logger.warning( - "No raw.transaction_fabs or raw.transaction_fpds tables, so not finding additional orphaned " - "transactions in raw.transaction_normalized" - ) - - # Insert existing non-orphaned transactions into the lookup table - logger.info("Populating transaction_id_lookup table") - - # Note that the transaction loader code will convert string fields to upper case, so we have to match - # on the upper-cased versions of the strings. - self.spark.sql( - f""" - INSERT OVERWRITE {destination_database}.{destination_table} - SELECT - tn.id AS transaction_id, - TRUE AS is_fpds, - tn.transaction_unique_id - FROM raw.transaction_normalized AS tn INNER JOIN raw.detached_award_procurement AS dap ON ( - tn.transaction_unique_id = ucase(dap.detached_award_proc_unique) - ) - -- Want to exclude orphaned transactions, as they will not be copied into the int schema. - WHERE tn.id NOT IN (SELECT transaction_id FROM temp.orphaned_transaction_info WHERE is_fpds) - UNION ALL - SELECT - tn.id AS transaction_id, - FALSE AS is_fpds, - tn.transaction_unique_id - FROM raw.transaction_normalized AS tn INNER JOIN raw.published_fabs AS pfabs ON ( - tn.transaction_unique_id = ucase(pfabs.afa_generated_unique) - ) - -- Want to exclude orphaned transactions, as they will not be copied into the int schema. - WHERE tn.id NOT IN (SELECT transaction_id FROM temp.orphaned_transaction_info WHERE NOT is_fpds) - """ - ) - - logger.info("Updating transaction_id_seq to the max transaction_id value") - # Make sure to get the maximum transaction id from the raw table in case there are records in - # raw.transaction_normalized that don't correspond to a record in either of the source tables. 
- # This way, new transaction_ids won't repeat the ids of any of those "orphaned" transaction records. - max_id = self.spark.sql("SELECT MAX(id) AS max_id FROM raw.transaction_normalized").collect()[0][ - "max_id" - ] - - if max_id is None: - # Can't set a Postgres sequence to 0, so set to 1 in this case. If this happens, the transaction IDs - # will start at 2. - max_id = 1 - # Also, don't set the last load date in this case - set_last_load_date = False - with connection.cursor() as cursor: - # Set is_called flag to false so that the next call to nextval() will return the specified value - # https://www.postgresql.org/docs/13/functions-sequence.html - cursor.execute(f"SELECT setval('transaction_id_seq', {max_id}, false)") - - if set_last_load_date: - update_last_load_date(destination_table, next_last_load) - # es_deletes should remain in lockstep with transaction load dates, so if they are reset, - # it should be reset - update_last_load_date("es_deletes", next_last_load) - - # Need a table to keep track of awards in which some, but not all, transactions are deleted. - destination_table = "award_ids_delete_modified" - - logger.info(f"Creating {destination_table} table") - self.spark.sql( - f""" - CREATE OR REPLACE TABLE {destination_database}.{destination_table} ( - award_id LONG NOT NULL - ) - USING DELTA - LOCATION - 's3a://{self.spark_s3_bucket}/{delta_lake_s3_path}/{destination_database}/{destination_table}' - """ - ) - # Nothing to add to this table yet. 
- - # award_id_lookup - destination_table = "award_id_lookup" - set_last_load_date = True - - if raw_transaction_normalized_exists: - # Before creating table or running INSERT, make sure unique_award_key has no NULLs - # (nothing needed to check before transaction_id_lookup table creation) - logger.info("Checking for NULLs in unique_award_key") - num_nulls = self.spark.sql( - "SELECT COUNT(*) AS count FROM raw.transaction_normalized WHERE unique_award_key IS NULL" - ).collect()[0]["count"] - - if num_nulls > 0: - raise ValueError( - f"Found {num_nulls} NULL{'s' if num_nulls > 1 else ''} in 'unique_award_key' in table " - "raw.transaction_normalized!" - ) - - logger.info(f"Creating {destination_table} table") - self.spark.sql( - f""" - CREATE OR REPLACE TABLE {destination_database}.{destination_table} ( - award_id LONG NOT NULL, - -- The is_fpds flag is needed in this table to allow the award_id_lookup ETL level to choose - -- the correct rows for deleting so that it can be run in parallel with the - -- transaction_id_lookup ETL level - is_fpds BOOLEAN NOT NULL, - transaction_unique_id STRING NOT NULL, - generated_unique_award_id STRING NOT NULL - ) - USING DELTA - LOCATION - 's3a://{self.spark_s3_bucket}/{delta_lake_s3_path}/{destination_database}/{destination_table}' - """ - ) - - if not raw_transaction_normalized_exists: - # In this case, we just don't populate award_id_lookup - logger.warning("Skipping population of award_id_lookup table; no raw.transaction_normalized table.") - - # Without a raw.transaction_normalized table, can't get a maximum award_id from it, either. - max_id = None - else: - # Insert existing non-orphaned transactions and their corresponding award_ids into the lookup table - logger.info("Populating award_id_lookup table") - - # Once again we have to match on the upper-cased versions of the strings from published_fabs - # and detached_award_procurement. 
- self.spark.sql( - f""" - INSERT OVERWRITE {destination_database}.{destination_table} - SELECT - existing_awards.award_id, - existing_awards.is_fpds, - existing_awards.transaction_unique_id, - existing_awards.generated_unique_award_id - FROM ( - ( - SELECT - tn.award_id, - TRUE AS is_fpds, - -- The transaction loader code will convert these to upper case, so use those - -- versions here. - ucase(dap.detached_award_proc_unique) AS transaction_unique_id, - ucase(dap.unique_award_key) AS generated_unique_award_id - FROM raw.transaction_normalized AS tn - INNER JOIN raw.detached_award_procurement AS dap ON ( - tn.transaction_unique_id = ucase(dap.detached_award_proc_unique) - ) - /* Again, want to exclude orphaned transactions, as they will not be copied into the - int schema. We have to be careful and only exclude transactions based on their - transaction_id, though! There shouldn't be, but there can be multiple - transactions with the same transaction_unique_id in raw.transaction_normalized! - We only want to exclude those records in transaction_normalized that don't have - matching records in raw.transaction_fabs|fpds. */ - WHERE tn.id NOT IN ( - SELECT transaction_id FROM temp.orphaned_transaction_info WHERE is_fpds - ) - ) - UNION ALL - ( - SELECT - tn.award_id, - FALSE AS is_fpds, - -- The transaction loader code will convert these to upper case, so use those - -- versions here. - ucase(pfabs.afa_generated_unique) AS transaction_unique_id, - ucase(pfabs.unique_award_key) AS generated_unique_award_id - FROM raw.transaction_normalized AS tn - INNER JOIN raw.published_fabs AS pfabs ON ( - tn.transaction_unique_id = ucase(pfabs.afa_generated_unique) - ) - -- See note above about excluding orphaned transactions. 
- WHERE tn.id NOT IN ( - SELECT transaction_id FROM temp.orphaned_transaction_info WHERE NOT is_fpds - ) - ) - ) AS existing_awards - """ - ) - - # Any award that has a transaction inserted into award_id_lookup table that also has an orphaned - # transaction is an award that will have to be updated the first time this command is called with the - # awards ETL level, so add those awards to the award_ids_delete_modified table. - logger.info("Updating award_ids_delete_modified table") - self.spark.sql( - """ - INSERT INTO int.award_ids_delete_modified - SELECT DISTINCT(award_id) - FROM int.award_id_lookup - WHERE transaction_unique_id IN ( - SELECT transaction_unique_id FROM temp.orphaned_transaction_info - ) - """ - ) - - # Awards that have orphaned transactions, but that *aren't* in the award_ids_delete_modified table are - # orphaned awards (those with no remaining transactions), so put those into the orphaned_award_info - # table. - logger.info("Populating orphaned_award_info table") - self.spark.sql( - """ - INSERT INTO temp.orphaned_award_info - SELECT DISTINCT(aidlu.award_id) - FROM temp.orphaned_transaction_info AS oti INNER JOIN int.award_id_lookup AS aidlu ON ( - oti.transaction_unique_id = aidlu.transaction_unique_id - ) - WHERE aidlu.award_id NOT IN (SELECT * FROM int.award_ids_delete_modified) - """ - ) - - logger.info("Updating award_id_seq to the max award_id value") - # As for transaction_id_seq, make sure to get the maximum award id from the raw table in case there are - # records in raw.awards that don't correspond to any records in either of the source tables. - # This way, new award_ids won't repeat the ids of any of those "orphaned" award records. - max_id = self.spark.sql("SELECT MAX(award_id) AS max_id FROM raw.transaction_normalized").collect()[0][ - "max_id" - ] - - if max_id is None: - # Can't set a Postgres sequence to 0, so set to 1 in this case. If this happens, the award IDs - # will start at 2. 
- max_id = 1 - # Also, don't set the last load date in this case - set_last_load_date = False - with connection.cursor() as cursor: - # Set is_called flag to false so that the next call to nextval() will return the specified value - # https://www.postgresql.org/docs/13/functions-sequence.html - cursor.execute(f"SELECT setval('award_id_seq', {max_id}, false)") - - if set_last_load_date: - update_last_load_date(destination_table, next_last_load) - # es_deletes should remain in lockstep with transaction load dates, so if they are reset, - # it should be reset - update_last_load_date("es_deletes", next_last_load) - - # Create other tables in 'int' database - for destination_table, col_names, orphaned_record_key in zip( - ("transaction_fabs", "transaction_fpds", "transaction_normalized", "awards"), - ( - TRANSACTION_FABS_COLUMNS, - TRANSACTION_FPDS_COLUMNS, - list(TRANSACTION_NORMALIZED_COLUMNS), - list(AWARDS_COLUMNS), - ), - ("transaction_id", "transaction_id", "id", "id"), - ): - call_command( - "create_delta_table", - "--destination-table", - destination_table, - "--spark-s3-bucket", - self.spark_s3_bucket, - "--alt-db", - destination_database, - ) - - if not self.no_initial_copy: - # Test to see if the raw table exists - try: - self.spark.sql(f"SELECT 1 FROM raw.{destination_table}") - except AnalysisException as e: - if re.match( - rf"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`{destination_table}` cannot be found\..*$", - str(e), - re.MULTILINE, - ): - # In this case, we just don't copy anything over - logger.warning( - f"Skipping copy of {destination_table} table from 'raw' to 'int' database; " - f"no raw.{destination_table} table." 
- ) - else: - # Don't try to handle anything else - raise e - else: - # Handle exclusion of orphaned records - if destination_table != "awards": - orphan_str = f""" - WHERE {orphaned_record_key} NOT IN ( - SELECT transaction_id FROM temp.orphaned_transaction_info - ) - """ - else: - orphan_str = ( - f"WHERE {orphaned_record_key} NOT IN (SELECT award_id FROM temp.orphaned_award_info)" - ) - - # Handle the possibility that the order of columns is different between the raw and int tables. - self.spark.sql( - f""" - INSERT OVERWRITE {destination_database}.{destination_table} ({", ".join(col_names)}) - SELECT {", ".join(col_names)} FROM raw.{destination_table} - {orphan_str} - """ - ) - - count = self.spark.sql( - f"SELECT COUNT(*) AS count FROM {destination_database}.{destination_table}" - ).collect()[0]["count"] - if count > 0: - update_last_load_date(destination_table, next_last_load) - # es_deletes should remain in lockstep with transaction load dates, so if they are reset, - # it should be reset - update_last_load_date("es_deletes", next_last_load) - - def _insert_orphaned_transactions(self): - # First, find orphaned transactions - logger.info( - "Finding orphaned transactions in raw.transaction_normalized (those with missing records in " - "the source tables)" - ) - self.spark.sql( - """ - INSERT OVERWRITE temp.orphaned_transaction_info - SELECT tn.id AS transaction_id, tn.transaction_unique_id, tn.is_fpds, tn.unique_award_key - FROM raw.transaction_normalized AS tn - LEFT JOIN raw.detached_award_procurement AS dap ON ( - tn.transaction_unique_id = ucase(dap.detached_award_proc_unique) - ) - LEFT JOIN raw.published_fabs AS pfabs ON ( - tn.transaction_unique_id = ucase(pfabs.afa_generated_unique) - ) - WHERE dap.detached_award_proc_unique IS NULL AND pfabs.afa_generated_unique IS NULL - """ - ) diff --git a/usaspending_api/etl/tests/conftest.py b/usaspending_api/etl/tests/conftest.py index 19af2eac9f..b082e86df4 100644 --- a/usaspending_api/etl/tests/conftest.py 
+++ b/usaspending_api/etl/tests/conftest.py @@ -3,12 +3,6 @@ from model_bakery import baker from usaspending_api.etl.tests.data.submissions import submissions -from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( - _BEGINNING_OF_TIME, - _INITIAL_ASSISTS, - _INITIAL_PROCURES, - _INITIAL_SOURCE_TABLE_LOAD_DATETIME, -) # Pulling in specific fixtures elsewhere __all__ = [ @@ -17,53 +11,6 @@ @pytest.fixture -def _populate_initial_source_tables_pg(db): - # Populate transactions.SourceAssistanceTransaction and associated broker.ExternalDataType data in Postgres - for assist in _INITIAL_ASSISTS: - baker.make("transactions.SourceAssistanceTransaction", **assist) - - # `name` and `external_data_type_id` must match those in `usaspending.broker.lookups` - edt = baker.make( - "broker.ExternalDataType", - name="source_assistance_transaction", - external_data_type_id=11, - update_date=_INITIAL_SOURCE_TABLE_LOAD_DATETIME, - ) - baker.make( - "broker.ExternalDataLoadDate", last_load_date=_INITIAL_SOURCE_TABLE_LOAD_DATETIME, external_data_type=edt - ) - - # Populate transactions.SourceProcurementTransaction and associated broker.ExternalDataType data in Postgres - for procure in _INITIAL_PROCURES: - baker.make("transactions.SourceProcurementTransaction", **procure) - - # `name` and `external_data_type_id` must match those in `usaspending.broker.lookups` - edt = baker.make( - "broker.ExternalDataType", - name="source_procurement_transaction", - external_data_type_id=10, - update_date=_INITIAL_SOURCE_TABLE_LOAD_DATETIME, - ) - baker.make( - "broker.ExternalDataLoadDate", last_load_date=_INITIAL_SOURCE_TABLE_LOAD_DATETIME, external_data_type=edt - ) - - # Also need to populate values for es_deletes, int.transaction_[fabs|fpds|normalized], int.awards, - # and id lookup tables in broker.ExternalData[Type|LoadDate] tables - # `name` and `external_data_type_id` must match those in `usaspending.broker.lookups` - edt = 
baker.make("broker.ExternalDataType", name="es_deletes", external_data_type_id=102, update_date=None) - baker.make("broker.ExternalDataLoadDate", last_load_date=_BEGINNING_OF_TIME, external_data_type=edt) - - for table_name, id in zip( - ( - "transaction_fpds", - "transaction_fabs", - "transaction_normalized", - "awards", - "transaction_id_lookup", - "award_id_lookup", - ), - range(201, 207), - ): - edt = baker.make("broker.ExternalDataType", name=table_name, external_data_type_id=id, update_date=None) - baker.make("broker.ExternalDataLoadDate", last_load_date=_BEGINNING_OF_TIME, external_data_type=edt) +def external_data_type(db): + for _id in [201, 202, 203, 204]: + baker.make("broker.ExternalDataType", external_data_type_id=_id) diff --git a/usaspending_api/etl/tests/data/delta_model_for_test.py b/usaspending_api/etl/tests/data/delta_model_for_test.py index e4d63eb538..a65411ee37 100644 --- a/usaspending_api/etl/tests/data/delta_model_for_test.py +++ b/usaspending_api/etl/tests/data/delta_model_for_test.py @@ -42,5 +42,6 @@ class Meta: "custom_schema": "", "column_names": ["id", "test_timestamp"], "tsvectors": None, + "add_hash_field": False, } } diff --git a/usaspending_api/etl/tests/integration/test_load_transactions.py b/usaspending_api/etl/tests/integration/test_load_transactions.py new file mode 100644 index 0000000000..7b0dee6958 --- /dev/null +++ b/usaspending_api/etl/tests/integration/test_load_transactions.py @@ -0,0 +1,254 @@ +from datetime import datetime, timedelta, timezone + +import pytest +from django.core.management import call_command +from usaspending_api.common.helpers.spark_helpers import load_dict_to_delta_table + +_BEGINNING_OF_TIME = datetime(1970, 1, 1, tzinfo=timezone.utc) +_INITIAL_DATETIME = datetime(2022, 10, 31, tzinfo=timezone.utc) +_INITIAL_SOURCE_TABLE_LOAD_DATETIME = _INITIAL_DATETIME + timedelta(hours=12) +_INITIAL_ASSISTS = [ + { + "published_fabs_id": 1, + "afa_generated_unique": "award_assist_0001_trans_0001", + 
"action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0001", + "hash": 1020304, + }, + { + "published_fabs_id": 2, + "afa_generated_unique": "award_assist_0002_trans_0001", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0002", + "hash": 5060708, + }, + { + "published_fabs_id": 3, + "afa_generated_unique": "award_assist_0002_trans_0002", + # Deliberately formatting this action_date somewhat unusually. + "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0002", + "hash": 9101112, + }, + { + "published_fabs_id": 4, + "afa_generated_unique": "award_assist_0003_trans_0001", + # Deliberately formatting this action_date somewhat unusually. 
+ "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0003", + "hash": 13141516, + }, + { + "published_fabs_id": 5, + "afa_generated_unique": "award_assist_0004_trans_0001", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0004", + "hash": 17181920, + }, +] +_INITIAL_PROCURES = [ + { + "detached_award_procurement_id": 1, + "detached_award_proc_unique": "award_procure_0001_trans_0001", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "unique_award_key": "award_procure_0001", + "hash": 1020304, + }, + { + "detached_award_procurement_id": 2, + "detached_award_proc_unique": "award_procure_0002_trans_0001", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "unique_award_key": "award_procure_0002", + "hash": 5060708, + }, + { + "detached_award_procurement_id": 3, + "detached_award_proc_unique": "award_procure_0002_trans_0002", + # Deliberately formatting this action_date somewhat unusually. + "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "unique_award_key": "award_procure_0002", + "hash": 9101112, + }, + { + "detached_award_procurement_id": 4, + "detached_award_proc_unique": "award_procure_0003_trans_0001", + # Deliberately formatting this action_date somewhat unusually. 
+ "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "unique_award_key": "award_procure_0003", + "hash": 13141516, + }, + { + "detached_award_procurement_id": 5, + "detached_award_proc_unique": "award_procure_0003_trans_0002", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "unique_award_key": "award_procure_0003", + "hash": 17181920, + }, +] + + +@pytest.mark.django_db +def test_load_transactions(spark, s3_unittest_data_bucket, hive_unittest_metastore_db, external_data_type): + + # Load initial Data + load_dict_to_delta_table( + spark, s3_unittest_data_bucket, "raw", "detached_award_procurement", _INITIAL_PROCURES, True + ) + load_dict_to_delta_table(spark, s3_unittest_data_bucket, "raw", "published_fabs", _INITIAL_ASSISTS, True) + + # Create Delta Tables + call_command("create_delta_table", "--destination-table=transaction_fpds", "--alt-db=int") + call_command("create_delta_table", "--destination-table=transaction_fabs", "--alt-db=int") + call_command("create_delta_table", "--destination-table=transaction_normalized", "--alt-db=int") + call_command("create_delta_table", "--destination-table=awards", "--alt-db=int") + + # Load Transactions + call_command("load_transaction_fpds_in_delta") + call_command("load_transaction_fabs_in_delta") + call_command("load_transaction_normalized") + call_command("load_awards_in_delta") + + # Check Transaction FPDS + fpds_df = spark.sql("select * from int.transaction_fpds") + fpds_transactions = set( + row["detached_award_proc_unique"] for row in fpds_df.select("detached_award_proc_unique").collect() + ) + initial_fpds_transactions = set( + transaction["detached_award_proc_unique"].upper() for transaction in _INITIAL_PROCURES + ) + assert initial_fpds_transactions == fpds_transactions + + # Check Transaction FABS + fabs_df = spark.sql("select * from int.transaction_fabs") + 
fabs_transactions = set(row["afa_generated_unique"] for row in fabs_df.select("afa_generated_unique").collect()) + initial_fabs_transactions = set(transaction["afa_generated_unique"].upper() for transaction in _INITIAL_ASSISTS) + assert initial_fabs_transactions == fabs_transactions + + # Check Transaction Normalized + norm_df = spark.sql("select * from int.transaction_normalized") + norm_transactions = set(row["transaction_unique_id"] for row in norm_df.select("transaction_unique_id").collect()) + assert norm_transactions == initial_fpds_transactions.union(initial_fabs_transactions) + + # Check Awards + award_df = spark.sql("select * from int.awards") + awards = set(row["generated_unique_award_id"] for row in award_df.select("generated_unique_award_id").collect()) + initial_awards = set(transaction["unique_award_key"].upper() for transaction in _INITIAL_PROCURES).union( + set(transaction["unique_award_key"].upper() for transaction in _INITIAL_ASSISTS) + ) + assert initial_awards == awards + + # Delete some contracts and assistance + spark.sql("DELETE FROM raw.detached_award_procurement WHERE unique_award_key = 'award_procure_0001'") + spark.sql("DELETE FROM raw.published_fabs WHERE unique_award_key = 'award_assist_0001'") + + # Update a transaction + spark.sql( + "UPDATE raw.published_fabs set award_description = 'test award', hash = 25262728 where published_fabs_id = 2" + ) + + # Load some new contracts and assistance + new_assist = [ + { + "published_fabs_id": 6, + "afa_generated_unique": "award_assist_0005_trans_0001", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, + "is_active": True, + "unique_award_key": "award_assist_0005", + "hash": 21222324, + } + ] + new_proc = [ + { + "detached_award_procurement_id": 6, + "detached_award_proc_unique": "award_procure_0004_trans_0001", + "action_date": _INITIAL_DATETIME.isoformat(), + "created_at": _INITIAL_DATETIME, + "updated_at": _INITIAL_DATETIME, 
+ "unique_award_key": "award_procure_0004", + "hash": 21222324, + } + ] + load_dict_to_delta_table(spark, s3_unittest_data_bucket, "raw", "detached_award_procurement", new_proc, False) + load_dict_to_delta_table(spark, s3_unittest_data_bucket, "raw", "published_fabs", new_assist, False) + + # Reload Transactions + call_command("load_transaction_fpds_in_delta") + call_command("load_transaction_fabs_in_delta") + call_command("load_transaction_normalized") + call_command("load_awards_in_delta") + + # Check Transaction FPDS + fpds_df = spark.sql("select * from int.transaction_fpds") + fpds_transactions = set( + row["detached_award_proc_unique"] for row in fpds_df.select("detached_award_proc_unique").collect() + ) + expected_fpds_transactions = set( + transaction["detached_award_proc_unique"].upper() + for transaction in _INITIAL_PROCURES + new_proc + if transaction["unique_award_key"] != "award_procure_0001" + ) + assert expected_fpds_transactions == fpds_transactions + + # Check Transaction FABS + fabs_df = spark.sql("select * from int.transaction_fabs") + fabs_transactions = set(row["afa_generated_unique"] for row in fabs_df.select("afa_generated_unique").collect()) + expected_fabs_transactions = set( + transaction["afa_generated_unique"].upper() + for transaction in _INITIAL_ASSISTS + new_assist + if transaction["unique_award_key"] != "award_assist_0001" + ) + assert expected_fabs_transactions == fabs_transactions + + # Check Transaction Normalized + norm_df = spark.sql("select * from int.transaction_normalized") + norm_transactions = set(row["transaction_unique_id"] for row in norm_df.select("transaction_unique_id").collect()) + assert norm_transactions == expected_fpds_transactions.union(expected_fabs_transactions) + + # Check Awards + award_df = spark.sql("select * from int.awards") + awards = set(row["generated_unique_award_id"] for row in award_df.select("generated_unique_award_id").collect()) + expected_awards = set( + transaction["unique_award_key"].upper() + 
for transaction in _INITIAL_PROCURES + new_proc + if transaction["unique_award_key"] != "award_procure_0001" + ).union( + set( + transaction["unique_award_key"].upper() + for transaction in _INITIAL_ASSISTS + new_assist + if transaction["unique_award_key"] != "award_assist_0001" + ) + ) + assert expected_awards == awards + assert ( + spark.sql("select description from int.awards where generated_unique_award_id = 'AWARD_ASSIST_0002'") + .collect()[0] + .description + == "TEST AWARD" + ) diff --git a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py deleted file mode 100644 index bf74f2e3fc..0000000000 --- a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_fabs_fpds.py +++ /dev/null @@ -1,600 +0,0 @@ -"""Automated Unit Tests for the loading of transaction and award tables in Delta Lake. - -NOTE: Uses Pytest Fixtures from immediate parent conftest.py: usaspending_api/etl/tests/conftest.py -""" - -from copy import deepcopy -from datetime import datetime, timedelta, timezone -from django.core.management import call_command -from model_bakery import baker -from pytest import mark - -from usaspending_api.broker.helpers.last_load_date import get_last_load_date, update_last_load_date -from usaspending_api.etl.tests.integration.test_load_to_from_delta import load_delta_table_from_postgres, equal_datasets -from usaspending_api.etl.tests.integration.test_load_transactions_in_delta_lookups import ( - _BEGINNING_OF_TIME, - _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - _InitialRunWithPostgresLoader, - _TableLoadInfo, - TestInitialRun as InitialRun, # Remove 'test' prefix to avoid pytest running these tests twice - TestInitialRunNoPostgresLoader as InitialRunNoPostgresLoader, # Remove 'test' prefix to avoid pytest running these tests twice -) -from usaspending_api.config import CONFIG -from usaspending_api.etl.management.commands.load_table_to_delta 
import TABLE_SPEC - - -class _TransactionFabsFpdsCore: - - new_transaction_fabs_fpds_id = 6 - new_transaction_id = 11 - - def __init__( - self, - spark, - s3_data_bucket, - etl_level, - pk_field, - compare_fields, - usas_source_table_name, - broker_source_table_name, - baker_table, - baker_kwargs, - expected_initial_transaction_fabs, - expected_initial_transaction_fpds, - ): - self.spark = spark - self.s3_data_bucket = s3_data_bucket - self.etl_level = etl_level - self.pk_field = pk_field - self.usas_source_table_name = usas_source_table_name - self.broker_source_table_name = broker_source_table_name - self.baker_table = baker_table - self.compare_fields = compare_fields - self.baker_kwargs = baker_kwargs - self.expected_initial_transaction_fabs = expected_initial_transaction_fabs - self.expected_initial_transaction_fpds = expected_initial_transaction_fpds - - def unexpected_paths_source_tables_only_test_core(self): - # Setup some source tables without data, this test does not require these tables to be populated - raw_db = "raw" - self.spark.sql(f"create database if not exists {raw_db};") - self.spark.sql(f"use {raw_db};") - self.spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( - DESTINATION_TABLE="published_fabs", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=self.s3_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - self.spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( - DESTINATION_TABLE="detached_award_procurement", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=self.s3_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - - # 1. Call load_transactions_in_delta with etl-level of initial_run first, but without first loading - # raw.transaction_normalized or raw.awards. Then immediately call load_transactions_in_delta with - # etl-level of transaction_f[ab|pd]s. 
- InitialRun.initial_run(self.s3_data_bucket) - call_command("load_transactions_in_delta", "--etl-level", self.etl_level) - - # Verify the transaction and award id lookup tables and other int transaction tables. They should all be empty. - kwargs = { - "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, - "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, - "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, - } - # Even though nothing will have been loaded to that table, the table whose etl_level has been called will - # have its last load date set to the date of the source tables' load. - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - InitialRun.verify(self.spark, [], [], **kwargs) - - # 2. With raw.transaction_normalized and raw.awards still not created, call load_transactions_in_delta - # with etl-level of transaction_id_lookup, and then again with etl-level of transaction_f[ab|pd]s. - - # Since the call to load_transactions_in_delta with etl-level of transaction_f[ab|pd]s above succeeded, we first - # need to reset the last load date on transaction_fabs - update_last_load_date(self.etl_level, _BEGINNING_OF_TIME) - - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") - call_command("load_transactions_in_delta", "--etl-level", self.etl_level) - - # The expected transaction_id_lookup table should be the same as in _InitialRunWithPostgresLoader, - # but all of the transaction ids should be 1 larger than expected there. 
- expected_transaction_id_lookup = deepcopy(_InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup) - for item in expected_transaction_id_lookup: - item["transaction_id"] += 1 - # Also, the last load date of the transaction_id_lookup table and of the table whose etl_level is being - # called should be updated to the load time of the source tables - kwargs["expected_last_load_transaction_id_lookup"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - InitialRun.verify( - self.spark, - expected_transaction_id_lookup, - [], - 0, - len(self.expected_initial_transaction_fabs), - len(self.expected_initial_transaction_fpds), - **kwargs, - ) - - # Verify key fields in transaction_f[ab|pd]s table. Note that the transaction_ids should be 1 more than - # in those from _InitialRunWithPostgresLoader - query = f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" - delta_data = [row.asDict() for row in self.spark.sql(query).collect()] - - if len(self.expected_initial_transaction_fabs) > 0: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fabs) - else: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fpds) - for item in expected_transaction_fabs_fpds: - item["transaction_id"] += 1 - assert equal_datasets(expected_transaction_fabs_fpds, delta_data, "") - - def unexpected_paths_test_core( - self, load_other_raw_tables, expected_initial_transaction_id_lookup, expected_initial_award_id_lookup - ): - # 1. Call load_transactions_in_delta with etl-level of initial_run first, making sure to load - # raw.transaction_normalized along with the source tables, but don't copy the raw tables to int. - # Then immediately call load_transactions_in_delta with etl-level of transaction_f[ab|pd]s. 
- InitialRun.initial_run(self.s3_data_bucket, load_other_raw_tables=load_other_raw_tables, initial_copy=False) - call_command("load_transactions_in_delta", "--etl-level", self.etl_level) - - # Even without the call to load_transactions_in_delta with etl-level of transaction_id_lookup, the appropriate - # data will be populated in the transaction_id_lookup table via initial_run to allow the call to - # load_transactions_in_delta with etl-level of transaction_fabs to populate int.transaction_fabs correctly with - # the initial data. - kwargs = { - "expected_last_load_transaction_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_award_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, - } - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - InitialRun.verify( - self.spark, - expected_initial_transaction_id_lookup, - expected_initial_award_id_lookup, - 0, - len(self.expected_initial_transaction_fabs), - len(self.expected_initial_transaction_fpds), - **kwargs, - ) - - # Verify key fields in transaction_fabs table. - query = f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" - delta_data = [row.asDict() for row in self.spark.sql(query).collect()] - if len(self.expected_initial_transaction_fabs) > 0: - assert equal_datasets(self.expected_initial_transaction_fabs, delta_data, "") - else: - assert equal_datasets(self.expected_initial_transaction_fpds, delta_data, "") - - # 2. Test inserting, updating, and deleting without calling load_transactions_in_delta with etl-level - # of transaction_id_lookup before calling load_transactions_in_delta with etl-level of transaction_f[ab|pd]s. 
- - # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to - # Postgres table, and then push the updated table to Delta. - last_load_datetime = datetime.now(timezone.utc) - insert_update_datetime = last_load_datetime + timedelta(minutes=-15) - self.baker_kwargs.update( - { - "action_date": insert_update_datetime.isoformat(), - "created_at": insert_update_datetime, - "updated_at": insert_update_datetime, - } - ) - baker.make(self.baker_table, **self.baker_kwargs) - update_last_load_date(self.broker_source_table_name, last_load_datetime) - load_delta_table_from_postgres(self.usas_source_table_name, self.s3_data_bucket) - - self.spark.sql( - f""" - UPDATE raw.{self.usas_source_table_name} - SET updated_at = '{insert_update_datetime}' - WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 - """ - ) - - self.spark.sql( - f""" - DELETE FROM raw.{self.usas_source_table_name} - WHERE {self.pk_field} = 2 OR {self.pk_field} = 3 - """ - ) - - call_command("load_transactions_in_delta", "--etl-level", self.etl_level) - - # Verify the transaction and award id lookup tables. Without a call to load_transactions_in_delta with an - # --etl-level of transaction_id_lookup or award_id_lookup, they should be the same as during the initial run. 
- InitialRun.verify( - self.spark, - expected_initial_transaction_id_lookup, - expected_initial_award_id_lookup, - 0, - len(self.expected_initial_transaction_fabs), - len(self.expected_initial_transaction_fpds), - **kwargs, - ) - - # Verify key fields in transaction_f[ab|pd]s table - query = f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" - delta_data = [row.asDict() for row in self.spark.sql(query).collect()] - - # With no call to load_transactions_in_delta with etl-level of transaction_id_lookup, the above call to - # load_transactions_in_delta with etl-level of transaction_f[ab|pd]s *should* pick up the *updates* in the - # published f[ab|pd]s table because those transactions already exist in the transaction_id_lookup table. - # However, this call should *NOT* pick up the inserts or deletes, since those transactions will not - # have changed in the transaction_id_lookup table. - if len(self.expected_initial_transaction_fabs) > 0: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fabs) - else: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fpds) - expected_transaction_fabs_fpds[-2]["updated_at"] = insert_update_datetime - expected_transaction_fabs_fpds[-1]["updated_at"] = insert_update_datetime - assert equal_datasets(expected_transaction_fabs_fpds, delta_data, "") - - def unexpected_paths_no_pg_loader_test_core(self): - self.unexpected_paths_test_core( - [ - _TableLoadInfo( - self.spark, - "transaction_normalized", - InitialRunNoPostgresLoader.initial_transaction_normalized, - ) - ], - InitialRunNoPostgresLoader.expected_initial_transaction_id_lookup, - InitialRunNoPostgresLoader.expected_initial_award_id_lookup, - ) - - def happy_paths_test_core( - self, - load_other_raw_tables, - expected_initial_transaction_id_lookup, - expected_initial_award_id_lookup, - expected_transaction_id_lookup_pops, - expected_transaction_id_lookup_append, - 
expected_transaction_fabs_fpds_append, - ): - # 1, Test calling load_transactions_in_delta with etl-level of transaction_f[ab|pd]s after calling with - # etl-levels of initial_run and transaction_id_lookup. - InitialRun.initial_run(self.s3_data_bucket, load_other_raw_tables=load_other_raw_tables) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") - call_command("load_transactions_in_delta", "--etl-level", self.etl_level) - - # Verify the tables. The transaction and award id lookup tables should be the same as during the initial run. - # The transaction_normalized and transaction_f[ab|pd]s tables should have been copied from raw to int. - kwargs = { - "expected_last_load_transaction_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_award_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_transaction_normalized": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, - } - kwargs[f"expected_last_load_{self.etl_level}"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - InitialRun.verify( - self.spark, - expected_initial_transaction_id_lookup, - expected_initial_award_id_lookup, - len(expected_initial_transaction_id_lookup), - len(self.expected_initial_transaction_fabs), - len(self.expected_initial_transaction_fpds), - **kwargs, - ) - - # Verify key fields in transaction_fabs table - transaction_fabs_fpds_query = ( - f"SELECT {', '.join(self.compare_fields)} FROM int.{self.etl_level} ORDER BY {self.pk_field}" - ) - delta_data = [row.asDict() for row in self.spark.sql(transaction_fabs_fpds_query).collect()] - if len(self.expected_initial_transaction_fabs) > 0: - assert equal_datasets(self.expected_initial_transaction_fabs, delta_data, "") - else: - assert equal_datasets(self.expected_initial_transaction_fpds, delta_data, "") - - # 2. 
Test inserting, updating, and deleting records followed by calling load_transactions_in_delta with - # etl-levels of transaction_id_lookup and then transaction_f[ab|pd]s. - - # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to - # Postgres table, and then push the updated table to Delta. - last_load_datetime = datetime.now(timezone.utc) - insert_update_datetime = last_load_datetime + timedelta(minutes=-15) - self.baker_kwargs.update( - { - "action_date": insert_update_datetime.isoformat(), - "created_at": insert_update_datetime, - "updated_at": insert_update_datetime, - } - ) - baker.make(self.baker_table, **self.baker_kwargs) - update_last_load_date(self.broker_source_table_name, last_load_datetime) - load_delta_table_from_postgres(self.usas_source_table_name, self.s3_data_bucket) - - self.spark.sql( - f""" - UPDATE raw.{self.usas_source_table_name} - SET updated_at = '{insert_update_datetime}' - WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 - """ - ) - - self.spark.sql( - f""" - DELETE FROM raw.{self.usas_source_table_name} - WHERE {self.pk_field} = 2 OR {self.pk_field} = 3 - """ - ) - - self.spark.sql( - f""" - UPDATE raw.{self.usas_source_table_name} - SET place_of_perform_country_c = 'UNITED STATES' - WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 - """ - ) - - self.spark.sql( - f""" - UPDATE raw.{self.usas_source_table_name} - SET legal_entity_country_code = 'UNITED STATES' - WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 - """ - ) - - self.spark.sql( - f""" - UPDATE raw.{self.usas_source_table_name} - SET place_of_perform_country_n = 'USA' - WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 - """ - ) - - self.spark.sql( - f""" - UPDATE raw.{self.usas_source_table_name} - SET legal_entity_country_name = 'USA' - WHERE {self.pk_field} = 4 OR {self.pk_field} = 5 - """ - ) - - # Need to load changes into the transaction_id_lookup table. 
- call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") - call_command("load_transactions_in_delta", "--etl-level", self.etl_level) - - # Verify transaction_id_lookup table - query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" - delta_data = [row.asDict() for row in self.spark.sql(query).collect()] - - expected_transaction_id_lookup = deepcopy(expected_initial_transaction_id_lookup) - for pop_index in expected_transaction_id_lookup_pops: - expected_transaction_id_lookup.pop(pop_index) - expected_transaction_id_lookup_append.update( - { - "transaction_id": self.new_transaction_id, - } - ) - expected_transaction_id_lookup.append(expected_transaction_id_lookup_append) - assert equal_datasets(expected_transaction_id_lookup, delta_data, "") - - # Verify country code scalar transformation - query = f"SELECT DISTINCT legal_entity_country_code, place_of_perform_country_c FROM int.{self.etl_level} WHERE {self.pk_field} = 4 OR {self.pk_field} = 5" - delta_data = [row.asDict() for row in self.spark.sql(query).collect()] - assert len(delta_data) == 1 - assert delta_data[0]["legal_entity_country_code"] == "USA" - assert delta_data[0]["place_of_perform_country_c"] == "USA" - - # Verify country name scalar transformation - query = f"SELECT DISTINCT legal_entity_country_name, place_of_perform_country_n FROM int.{self.etl_level} WHERE {self.pk_field} = 4 OR {self.pk_field} = 5" - delta_data = [row.asDict() for row in self.spark.sql(query).collect()] - assert len(delta_data) == 1 - assert delta_data[0]["legal_entity_country_name"] == "UNITED STATES" - assert delta_data[0]["place_of_perform_country_n"] == "UNITED STATES" - - # Verify key fields in transaction_f[ab|pd]s table - delta_data = [row.asDict() for row in self.spark.sql(transaction_fabs_fpds_query).collect()] - - if len(self.expected_initial_transaction_fabs) > 0: - expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fabs) - else: - 
expected_transaction_fabs_fpds = deepcopy(self.expected_initial_transaction_fpds) - expected_transaction_fabs_fpds.pop(1) - expected_transaction_fabs_fpds.pop(1) - expected_transaction_fabs_fpds[-2]["updated_at"] = insert_update_datetime - expected_transaction_fabs_fpds[-1]["updated_at"] = insert_update_datetime - expected_transaction_fabs_fpds_append.update( - { - "transaction_id": self.new_transaction_id, - "action_date": insert_update_datetime.date().isoformat(), - "created_at": insert_update_datetime, - "updated_at": insert_update_datetime, - } - ) - expected_transaction_fabs_fpds.append(expected_transaction_fabs_fpds_append) - assert equal_datasets(expected_transaction_fabs_fpds, delta_data, "") - - # Verify that the last_load_dates of the transaction_id_lookup table and the table whose etl_level has been - # called did NOT change, since only one of the broker source tables' last load date was changed. - assert get_last_load_date("transaction_id_lookup") == _INITIAL_SOURCE_TABLE_LOAD_DATETIME - assert get_last_load_date(self.etl_level) == _INITIAL_SOURCE_TABLE_LOAD_DATETIME - - def happy_paths_no_pg_loader_test_core( - self, - initial_transaction_fabs_fpds, - expected_transaction_id_lookup_pops, - expected_transaction_id_lookup_append, - expected_transaction_fabs_fpds_append, - ): - self.happy_paths_test_core( - ( - _TableLoadInfo( - self.spark, - "transaction_normalized", - InitialRunNoPostgresLoader.initial_transaction_normalized, - ), - _TableLoadInfo( - self.spark, - self.etl_level, - initial_transaction_fabs_fpds, - ), - _TableLoadInfo(self.spark, "awards", InitialRunNoPostgresLoader.initial_awards), - ), - InitialRunNoPostgresLoader.expected_initial_transaction_id_lookup, - InitialRunNoPostgresLoader.expected_initial_award_id_lookup, - expected_transaction_id_lookup_pops, - expected_transaction_id_lookup_append, - expected_transaction_fabs_fpds_append, - ) - - -class TestTransactionFabs: - - etl_level = "transaction_fabs" - pk_field = "published_fabs_id" 
- usas_source_table_name = "published_fabs" - broker_source_table_name = "source_assistance_transaction" - baker_table = "transactions.SourceAssistanceTransaction" - compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fabs[0].keys() - new_afa_generated_unique = "award_assist_0004_trans_0001" - new_unique_award_key = "award_assist_0004" - baker_kwargs = { - "published_fabs_id": _TransactionFabsFpdsCore.new_transaction_fabs_fpds_id, - "afa_generated_unique": new_afa_generated_unique, - "is_active": True, - "unique_award_key": new_unique_award_key, - } - expected_transaction_id_lookup_append = { - "is_fpds": False, - "transaction_unique_id": new_afa_generated_unique.upper(), - } - expected_transaction_fabs_fpds_append = { - "afa_generated_unique": new_afa_generated_unique.upper(), - "is_active": True, - "published_fabs_id": _TransactionFabsFpdsCore.new_transaction_fabs_fpds_id, - "unique_award_key": new_unique_award_key.upper(), - } - - def _generate_transaction_fabs_fpds_core(self, spark, s3_data_bucket, expected_initial_transaction_fabs): - return _TransactionFabsFpdsCore( - spark, - s3_data_bucket, - self.etl_level, - self.pk_field, - self.compare_fields, - self.usas_source_table_name, - self.broker_source_table_name, - self.baker_table, - deepcopy(self.baker_kwargs), - expected_initial_transaction_fabs, - [], - ) - - @mark.django_db(transaction=True) - def test_unexpected_paths_source_tables_only( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, _InitialRunWithPostgresLoader.expected_initial_transaction_fabs - ) - transaction_fabs_fpds_core.unexpected_paths_source_tables_only_test_core() - - @mark.django_db(transaction=True) - def test_unexpected_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - 
transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fabs - ) - transaction_fabs_fpds_core.unexpected_paths_no_pg_loader_test_core() - - @mark.django_db(transaction=True) - def test_happy_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fabs - ) - transaction_fabs_fpds_core.happy_paths_no_pg_loader_test_core( - InitialRunNoPostgresLoader.initial_transaction_fabs, - (2, 3), - self.expected_transaction_id_lookup_append, - self.expected_transaction_fabs_fpds_append, - ) - - -class TestTransactionFpds: - - etl_level = "transaction_fpds" - pk_field = "detached_award_procurement_id" - usas_source_table_name = "detached_award_procurement" - broker_source_table_name = "source_procurement_transaction" - baker_table = "transactions.SourceProcurementTransaction" - compare_fields = _InitialRunWithPostgresLoader.expected_initial_transaction_fpds[0].keys() - new_detached_award_proc_unique = "award_procure_0004_trans_0001" - new_unique_award_key = "award_procure_0004" - baker_kwargs = { - "detached_award_procurement_id": _TransactionFabsFpdsCore.new_transaction_fabs_fpds_id, - "detached_award_proc_unique": new_detached_award_proc_unique, - "unique_award_key": new_unique_award_key, - } - expected_transaction_id_lookup_append = { - "is_fpds": True, - "transaction_unique_id": new_detached_award_proc_unique.upper(), - } - expected_transaction_fabs_fpds_append = { - "detached_award_proc_unique": new_detached_award_proc_unique.upper(), - "detached_award_procurement_id": _TransactionFabsFpdsCore.new_transaction_fabs_fpds_id, - "unique_award_key": new_unique_award_key.upper(), - } - - def _generate_transaction_fabs_fpds_core(self, spark, s3_data_bucket, 
expected_initial_transaction_fpds): - return _TransactionFabsFpdsCore( - spark, - s3_data_bucket, - self.etl_level, - self.pk_field, - self.compare_fields, - self.usas_source_table_name, - self.broker_source_table_name, - self.baker_table, - deepcopy(self.baker_kwargs), - [], - expected_initial_transaction_fpds, - ) - - @mark.django_db(transaction=True) - def test_unexpected_paths_source_tables_only( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, _InitialRunWithPostgresLoader.expected_initial_transaction_fpds - ) - transaction_fabs_fpds_core.unexpected_paths_source_tables_only_test_core() - - @mark.django_db(transaction=True) - def test_unexpected_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fpds - ) - transaction_fabs_fpds_core.unexpected_paths_no_pg_loader_test_core() - - @mark.django_db(transaction=True) - def test_happy_paths_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - transaction_fabs_fpds_core = self._generate_transaction_fabs_fpds_core( - spark, s3_unittest_data_bucket, InitialRunNoPostgresLoader.initial_transaction_fpds - ) - transaction_fabs_fpds_core.happy_paths_no_pg_loader_test_core( - InitialRunNoPostgresLoader.initial_transaction_fpds, - (3, 4), - self.expected_transaction_id_lookup_append, - self.expected_transaction_fabs_fpds_append, - ) diff --git a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py b/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py deleted file mode 100644 index 978cfa345d..0000000000 --- 
a/usaspending_api/etl/tests/integration/test_load_transactions_in_delta_lookups.py +++ /dev/null @@ -1,1526 +0,0 @@ -"""Automated Unit Tests for the loading of transaction and award tables in Delta Lake. - -NOTE: Uses Pytest Fixtures from immediate parent conftest.py: usaspending_api/etl/tests/conftest.py -""" - -import dateutil -import re -import pyspark - -from copy import deepcopy -from dataclasses import dataclass -from datetime import datetime, timedelta, timezone -from django.db import connection -from django.core.management import call_command -from model_bakery import baker -from pyspark.sql import SparkSession -from pytest import mark, raises -from typing import Any, Dict, Optional, Sequence -from unittest.mock import patch - -from usaspending_api.broker.helpers.last_load_date import get_last_load_date, update_last_load_date -from usaspending_api.common.helpers.spark_helpers import load_dict_to_delta_table -from usaspending_api.etl.tests.integration.test_load_to_from_delta import load_delta_table_from_postgres, equal_datasets -from usaspending_api.transactions.delta_models.transaction_fabs import TRANSACTION_FABS_COLUMNS -from usaspending_api.transactions.delta_models.transaction_fpds import TRANSACTION_FPDS_COLUMNS -from usaspending_api.transactions.delta_models.transaction_normalized import TRANSACTION_NORMALIZED_COLUMNS -from usaspending_api.config import CONFIG -from usaspending_api.etl.management.commands.load_table_to_delta import TABLE_SPEC - -_BEGINNING_OF_TIME = datetime(1970, 1, 1, tzinfo=timezone.utc) -_INITIAL_DATETIME = datetime(2022, 10, 31, tzinfo=timezone.utc) -_INITIAL_SOURCE_TABLE_LOAD_DATETIME = _INITIAL_DATETIME + timedelta(hours=12) -_INITIAL_ASSISTS = [ - { - "published_fabs_id": 1, - "afa_generated_unique": "award_assist_0001_trans_0001", - "action_date": _INITIAL_DATETIME.isoformat(), - "created_at": _INITIAL_DATETIME, - "updated_at": _INITIAL_DATETIME, - "is_active": True, - "unique_award_key": "award_assist_0001", - }, - { - 
"published_fabs_id": 2, - "afa_generated_unique": "award_assist_0002_trans_0001", - "action_date": _INITIAL_DATETIME.isoformat(), - "created_at": _INITIAL_DATETIME, - "updated_at": _INITIAL_DATETIME, - "is_active": True, - "unique_award_key": "award_assist_0002", - }, - { - "published_fabs_id": 3, - "afa_generated_unique": "award_assist_0002_trans_0002", - # Deliberately formatting this action_date somewhat unusually. - "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), - "created_at": _INITIAL_DATETIME, - "updated_at": _INITIAL_DATETIME, - "is_active": True, - "unique_award_key": "award_assist_0002", - }, - { - "published_fabs_id": 4, - "afa_generated_unique": "award_assist_0003_trans_0001", - # Deliberately formatting this action_date somewhat unusually. - "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), - "created_at": _INITIAL_DATETIME, - "updated_at": _INITIAL_DATETIME, - "is_active": True, - "unique_award_key": "award_assist_0003", - }, - { - "published_fabs_id": 5, - "afa_generated_unique": "award_assist_0003_trans_0002", - "action_date": _INITIAL_DATETIME.isoformat(), - "created_at": _INITIAL_DATETIME, - "updated_at": _INITIAL_DATETIME, - "is_active": True, - "unique_award_key": "award_assist_0003", - }, -] -_INITIAL_PROCURES = [ - { - "detached_award_procurement_id": 1, - "detached_award_proc_unique": "award_procure_0001_trans_0001", - "action_date": _INITIAL_DATETIME.isoformat(), - "created_at": _INITIAL_DATETIME, - "updated_at": _INITIAL_DATETIME, - "unique_award_key": "award_procure_0001", - }, - { - "detached_award_procurement_id": 2, - "detached_award_proc_unique": "award_procure_0002_trans_0001", - "action_date": _INITIAL_DATETIME.isoformat(), - "created_at": _INITIAL_DATETIME, - "updated_at": _INITIAL_DATETIME, - "unique_award_key": "award_procure_0002", - }, - { - "detached_award_procurement_id": 3, - "detached_award_proc_unique": "award_procure_0002_trans_0002", - # Deliberately formatting this action_date somewhat unusually. 
- "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), - "created_at": _INITIAL_DATETIME, - "updated_at": _INITIAL_DATETIME, - "unique_award_key": "award_procure_0002", - }, - { - "detached_award_procurement_id": 4, - "detached_award_proc_unique": "award_procure_0003_trans_0001", - # Deliberately formatting this action_date somewhat unusually. - "action_date": _INITIAL_DATETIME.strftime("%Y%m%d"), - "created_at": _INITIAL_DATETIME, - "updated_at": _INITIAL_DATETIME, - "unique_award_key": "award_procure_0003", - }, - { - "detached_award_procurement_id": 5, - "detached_award_proc_unique": "award_procure_0003_trans_0002", - "action_date": _INITIAL_DATETIME.isoformat(), - "created_at": _INITIAL_DATETIME, - "updated_at": _INITIAL_DATETIME, - "unique_award_key": "award_procure_0003", - }, -] -_NEW_ASSIST = { - "published_fabs_id": 6, - "afa_generated_unique": "award_assist_0004_trans_0001", - "is_active": True, - "unique_award_key": "award_assist_0004", -} -_NEW_PROCURE = { - "detached_award_procurement_id": 6, - "detached_award_proc_unique": "award_procure_0004_trans_0001", - "unique_award_key": "award_procure_0004", -} - - -@dataclass -class _TableLoadInfo: - spark: SparkSession - table_name: str - data: Sequence[Dict[str, Any]] - overwrite: Optional[bool] = False - - -def _load_tables_to_delta(s3_data_bucket, load_source_tables=True, load_other_raw_tables=None): - if load_source_tables: - load_delta_table_from_postgres("published_fabs", s3_data_bucket) - load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) - - if load_other_raw_tables: - for item in load_other_raw_tables: - if isinstance(item, _TableLoadInfo): - load_dict_to_delta_table(item.spark, s3_data_bucket, "raw", item.table_name, item.data, item.overwrite) - else: - load_delta_table_from_postgres(item, s3_data_bucket) - - -class TestInitialRun: - @staticmethod - def initial_run(s3_data_bucket, load_source_tables=True, load_other_raw_tables=None, initial_copy=True): - 
_load_tables_to_delta(s3_data_bucket, load_source_tables, load_other_raw_tables) - call_params = ["load_transactions_in_delta", "--etl-level", "initial_run", "--spark-s3-bucket", s3_data_bucket] - if not initial_copy: - call_params.append("--no-initial-copy") - call_command(*call_params) - - @staticmethod - def verify_transaction_ids(spark, expected_transaction_id_lookup, expected_last_load=None): - # Verify transaction_id_lookup table - query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" - delta_data = [row.asDict() for row in spark.sql(query).collect()] - assert equal_datasets(expected_transaction_id_lookup, delta_data, "") - - # Verify max transaction id - with connection.cursor() as cursor: - cursor.execute("SELECT nextval('transaction_id_seq')") - # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id - max_transaction_id = cursor.fetchone()[0] - if expected_transaction_id_lookup: - assert max_transaction_id == max( - [transaction["transaction_id"] for transaction in expected_transaction_id_lookup] - ) - else: - assert max_transaction_id == 1 - - # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false - # so that the next call to nextval() will return the same value. 
- with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)") - - @staticmethod - def verify_award_ids(spark, expected_award_id_lookup, expected_last_load=None): - # Verify award_id_lookup table - query = "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" - delta_data = [row.asDict() for row in spark.sql(query).collect()] - assert equal_datasets(expected_award_id_lookup, delta_data, "") - - # Verify max award id - with connection.cursor() as cursor: - cursor.execute("SELECT nextval('award_id_seq')") - # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id - max_award_id = cursor.fetchone()[0] - if expected_award_id_lookup: - assert max_award_id == max([award["award_id"] for award in expected_award_id_lookup]) - else: - assert max_award_id == 1 - - # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false - # so that the next call to nextval() will return the same value. 
- with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('award_id_seq', {max_award_id}, false)") - - @staticmethod - def verify_lookup_info( - spark, - expected_transaction_id_lookup, - expected_award_id_lookup, - expected_last_load_transaction_id_lookup=None, - expected_load_load_award_id_lookup=None, - ): - TestInitialRun.verify_transaction_ids( - spark, expected_transaction_id_lookup, expected_last_load_transaction_id_lookup - ) - TestInitialRun.verify_award_ids(spark, expected_award_id_lookup, expected_load_load_award_id_lookup) - - @staticmethod - def verify_raw_vs_int_tables(spark, table_name, col_names): - # Make sure the raw and int versions of the given table match - result = spark.sql( - f""" - SELECT {', '.join(col_names)} FROM int.{table_name} - MINUS - SELECT {', '.join(col_names)} FROM raw.{table_name} - """ - ).collect() - assert len(result) == 0 - - result = spark.sql( - f""" - SELECT {', '.join(col_names)} FROM raw.{table_name} - MINUS - SELECT {', '.join(col_names)} FROM int.{table_name} - """ - ).collect() - assert len(result) == 0 - - @staticmethod - def verify( - spark, - expected_transaction_id_lookup, - expected_award_id_lookup, - expected_normalized_count=0, - expected_fabs_count=0, - expected_fpds_count=0, - expected_last_load_transaction_id_lookup=None, - expected_last_load_award_id_lookup=None, - expected_last_load_transaction_normalized=None, - expected_last_load_transaction_fabs=None, - expected_last_load_transaction_fpds=None, - ): - TestInitialRun.verify_lookup_info( - spark, - expected_transaction_id_lookup, - expected_award_id_lookup, - expected_last_load_transaction_id_lookup, - expected_last_load_award_id_lookup, - ) - - # int.award_ids_delete_modified should exist, but be empty - actual_count = spark.sql("SELECT COUNT(*) AS count from int.award_ids_delete_modified").collect()[0]["count"] - assert actual_count == 0 - - # Make sure int.transaction_[normalized,fabs,fpds] tables have been created and have the expected 
sizes. - for table_name, expected_count, expected_last_load, col_names in zip( - (f"transaction_{t}" for t in ("normalized", "fabs", "fpds")), - (expected_normalized_count, expected_fabs_count, expected_fpds_count), - ( - expected_last_load_transaction_normalized, - expected_last_load_transaction_fabs, - expected_last_load_transaction_fpds, - ), - (list(TRANSACTION_NORMALIZED_COLUMNS), TRANSACTION_FABS_COLUMNS, TRANSACTION_FPDS_COLUMNS), - ): - actual_count = spark.sql(f"SELECT COUNT(*) AS count from int.{table_name}").collect()[0]["count"] - assert actual_count == expected_count - - if expected_count > 0: - # Only verify raw vs int tables if raw table exists - try: - spark.sql(f"SELECT 1 FROM raw.{table_name}") - except pyspark.sql.utils.AnalysisException as e: - if re.match( - rf"^\[TABLE_OR_VIEW_NOT_FOUND\] The table or view `raw`\.`{table_name}` cannot be found\..*$", - str(e), - re.MULTILINE, - ): - pass - else: - raise e - else: - TestInitialRun.verify_raw_vs_int_tables(spark, table_name, col_names) - - @mark.django_db(transaction=True) - def test_edge_cases_using_only_source_tables(self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db): - # Setup some source tables without data, this test does not require these tables to be populated - raw_db = "raw" - spark.sql(f"create database if not exists {raw_db};") - spark.sql(f"use {raw_db};") - spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( - DESTINATION_TABLE="published_fabs", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( - DESTINATION_TABLE="detached_award_procurement", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - call_command( - "load_transactions_in_delta", - "--etl-level", - "initial_run", - "--spark-s3-bucket", - 
s3_unittest_data_bucket, - "--no-initial-copy", - ) - kwargs = { - "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, - "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, - "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, - } - TestInitialRun.verify(spark, [], [], **kwargs) - - -# Even though all the tests that use the Postgres loader have been removed, these variables are still -# needed for some tests. -class _InitialRunWithPostgresLoader: - expected_initial_transaction_id_lookup = [ - { - "transaction_id": id, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[id - 1]["afa_generated_unique"].upper(), - } - for id in range(1, len(_INITIAL_ASSISTS) + 1) - ] + [ - { - "transaction_id": id, - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[id - 6]["detached_award_proc_unique"].upper(), - } - for id in range(len(_INITIAL_ASSISTS) + 1, len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1) - ] - - expected_initial_award_id_lookup = [ - { - "award_id": int(assist["unique_award_key"].split("_")[-1]), - "is_fpds": False, - "transaction_unique_id": assist["afa_generated_unique"].upper(), - "generated_unique_award_id": assist["unique_award_key"].upper(), - } - for assist in _INITIAL_ASSISTS - ] + [ - { - "award_id": ( - int(procure["unique_award_key"].split("_")[-1]) - + max([int(assist["unique_award_key"].split("_")[-1]) for assist in _INITIAL_ASSISTS]) - ), - "is_fpds": True, - "transaction_unique_id": procure["detached_award_proc_unique"].upper(), - "generated_unique_award_id": procure["unique_award_key"].upper(), - } - for procure in _INITIAL_PROCURES - ] - - expected_initial_transaction_fabs = [ - { - **assist, - "action_date": dateutil.parser.parse(assist["action_date"]).date().isoformat(), - "afa_generated_unique": assist["afa_generated_unique"].upper(), - "transaction_id": 
assist["published_fabs_id"], - "unique_award_key": assist["unique_award_key"].upper(), - } - for assist in _INITIAL_ASSISTS - ] - - expected_initial_transaction_fpds = [ - { - **procure, - "action_date": dateutil.parser.parse(procure["action_date"]).date().isoformat(), - "detached_award_proc_unique": procure["detached_award_proc_unique"].upper(), - "transaction_id": procure["detached_award_procurement_id"] + len(_INITIAL_ASSISTS), - "unique_award_key": procure["unique_award_key"].upper(), - } - for procure in _INITIAL_PROCURES - ] - - -class TestInitialRunNoPostgresLoader: - expected_initial_transaction_id_lookup = [ - { - "transaction_id": 1, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), - }, - { - "transaction_id": 2, - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), - }, - { - "transaction_id": 3, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), - }, - { - "transaction_id": 4, - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), - }, - { - "transaction_id": 5, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[2]["afa_generated_unique"].upper(), - }, - { - "transaction_id": 6, - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[2]["detached_award_proc_unique"].upper(), - }, - { - "transaction_id": 7, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), - }, - { - "transaction_id": 8, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), - }, - { - "transaction_id": 9, - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), - }, - { - "transaction_id": 10, - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), - }, - ] - - 
expected_initial_award_id_lookup = [ - { - "award_id": 1, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[0]["unique_award_key"].upper(), - }, - { - "award_id": 2, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[1]["unique_award_key"].upper(), - }, - { - "award_id": 2, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[2]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[2]["unique_award_key"].upper(), - }, - { - "award_id": 3, - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[0]["unique_award_key"].upper(), - }, - { - "award_id": 4, - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[1]["unique_award_key"].upper(), - }, - { - "award_id": 4, - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[2]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[2]["unique_award_key"].upper(), - }, - { - "award_id": 5, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[3]["unique_award_key"].upper(), - }, - { - "award_id": 5, - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), - "generated_unique_award_id": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), - }, - { - "award_id": 6, - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[3]["unique_award_key"].upper(), - }, - { - "award_id": 6, - "is_fpds": True, - "transaction_unique_id": 
_INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _INITIAL_PROCURES[4]["unique_award_key"].upper(), - }, - ] - - initial_award_trans_norm_update_create_date = _INITIAL_DATETIME + timedelta(days=1) - - initial_awards = [ - { - "id": 1, - "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_ASSISTS[0]["unique_award_key"].upper(), - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), - "subaward_count": 0, - }, - { - "id": 2, - "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_ASSISTS[1]["unique_award_key"].upper(), - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), - "subaward_count": 0, - }, - { - "id": 3, - "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_PROCURES[0]["unique_award_key"].upper(), - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), - "subaward_count": 0, - }, - { - "id": 4, - "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_PROCURES[1]["unique_award_key"].upper(), - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), - "subaward_count": 0, - }, - { - "id": 5, - "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_ASSISTS[3]["unique_award_key"].upper(), - "is_fpds": False, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), - "subaward_count": 0, - }, - { - "id": 6, - "update_date": initial_award_trans_norm_update_create_date, - "generated_unique_award_id": _INITIAL_PROCURES[3]["unique_award_key"].upper(), - "is_fpds": True, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), - "subaward_count": 0, - }, - ] - - 
initial_transaction_normalized = [ - { - "id": 1, - "award_id": 1, - "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[0]["action_date"]).date(), - "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[0]["afa_generated_unique"].upper(), - "update_date": initial_award_trans_norm_update_create_date, - "is_fpds": False, - "unique_award_key": _INITIAL_ASSISTS[0]["unique_award_key"].upper(), - }, - { - "id": 2, - "award_id": 3, - "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[0]["action_date"]).date(), - "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[0]["detached_award_proc_unique"].upper(), - "update_date": initial_award_trans_norm_update_create_date, - "is_fpds": True, - "unique_award_key": _INITIAL_PROCURES[0]["unique_award_key"].upper(), - }, - { - "id": 3, - "award_id": 2, - "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[1]["action_date"]).date(), - "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[1]["afa_generated_unique"].upper(), - "update_date": initial_award_trans_norm_update_create_date, - "is_fpds": False, - "unique_award_key": _INITIAL_ASSISTS[1]["unique_award_key"].upper(), - }, - { - "id": 4, - "award_id": 4, - "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[1]["action_date"]).date(), - "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[1]["detached_award_proc_unique"].upper(), - "update_date": initial_award_trans_norm_update_create_date, - "is_fpds": True, - "unique_award_key": _INITIAL_PROCURES[1]["unique_award_key"].upper(), - }, - { - "id": 5, - "award_id": 2, - "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[2]["action_date"]).date(), - "create_date": 
initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[2]["afa_generated_unique"].upper(), - "update_date": initial_award_trans_norm_update_create_date, - "is_fpds": False, - "unique_award_key": _INITIAL_ASSISTS[2]["unique_award_key"].upper(), - }, - { - "id": 6, - "award_id": 4, - "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[2]["action_date"]).date(), - "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[2]["detached_award_proc_unique"].upper(), - "update_date": initial_award_trans_norm_update_create_date, - "is_fpds": True, - "unique_award_key": _INITIAL_PROCURES[2]["unique_award_key"].upper(), - }, - { - "id": 7, - "award_id": 5, - "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[3]["action_date"]).date(), - "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[3]["afa_generated_unique"].upper(), - "update_date": initial_award_trans_norm_update_create_date, - "is_fpds": False, - "unique_award_key": _INITIAL_ASSISTS[3]["unique_award_key"].upper(), - }, - { - "id": 8, - "award_id": 5, - "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[4]["action_date"]).date(), - "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), - "update_date": initial_award_trans_norm_update_create_date, - "is_fpds": False, - "unique_award_key": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), - }, - { - "id": 9, - "award_id": 6, - "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]).date(), - "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), - "update_date": initial_award_trans_norm_update_create_date, - "is_fpds": True, - 
"unique_award_key": _INITIAL_PROCURES[3]["unique_award_key"].upper(), - }, - { - "id": 10, - "award_id": 6, - "business_categories": [], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]).date(), - "create_date": initial_award_trans_norm_update_create_date, - "transaction_unique_id": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), - "update_date": initial_award_trans_norm_update_create_date, - "is_fpds": True, - "unique_award_key": _INITIAL_PROCURES[4]["unique_award_key"].upper(), - }, - ] - - initial_transaction_fabs = [ - { - **assist, - "action_date": dateutil.parser.parse(assist["action_date"]).date().isoformat(), - "afa_generated_unique": assist["afa_generated_unique"].upper(), - "transaction_id": (assist["published_fabs_id"] - 1) * 2 + 1, - "unique_award_key": assist["unique_award_key"].upper(), - } - for assist in _INITIAL_ASSISTS[:4] - ] + [ - { - **_INITIAL_ASSISTS[4], - "action_date": dateutil.parser.parse(_INITIAL_ASSISTS[4]["action_date"]).date().isoformat(), - "afa_generated_unique": _INITIAL_ASSISTS[4]["afa_generated_unique"].upper(), - "transaction_id": 8, - "unique_award_key": _INITIAL_ASSISTS[4]["unique_award_key"].upper(), - } - ] - - initial_transaction_fpds = [ - { - **procure, - "action_date": dateutil.parser.parse(procure["action_date"]).date().isoformat(), - "detached_award_proc_unique": procure["detached_award_proc_unique"].upper(), - "transaction_id": procure["detached_award_procurement_id"] * 2, - "unique_award_key": procure["unique_award_key"].upper(), - } - for procure in _INITIAL_PROCURES[:3] - ] + [ - { - **_INITIAL_PROCURES[3], - "action_date": dateutil.parser.parse(_INITIAL_PROCURES[3]["action_date"]).date().isoformat(), - "detached_award_proc_unique": _INITIAL_PROCURES[3]["detached_award_proc_unique"].upper(), - "transaction_id": 9, - "unique_award_key": _INITIAL_PROCURES[3]["unique_award_key"].upper(), - }, - { - **_INITIAL_PROCURES[4], - "action_date": 
dateutil.parser.parse(_INITIAL_PROCURES[4]["action_date"]).date().isoformat(), - "detached_award_proc_unique": _INITIAL_PROCURES[4]["detached_award_proc_unique"].upper(), - "transaction_id": 10, - "unique_award_key": _INITIAL_PROCURES[4]["unique_award_key"].upper(), - }, - ] - - # This test will only load the source tables from postgres, and NOT use the Postgres transaction loader - # to populate any other Delta tables, so can only test for NULLs originating in Delta. - @mark.django_db(transaction=True) - @patch("usaspending_api.etl.management.commands.load_transactions_in_delta.Command._insert_orphaned_transactions") - def test_nulls_in_trans_norm_unique_award_key_from_delta( - self, - orphaned_txns_patch, - spark, - s3_unittest_data_bucket, - hive_unittest_metastore_db, - _populate_initial_source_tables_pg, - ): - raw_db = "raw" - spark.sql(f"create database if not exists {raw_db};") - spark.sql(f"use {raw_db};") - spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( - DESTINATION_TABLE="published_fabs", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( - DESTINATION_TABLE="detached_award_procurement", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - TABLE_SPEC["transaction_normalized"]["delta_table_create_sql"].format( - DESTINATION_TABLE="transaction_normalized", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - """ - INSERT INTO raw.transaction_normalized - VALUES('2022-10-31' - , NULL - , NULL - , 5 - , NULL - , ARRAY() - , NULL - , '2022-11-01T00:00:00+00:00' - , NULL - , NULL - , NULL - , NULL - , NULL - , NULL - , 5 - , NULL - , TRUE - , NULL - , NULL - , NULL - , NULL - , NULL - 
, NULL - , 'AWARD_ASSIST_0002_TRANS_0002' - , NULL - , NULL - , NULL - , '2022-11-01T00:00:00+00:00' - , NULL - ) - """ - ) - - with raises(ValueError, match="Found 1 NULL in 'unique_award_key' in table raw.transaction_normalized!"): - call_command( - "load_transactions_in_delta", "--etl-level", "initial_run", "--spark-s3-bucket", s3_unittest_data_bucket - ) - - spark.sql( - """ - INSERT INTO raw.transaction_normalized - VALUES('2022-10-31' - , NULL - , NULL - , 6 - , NULL - , ARRAY() - , NULL - , '2022-11-01T00:00:00+00:00' - , NULL - , NULL - , NULL - , NULL - , NULL - , NULL - , 6 - , NULL - , TRUE - , NULL - , NULL - , NULL - , NULL - , NULL - , NULL - , 'AWARD_PROCURE_0002_TRANS_0002' - , NULL - , NULL - , NULL - , '2022-11-01T00:00:00+00:00' - , NULL - ) - """ - ) - - with raises(ValueError, match="Found 2 NULLs in 'unique_award_key' in table raw.transaction_normalized!"): - call_command( - "load_transactions_in_delta", "--etl-level", "initial_run", "--spark-s3-bucket", s3_unittest_data_bucket - ) - - @mark.django_db(transaction=True) - def test_happy_path_scenarios( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards - # from expected data when making initial run - load_other_raw_tables = [ - _TableLoadInfo(spark, "transaction_normalized", self.initial_transaction_normalized), - _TableLoadInfo(spark, "awards", self.initial_awards), - ] - # Setup some source tables with data, without loading these Delta Tables from Postgres - # for efficiency reasons. 
- raw_db = "raw" - spark.sql(f"create database if not exists {raw_db};") - spark.sql(f"use {raw_db};") - spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( - DESTINATION_TABLE="published_fabs", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( - DESTINATION_TABLE="detached_award_procurement", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - load_dict_to_delta_table( - spark, - s3_unittest_data_bucket, - "raw", - "detached_award_procurement", - _INITIAL_PROCURES, - True, - ) - load_dict_to_delta_table( - spark, - s3_unittest_data_bucket, - "raw", - "published_fabs", - _INITIAL_ASSISTS, - True, - ) - TestInitialRun.initial_run( - s3_unittest_data_bucket, - load_source_tables=False, - load_other_raw_tables=load_other_raw_tables, - initial_copy=False, - ) - kwargs = { - "expected_last_load_transaction_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_award_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, - } - TestInitialRun.verify( - spark, self.expected_initial_transaction_id_lookup, self.expected_initial_award_id_lookup, **kwargs - ) - - # 2. 
Call initial_run with initial-copy, and have all raw tables populated - - # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards - # from expected data when making initial run - load_other_raw_tables = [ - _TableLoadInfo(spark, "transaction_fabs", self.initial_transaction_fabs), - _TableLoadInfo(spark, "transaction_fpds", self.initial_transaction_fpds), - ] - # Don't call Postgres loader or re-load the source tables, though. - TestInitialRun.initial_run(s3_unittest_data_bucket, False, load_other_raw_tables) - kwargs["expected_last_load_transaction_normalized"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - kwargs["expected_last_load_transaction_fabs"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - kwargs["expected_last_load_transaction_fpds"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - TestInitialRun.verify( - spark, - self.expected_initial_transaction_id_lookup, - self.expected_initial_award_id_lookup, - len(self.initial_transaction_normalized), - len(self.initial_transaction_fabs), - len(self.initial_transaction_fpds), - **kwargs, - ) - - -class TestTransactionIdLookup: - @mark.django_db(transaction=True) - def test_unexpected_paths( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - # Setup some source tables with data, without loading these Delta Tables from Postgres - # for efficiency reasons. 
- raw_db = "raw" - spark.sql(f"create database if not exists {raw_db};") - spark.sql(f"use {raw_db};") - spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( - DESTINATION_TABLE="published_fabs", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( - DESTINATION_TABLE="detached_award_procurement", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - load_dict_to_delta_table( - spark, - s3_unittest_data_bucket, - "raw", - "detached_award_procurement", - _INITIAL_PROCURES, - True, - ) - load_dict_to_delta_table( - spark, - s3_unittest_data_bucket, - "raw", - "published_fabs", - _INITIAL_ASSISTS, - True, - ) - - # 1. Test calling load_transactions_in_delta with the etl-level set to the proper sequencing of - # initial_run, then transaction_id_lookup. However, call initial_run with blank raw.transaction_normalized - # and raw.awards tables. - - # First, create blank raw.transaction_normalized and raw.awards tables - spark.sql( - TABLE_SPEC["transaction_normalized"]["delta_table_create_sql"].format( - DESTINATION_TABLE="transaction_normalized", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - TABLE_SPEC["awards"]["delta_table_create_sql"].format( - DESTINATION_TABLE="awards", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - - # Then, call load_transactions_in_delta with etl-level of initial_run and verify. - # Don't reload the source tables, and don't do initial copy of transaction tables, though. 
- TestInitialRun.initial_run(s3_unittest_data_bucket, load_source_tables=False, initial_copy=False) - kwargs = { - "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, - "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, - "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, - } - TestInitialRun.verify(spark, [], [], **kwargs) - - # Then, call load_transactions_in_delta with etl-level of transaction_id_lookup. - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") - - # The expected transaction_id_lookup table should be the same as in _InitialRunWithPostgresLoader, - # but all of the transaction ids should be 1 larger than expected there. - expected_transaction_id_lookup = deepcopy(_InitialRunWithPostgresLoader.expected_initial_transaction_id_lookup) - for item in expected_transaction_id_lookup: - item["transaction_id"] += 1 - # Also, the last load date for the transaction_id_lookup table should be updated to the date of the - # initial loads. - kwargs["expected_last_load_transaction_id_lookup"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - TestInitialRun.verify(spark, expected_transaction_id_lookup, [], **kwargs) - - @staticmethod - def _happy_path_test_core( - spark, - s3_data_bucket, - load_other_raw_tables, - expected_initial_transaction_id_lookup, - expected_initial_award_id_lookup, - expected_transaction_id_lookup_pops, - ): - # First, setup some source tables with data, without loading these Delta Tables from Postgres - # for efficiency reasons. 
- raw_db = "raw" - spark.sql(f"create database if not exists {raw_db};") - spark.sql(f"use {raw_db};") - spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( - DESTINATION_TABLE="published_fabs", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( - DESTINATION_TABLE="detached_award_procurement", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - load_dict_to_delta_table( - spark, - s3_data_bucket, - "raw", - "detached_award_procurement", - _INITIAL_PROCURES, - True, - ) - load_dict_to_delta_table( - spark, - s3_data_bucket, - "raw", - "published_fabs", - _INITIAL_ASSISTS, - True, - ) - # Trigger initial run of load transactions in delta. This step is required as it creates various data sources. - TestInitialRun.initial_run( - s3_data_bucket, load_source_tables=False, load_other_raw_tables=load_other_raw_tables, initial_copy=False - ) - - # 1. Test deleting the transaction(s) with the last transaction ID(s) from the appropriate raw table, - # followed by a call to load_transaction_in_delta with etl-level of transaction_id_lookup - # 2. Test for a single inserted transaction, and another call to load_transaction_in_delta with etl-level of - # transaction_id_lookup. 
- - spark.sql( - """ - DELETE FROM raw.detached_award_procurement - WHERE detached_award_procurement_id = 4 OR detached_award_procurement_id = 5 - """ - ) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") - - # Verify transaction_id_lookup table - query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" - delta_data = [row.asDict() for row in spark.sql(query).collect()] - - expected_transaction_id_lookup = deepcopy(expected_initial_transaction_id_lookup) - expected_transaction_id_lookup.pop() - expected_transaction_id_lookup.pop() - assert equal_datasets(expected_transaction_id_lookup, delta_data, "") - - # Also, make sure transaction_id_seq hasn't gone backwards - with connection.cursor() as cursor: - cursor.execute("SELECT nextval('transaction_id_seq')") - # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id - max_transaction_id = cursor.fetchone()[0] - assert max_transaction_id == (len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES)) - - # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false - # so that the next call to nextval() will return the same value as previously. - with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)") - - # 3. Test for a single inserted transaction, and another call to load_transaction_in_delta with etl-level of - # transaction_id_lookup. - - # Since changes to the source tables will go to the Postgres table first, use model baker to add new rows to - # Postgres table, and then push the updated table to Delta. 
- last_assist_load_datetime = datetime.now(timezone.utc) - insert_datetime = last_assist_load_datetime + timedelta(minutes=-15) - assist = deepcopy(_NEW_ASSIST) - assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} - ) - baker.make("transactions.SourceAssistanceTransaction", **assist) - update_last_load_date("source_assistance_transaction", last_assist_load_datetime) - load_delta_table_from_postgres("published_fabs", s3_data_bucket) - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") - - # Verify transaction_id_lookup table - query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" - delta_data = [row.asDict() for row in spark.sql(query).collect()] - - expected_transaction_id_lookup = deepcopy(expected_initial_transaction_id_lookup) - expected_transaction_id_lookup.pop() - expected_transaction_id_lookup.pop() - - expected_transaction_id_lookup.append( - { - "transaction_id": 11, - "is_fpds": False, - "transaction_unique_id": _NEW_ASSIST["afa_generated_unique"].upper(), - } - ) - - # Verify the data has been loaded and changed correctly - # Although the last load date for the source_assistance_transaction was updated above, the code in - # load_transactions_in_delta takes the minimum last load date of that table and of the - # source_procurement_transaction table, which has not been updated since the initial load of both tables. 
- kwargs = { - "expected_last_load_transaction_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_award_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, - } - TestInitialRun.verify(spark, expected_transaction_id_lookup, expected_initial_award_id_lookup, **kwargs) - - # Also, make sure transaction_id_seq hasn't gone backwards - with connection.cursor() as cursor: - cursor.execute("SELECT nextval('transaction_id_seq')") - # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id - max_transaction_id = cursor.fetchone()[0] - assert max_transaction_id == (len(_INITIAL_ASSISTS) + len(_INITIAL_PROCURES) + 1) # Add one for the insert - - # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false - # so that the next call to nextval() will return the same value as previously. - with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('transaction_id_seq', {max_transaction_id}, false)") - - # 3. Make inserts to and deletes from the raw tables, call load_transaction_in_delta with etl-level of - # transaction_id_lookup, and test that the results are as expected. 
- last_procure_load_datetime = datetime.now(timezone.utc) - insert_datetime = last_procure_load_datetime + timedelta(minutes=-15) - procure = deepcopy(_NEW_PROCURE) - procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} - ) - baker.make("transactions.SourceProcurementTransaction", **procure) - update_last_load_date("source_procurement_transaction", last_procure_load_datetime) - load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) - - spark.sql( - """ - DELETE FROM raw.published_fabs - WHERE published_fabs_id = 2 OR published_fabs_id = 3 - """ - ) - spark.sql( - """ - DELETE FROM raw.detached_award_procurement - WHERE detached_award_procurement_id = 1 - """ - ) - - call_command("load_transactions_in_delta", "--etl-level", "transaction_id_lookup") - - # Verify transaction_id_lookup table - query = "SELECT * FROM int.transaction_id_lookup ORDER BY transaction_id" - delta_data = [row.asDict() for row in spark.sql(query).collect()] - - for pop in expected_transaction_id_lookup_pops: - expected_transaction_id_lookup.pop(pop) - expected_transaction_id_lookup.append( - { - "transaction_id": 12, - "is_fpds": True, - "transaction_unique_id": _NEW_PROCURE["detached_award_proc_unique"].upper(), - } - ) - assert equal_datasets(expected_transaction_id_lookup, delta_data, "") - - assert get_last_load_date("transaction_id_lookup") == last_assist_load_datetime - - @mark.django_db(transaction=True) - def test_happy_path_scenarios_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - # Since we're not using the Postgres transaction loader, load raw.transaction_normalized and raw.awards - # from expected data when making initial run - load_other_raw_tables = [ - _TableLoadInfo( - spark, "transaction_normalized", TestInitialRunNoPostgresLoader.initial_transaction_normalized - ), - _TableLoadInfo(spark, "awards", 
TestInitialRunNoPostgresLoader.initial_awards), - ] - - self._happy_path_test_core( - spark, - s3_unittest_data_bucket, - load_other_raw_tables, - TestInitialRunNoPostgresLoader.expected_initial_transaction_id_lookup, - TestInitialRunNoPostgresLoader.expected_initial_award_id_lookup, - (1, 1, 2), - ) - - -class TestAwardIdLookup: - @mark.django_db(transaction=True) - def test_unexpected_paths( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - # First, setup some source tables with data, without loading these Delta Tables from Postgres - # for efficiency reasons. - raw_db = "raw" - spark.sql(f"create database if not exists {raw_db};") - spark.sql(f"use {raw_db};") - spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( - DESTINATION_TABLE="published_fabs", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( - DESTINATION_TABLE="detached_award_procurement", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - load_dict_to_delta_table( - spark, - s3_unittest_data_bucket, - "raw", - "detached_award_procurement", - _INITIAL_PROCURES, - True, - ) - load_dict_to_delta_table( - spark, - s3_unittest_data_bucket, - "raw", - "published_fabs", - _INITIAL_ASSISTS, - True, - ) - - # 1. Test calling load_transactions_in_delta with the etl-level set to the proper sequencing of - # initial_run, then award_id_lookup. However, call initial_run with blank raw.transaction_normalized - # and raw.awards tables. 
- - # First, create blank raw.transaction_normalized and raw.awards tables - spark.sql( - TABLE_SPEC["transaction_normalized"]["delta_table_create_sql"].format( - DESTINATION_TABLE="transaction_normalized", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - TABLE_SPEC["awards"]["delta_table_create_sql"].format( - DESTINATION_TABLE="awards", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_unittest_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - - # Then, call load_transactions_in_delta with etl-level of initial_run and verify. - # Don't reload the source tables, and don't do initial copy of transaction tables, though. - TestInitialRun.initial_run(s3_unittest_data_bucket, load_source_tables=False, initial_copy=False) - kwargs = { - "expected_last_load_transaction_id_lookup": _BEGINNING_OF_TIME, - "expected_last_load_award_id_lookup": _BEGINNING_OF_TIME, - "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, - } - TestInitialRun.verify(spark, [], [], **kwargs) - - # Then, call load_transactions_in_delta with etl-level of award_id_lookup. - call_command("load_transactions_in_delta", "--etl-level", "award_id_lookup") - - # The expected award_id_lookup table should be the same as in TestInitialRunWithPostgresLoader, - # but all of the award ids should be 1 larger than expected there. - expected_award_id_lookup = deepcopy(_InitialRunWithPostgresLoader.expected_initial_award_id_lookup) - for item in expected_award_id_lookup: - item["award_id"] += 1 - # Also, the last load date for the award_id_lookup table should be updated to the date of the initial loads. 
- kwargs["expected_last_load_award_id_lookup"] = _INITIAL_SOURCE_TABLE_LOAD_DATETIME - TestInitialRun.verify(spark, [], expected_award_id_lookup, **kwargs) - - @staticmethod - def _happy_path_test_core( - spark, - s3_data_bucket, - load_other_raw_tables, - expected_initial_transaction_id_lookup, - expected_initial_award_id_lookup, - expected_award_id_lookup_pops, - partially_deleted_award_id, - ): - # First, setup some source tables with data, without loading these Delta Tables from Postgres - # for efficiency reasons. - raw_db = "raw" - spark.sql(f"create database if not exists {raw_db};") - spark.sql(f"use {raw_db};") - spark.sql( - TABLE_SPEC["published_fabs"]["delta_table_create_sql"].format( - DESTINATION_TABLE="published_fabs", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - spark.sql( - TABLE_SPEC["detached_award_procurement"]["delta_table_create_sql"].format( - DESTINATION_TABLE="detached_award_procurement", - DESTINATION_DATABASE=raw_db, - SPARK_S3_BUCKET=s3_data_bucket, - DELTA_LAKE_S3_PATH=CONFIG.DELTA_LAKE_S3_PATH, - ) - ) - load_dict_to_delta_table( - spark, - s3_data_bucket, - "raw", - "detached_award_procurement", - _INITIAL_PROCURES, - True, - ) - load_dict_to_delta_table( - spark, - s3_data_bucket, - "raw", - "published_fabs", - _INITIAL_ASSISTS, - True, - ) - # Trigger initial run of load transactions in delta. This step is required as it creates various data sources. - TestInitialRun.initial_run( - s3_data_bucket, load_source_tables=False, load_other_raw_tables=load_other_raw_tables, initial_copy=False - ) - - # 1. Test deleting the transactions with the last award ID from the appropriate raw table, - # followed by a call to load_transaction_in_delta with etl-level of award_id_lookup - # 2. Test for a single inserted transaction, and another call to load_transaction_in_delta with etl-level of - # award_id_lookup. 
- - spark.sql( - """ - DELETE FROM raw.detached_award_procurement - WHERE detached_award_procurement_id = 4 OR detached_award_procurement_id = 5 - """ - ) - - # Can't use spark.sql to just insert rows with only values for desired columns (need to specify values for - # all of them), so using model baker to add new rows to Postgres table, and then pushing new table to Delta. - last_assist_load_datetime = datetime.now(timezone.utc) - insert_datetime = last_assist_load_datetime + timedelta(minutes=-15) - assist = deepcopy(_NEW_ASSIST) - assist.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} - ) - baker.make("transactions.SourceAssistanceTransaction", **assist) - update_last_load_date("source_assistance_transaction", last_assist_load_datetime) - load_delta_table_from_postgres("published_fabs", s3_data_bucket) - call_command("load_transactions_in_delta", "--etl-level", "award_id_lookup") - - # Verify award_id_lookup table - query = "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" - delta_data = [row.asDict() for row in spark.sql(query).collect()] - - expected_award_id_lookup = deepcopy(expected_initial_award_id_lookup) - expected_award_id_lookup.pop() - expected_award_id_lookup.pop() - - expected_award_id_lookup.append( - { - "award_id": 7, - "is_fpds": False, - "transaction_unique_id": _NEW_ASSIST["afa_generated_unique"].upper(), - "generated_unique_award_id": _NEW_ASSIST["unique_award_key"].upper(), - } - ) - - # Verify the data has been loaded and changed correctly - # Although the last load date for the source_assistance_transaction was updated above, the code in - # load_transactions_in_delta takes the minimum last load date of that table and of the - # source_procurement_transaction table, which has not been updated since the initial load of both tables. 
- kwargs = { - "expected_last_load_transaction_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_award_id_lookup": _INITIAL_SOURCE_TABLE_LOAD_DATETIME, - "expected_last_load_transaction_normalized": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fabs": _BEGINNING_OF_TIME, - "expected_last_load_transaction_fpds": _BEGINNING_OF_TIME, - } - TestInitialRun.verify(spark, expected_initial_transaction_id_lookup, expected_award_id_lookup, **kwargs) - - # Make sure award_id_seq hasn't gone backwards - with connection.cursor() as cursor: - cursor.execute("SELECT nextval('award_id_seq')") - # Since all calls to setval() set the is_called flag to false, nextval() returns the actual maximum id - max_award_id = cursor.fetchone()[0] - assert ( - max_award_id == max([award["id"] for award in TestInitialRunNoPostgresLoader.initial_awards]) + 1 - ) # Add one for the insert - - # Since this test just called nextval(), need to reset the sequence with the is_called flag set to false - # so that the next call to nextval() will return the same value as previously. - with connection.cursor() as cursor: - cursor.execute(f"SELECT setval('award_id_seq', {max_award_id}, false)") - - # 3. Make inserts to and deletes from the raw tables, call load_transaction_in_delta with etl-level of - # award_id_lookup, and test that the results are as expected, and that int.award_ids_delete_modified has - # tracked the appropriate delete. 
- last_procure_load_datetime = datetime.now(timezone.utc) - insert_datetime = last_procure_load_datetime + timedelta(minutes=-15) - procure = deepcopy(_NEW_PROCURE) - procure.update( - {"action_date": insert_datetime.isoformat(), "created_at": insert_datetime, "updated_at": insert_datetime} - ) - baker.make("transactions.SourceProcurementTransaction", **procure) - update_last_load_date("source_procurement_transaction", last_procure_load_datetime) - load_delta_table_from_postgres("detached_award_procurement", s3_data_bucket) - - spark.sql( - """ - DELETE FROM raw.published_fabs - WHERE published_fabs_id = 2 - """ - ) - spark.sql( - """ - DELETE FROM raw.detached_award_procurement - WHERE detached_award_procurement_id = 1 - """ - ) - - call_command("load_transactions_in_delta", "--etl-level", "award_id_lookup") - - # Verify award_id_lookup table - query = "SELECT * FROM int.award_id_lookup ORDER BY award_id, transaction_unique_id" - delta_data = [row.asDict() for row in spark.sql(query).collect()] - - for pop in expected_award_id_lookup_pops: - expected_award_id_lookup.pop(pop) - expected_award_id_lookup.append( - { - "award_id": 8, - "is_fpds": True, - "transaction_unique_id": _NEW_PROCURE["detached_award_proc_unique"].upper(), - "generated_unique_award_id": _NEW_PROCURE["unique_award_key"].upper(), - } - ) - assert equal_datasets(expected_award_id_lookup, delta_data, "") - - assert get_last_load_date("award_id_lookup") == last_assist_load_datetime - - # Verify award_ids_delete_modified table - query = "SELECT * FROM int.award_ids_delete_modified ORDER BY award_id" - delta_data = [row.asDict() for row in spark.sql(query).collect()] - assert equal_datasets([{"award_id": partially_deleted_award_id}], delta_data, "") - - @mark.django_db(transaction=True) - def test_happy_path_scenarios_no_pg_loader( - self, spark, s3_unittest_data_bucket, hive_unittest_metastore_db, _populate_initial_source_tables_pg - ): - # Since we're not using the Postgres transaction loader, load 
raw.transaction_normalized and raw.awards - # from expected data when making initial run - load_other_raw_tables = [ - _TableLoadInfo( - spark, "transaction_normalized", TestInitialRunNoPostgresLoader.initial_transaction_normalized - ), - _TableLoadInfo(spark, "awards", TestInitialRunNoPostgresLoader.initial_awards), - ] - - self._happy_path_test_core( - spark, - s3_unittest_data_bucket, - load_other_raw_tables, - TestInitialRunNoPostgresLoader.expected_initial_transaction_id_lookup, - TestInitialRunNoPostgresLoader.expected_initial_award_id_lookup, - (3, 1), - 2, - ) diff --git a/usaspending_api/etl/tests/unit/test_load_transactions_in_delta.py b/usaspending_api/etl/tests/unit/test_load_transactions_in_delta.py deleted file mode 100644 index 5665174e54..0000000000 --- a/usaspending_api/etl/tests/unit/test_load_transactions_in_delta.py +++ /dev/null @@ -1,50 +0,0 @@ -from mock import MagicMock, Mock, patch -from pytest import raises -from usaspending_api.etl.management.commands.load_transactions_in_delta import Command - - -def _setup_spark_table_exists_mock(return_value): - spark_mock = Mock() - catalog_mock = Mock() - spark_mock._jsparkSession.catalog.return_value = catalog_mock - catalog_mock.tableExists.return_value = return_value - return spark_mock - - -def _setup_spark_mock(): - # Weird mock to get around the 'with self.prepare_spark()' statement - spark_mock = MagicMock() - spark_mock.__enter__.return_value = None # This can be any value - return spark_mock - - -@patch("usaspending_api.etl.management.commands.load_transactions_in_delta.get_earliest_load_date", return_value=None) -@patch( - "usaspending_api.etl.management.commands.load_transactions_in_delta.Command.prepare_spark", - new_callable=_setup_spark_mock, -) -def test_delete_records_sql_throws_exception(patch_prepare_spark, patch_load_date): - command = Command() - # This is needed along with 'prepare_spark' being patched to get around spark issues - command.spark = 
_setup_spark_table_exists_mock(return_value=False) - test_options = {"spark_s3_bucket": "some_bucket", "no_initial_copy": False} - - test_options["etl_level"] = "award_id_lookup" - with raises(Exception, match="Table: int.award_id_lookup does not exist."): - command.handle(**test_options) - - test_options["etl_level"] = "transaction_id_lookup" - with raises(Exception, match="Table: int.transaction_id_lookup does not exist."): - command.handle(**test_options) - - test_options["etl_level"] = "transaction_fabs" - with raises(Exception, match="Table: int.transaction_fabs does not exist."): - command.handle(**test_options) - - test_options["etl_level"] = "transaction_fpds" - with raises(Exception, match="Table: int.transaction_fpds does not exist."): - command.handle(**test_options) - - test_options["etl_level"] = "awards" - with raises(Exception, match="Table: int.awards does not exist."): - command.handle(**test_options) diff --git a/usaspending_api/etl/transaction_delta_loaders/__init__.py b/usaspending_api/etl/transaction_delta_loaders/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/usaspending_api/etl/transaction_delta_loaders/context_managers.py b/usaspending_api/etl/transaction_delta_loaders/context_managers.py new file mode 100644 index 0000000000..cda8cc3003 --- /dev/null +++ b/usaspending_api/etl/transaction_delta_loaders/context_managers.py @@ -0,0 +1,44 @@ +from contextlib import contextmanager +from typing import Generator + +from pyspark.sql import SparkSession +from pyspark.sql.types import ArrayType, StringType +from usaspending_api.broker.helpers.get_business_categories import ( + get_business_categories_fabs, + get_business_categories_fpds, +) +from usaspending_api.common.helpers.spark_helpers import ( + configure_spark_session, + get_active_spark_session, +) + + +@contextmanager +def prepare_spark() -> Generator[SparkSession, None, None]: + extra_conf = { + "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension", + 
"spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog", + "spark.sql.parquet.datetimeRebaseModeInWrite": "LEGACY", # for dates at/before 1900 + "spark.sql.parquet.int96RebaseModeInWrite": "LEGACY", # for timestamps at/before 1900 + "spark.sql.jsonGenerator.ignoreNullFields": "false", # keep nulls in our json + } + + # Create the Spark Session + spark = get_active_spark_session() + spark_created_by_command = False + if not spark: + spark_created_by_command = True + spark = configure_spark_session(**extra_conf, spark_context=spark) + + # Create UDFs for Business Categories + spark.udf.register( + name="get_business_categories_fabs", f=get_business_categories_fabs, returnType=ArrayType(StringType()) + ) + spark.udf.register( + name="get_business_categories_fpds", f=get_business_categories_fpds, returnType=ArrayType(StringType()) + ) + + yield spark + + if spark_created_by_command: + spark.stop() diff --git a/usaspending_api/etl/transaction_delta_loaders/loaders.py b/usaspending_api/etl/transaction_delta_loaders/loaders.py new file mode 100644 index 0000000000..dc04dd0795 --- /dev/null +++ b/usaspending_api/etl/transaction_delta_loaders/loaders.py @@ -0,0 +1,440 @@ +import copy +import logging +from abc import ABC +from datetime import datetime, timezone +from typing import Callable, Literal + +from delta import DeltaTable +from pyspark.sql import functions as sf, SparkSession, Window + + +from usaspending_api.broker.helpers.build_business_categories_boolean_dict import fpds_boolean_columns + +from usaspending_api.broker.helpers.last_load_date import ( + get_earliest_load_date, + update_last_load_date, +) +from usaspending_api.common.data_classes import TransactionColumn +from usaspending_api.common.etl.spark import create_ref_temp_views + + +from usaspending_api.transactions.delta_models.transaction_fabs import ( + FABS_TO_NORMALIZED_COLUMN_INFO, + TRANSACTION_FABS_COLUMN_INFO, +) +from 
logger = logging.getLogger(__name__)


class AbstractDeltaTransactionLoader(ABC):
    """Base class for loaders that MERGE bronze (``raw``) transaction records
    into the silver (``int``) Delta tables.

    Subclasses must provide:
      * ``id_col``       -- natural-key column used in the MERGE join
      * ``source_table`` -- fully-qualified bronze table name
      * ``col_info``     -- source->dest column mappings driving the SELECT list
    """

    # Annotations are quoted so they are not evaluated at class-creation time.
    spark: "SparkSession"
    id_col: str
    source_table: str
    # BUGFIX: was `col_info = list[TransactionColumn]`, which *assigned* the
    # generic-alias type object to the attribute instead of declaring its type.
    col_info: "list[TransactionColumn]"

    def __init__(self, spark, etl_level: Literal["fabs", "fpds", "normalized"], spark_s3_bucket: str) -> None:
        self.etl_level = etl_level
        # BUGFIX: was `self.spark_s3_bucket: spark_s3_bucket` -- a bare
        # annotation (using the argument *value* as the annotation) that never
        # assigned the attribute, so reading loader.spark_s3_bucket raised
        # AttributeError.
        self.spark_s3_bucket = spark_s3_bucket
        self.spark = spark

    def load_transactions(self) -> None:
        """Run the MERGE for this loader's etl level, then advance the last-load date.

        Raises:
            Exception: if the target ``int.transaction_<etl_level>`` table does not exist.
        """
        if not self.spark._jsparkSession.catalog().tableExists(f"int.transaction_{self.etl_level}"):
            raise Exception(f"Table: int.transaction_{self.etl_level} does not exist.")
        logger.info(f"Running UPSERT SQL for transaction_{self.etl_level} ETL")
        self.spark.sql(self.transaction_merge_into_sql())
        # Record the earlier of the two source loads so that a lagging source
        # is still picked up on the next incremental run.
        next_last_load = get_earliest_load_date(
            ("source_procurement_transaction", "source_assistance_transaction"), datetime.utcfromtimestamp(0)
        )
        update_last_load_date(f"transaction_{self.etl_level}", next_last_load)

    def build_date_format_sql(self, col: "TransactionColumn", is_casted_to_date: bool = True) -> str:
        """Return SQL that normalizes a string date column to 'YYYY-mm-dd' order.

        Handles both mm-dd-YYYY and YYYY-mm-dd style inputs.
        """
        # Each of these regexps allows for an optional timestamp portion, separated from the date by some
        # character, and the timestamp allows for an optional UTC offset. In any case, the timestamp is
        # ignored, though.
        # NOTE(review): the named-group syntax `(?<sep>...)` / `\k<sep>` appears to have been stripped
        # from the original text (it read `(?[-/])` / `(\\k)`, which is invalid Java regex). It is
        # reconstructed here -- the group indexes 1/3/5 used below depend on it. Confirm against the repo.
        regexp_mmddYYYY = r"(\\d{2})(?<sep>[-/])(\\d{2})(\\k<sep>)(\\d{4})(.\\d{2}:\\d{2}:\\d{2}([+-]\\d{2}:\\d{2})?)?"
        regexp_YYYYmmdd = r"(\\d{4})(?<sep>[-/]?)(\\d{2})(\\k<sep>)(\\d{2})(.\\d{2}:\\d{2}:\\d{2}([+-]\\d{2}:\\d{2})?)?"

        # Reorder the captured pieces into YYYY-mm-dd regardless of input order.
        mmddYYYY_fmt = f"""
            (regexp_extract({self.source_table}.{col.source}, '{regexp_mmddYYYY}', 5)
                || '-' ||
             regexp_extract({self.source_table}.{col.source}, '{regexp_mmddYYYY}', 1)
                || '-' ||
             regexp_extract({self.source_table}.{col.source}, '{regexp_mmddYYYY}', 3))
        """
        YYYYmmdd_fmt = f"""
            (regexp_extract({self.source_table}.{col.source}, '{regexp_YYYYmmdd}', 1)
                || '-' ||
             regexp_extract({self.source_table}.{col.source}, '{regexp_YYYYmmdd}', 3)
                || '-' ||
             regexp_extract({self.source_table}.{col.source}, '{regexp_YYYYmmdd}', 5))
        """

        if is_casted_to_date:
            mmddYYYY_fmt = f"CAST({mmddYYYY_fmt} AS DATE)"
            YYYYmmdd_fmt = f"CAST({YYYYmmdd_fmt} AS DATE)"

        sql_snippet = f"""
            CASE WHEN regexp({self.source_table}.{col.source}, '{regexp_mmddYYYY}')
                 THEN {mmddYYYY_fmt}
                 ELSE {YYYYmmdd_fmt}
            END
        """

        return sql_snippet

    def handle_column(self, col: "TransactionColumn", is_result_aliased: bool = True) -> str:
        """Return the SELECT-list expression for one column mapping.

        The expression is aliased to ``col.dest_name`` unless *is_result_aliased* is False.
        """
        if col.handling == "cast":
            retval = f"CAST({self.source_table}.{col.source} AS {col.delta_type})"
        elif col.handling == "literal":
            # Use col.source directly as the value
            retval = f"{col.source}"
        elif col.handling == "parse_string_datetime_to_date":
            # These are string fields that actually hold DATES/TIMESTAMPS and need to be cast as dates.
            # However, they may not be properly parsed when calling CAST(... AS DATE).
            retval = self.build_date_format_sql(col, is_casted_to_date=True)
        elif col.handling == "string_datetime_remove_timestamp":
            # These are string fields that actually hold DATES/TIMESTAMPS, but need the non-DATE part
            # discarded, even though they remain as strings
            retval = self.build_date_format_sql(col, is_casted_to_date=False)
        elif col.delta_type.upper() == "STRING":
            # Capitalize and remove leading & trailing whitespace from all string values
            retval = f"ucase(trim({self.source_table}.{col.source}))"
        elif col.delta_type.upper() == "BOOLEAN" and not col.handling == "leave_null":
            # Unless specified, convert any nulls to false for boolean columns
            retval = f"COALESCE({self.source_table}.{col.source}, FALSE)"
        else:
            retval = f"{self.source_table}.{col.source}"

        # Handle scalar transformations if the column requires it (a format
        # string that wraps the expression via its {input} placeholder).
        if col.scalar_transformation is not None:
            retval = col.scalar_transformation.format(input=retval)

        retval = f"{retval}{' AS ' + col.dest_name if is_result_aliased else ''}"
        return retval

    @property
    def select_columns(self) -> list[str]:
        """SELECT-list expressions for the MERGE subquery.

        ``transaction_id`` is always emitted first as a NULL placeholder; it is
        populated later by the normalized-id linking step.
        """
        return ["CAST(NULL AS LONG) AS transaction_id"] + [
            self.handle_column(col) for col in self.col_info if col.dest_name != "transaction_id"
        ]

    def source_subquery_sql(self) -> str:
        """SELECT over the bronze table used as the MERGE source."""
        select_columns_str = ",\n            ".join(self.select_columns)
        sql = f"""
            SELECT
                {select_columns_str}
            FROM {self.source_table}
        """
        return sql

    def transaction_merge_into_sql(self) -> str:
        """MERGE that inserts new (id, hash) pairs into the silver table and
        deletes silver rows whose source rows have disappeared from bronze."""
        silver_table_cols = ", ".join(col.dest_name for col in self.col_info if col.dest_name != "transaction_id")
        sql = f"""
        MERGE INTO int.transaction_{self.etl_level} AS silver_table
        USING (
            {self.source_subquery_sql()}
        ) AS source_subquery
        ON
            silver_table.{self.id_col} = source_subquery.{self.id_col}
            AND silver_table.hash = source_subquery.hash
        WHEN NOT MATCHED
            THEN INSERT
                ({silver_table_cols})
                VALUES ({silver_table_cols})
        WHEN NOT MATCHED BY SOURCE
            THEN DELETE
        """

        return sql
class FPDSDeltaTransactionLoader(AbstractDeltaTransactionLoader):
    """MERGEs raw.detached_award_procurement into int.transaction_fpds."""

    def __init__(self, spark: "SparkSession", spark_s3_bucket: str) -> None:
        super().__init__(spark=spark, etl_level="fpds", spark_s3_bucket=spark_s3_bucket)
        self.id_col = "detached_award_proc_unique"
        self.source_table = "raw.detached_award_procurement"
        self.col_info = TRANSACTION_FPDS_COLUMN_INFO


class FABSDeltaTransactionLoader(AbstractDeltaTransactionLoader):
    """MERGEs raw.published_fabs into int.transaction_fabs."""

    def __init__(self, spark: "SparkSession", spark_s3_bucket: str) -> None:
        super().__init__(spark=spark, etl_level="fabs", spark_s3_bucket=spark_s3_bucket)
        self.id_col = "afa_generated_unique"
        self.source_table = "raw.published_fabs"
        self.col_info = TRANSACTION_FABS_COLUMN_INFO


class NormalizedMixin:
    """Adds int.transaction_normalized handling on top of AbstractDeltaTransactionLoader.

    Mixed in *before* the abstract loader so that its source_subquery_sql /
    transaction_merge_into_sql / load_transactions overrides take precedence.
    """

    # Attributes supplied by the concrete loader / AbstractDeltaTransactionLoader.
    spark: "SparkSession"
    handle_column: Callable
    source_table: str
    etl_level: str
    select_columns: list[str]
    to_normalized_col_info: "list[TransactionColumn]"
    normalization_type: Literal["fabs", "fpds"]
    # NOTE(review): declared but never assigned or referenced in this module -- confirm it is needed.
    prepare_spark: Callable

    def source_subquery_sql(self) -> str:
        """SELECT over the bronze table, joined to the agency reference views."""
        additional_joins = f"""
            LEFT OUTER JOIN global_temp.subtier_agency AS funding_subtier_agency ON (
                funding_subtier_agency.subtier_code = {self.source_table}.funding_sub_tier_agency_co
            )
            LEFT OUTER JOIN global_temp.agency AS funding_agency ON (
                funding_agency.subtier_agency_id = funding_subtier_agency.subtier_agency_id
            )
            LEFT OUTER JOIN global_temp.subtier_agency AS awarding_subtier_agency ON (
                awarding_subtier_agency.subtier_code = {self.source_table}.awarding_sub_tier_agency_c
            )
            LEFT OUTER JOIN global_temp.agency AS awarding_agency ON (
                awarding_agency.subtier_agency_id = awarding_subtier_agency.subtier_agency_id
            )
        """

        # Since the select columns may have complicated logic, put them on separate lines for debugging.
        # However, strings inside {} expressions in f-strings can't contain backslashes, so join them
        # first before inserting into the overall sql statement.
        select_columns_str = ",\n            ".join(self.select_columns)
        return f"""
            SELECT
                {select_columns_str}
            FROM {self.source_table}
            {additional_joins}
        """

    def transaction_merge_into_sql(self) -> str:
        """Build the MERGE INTO int.transaction_normalized statement.

        Also materializes the global_temp reference views the subquery joins against.
        """
        create_ref_temp_views(self.spark)
        load_datetime = datetime.now(timezone.utc)
        special_columns = ["create_date", "update_date"]
        # On set, create_date will not be changed and update_date will be set below. All other column
        # values will come from the subquery.
        # NOTE(review): set_cols is built but never interpolated into the MERGE below -- either a
        # `WHEN MATCHED THEN UPDATE SET ...` clause was dropped or this is dead code. Confirm intent.
        set_cols = [
            f"int.transaction_normalized.{col_name} = source_subquery.{col_name}"
            for col_name in TRANSACTION_NORMALIZED_COLUMNS
            if col_name not in special_columns
        ]
        set_cols.append(f"""int.transaction_normalized.update_date = '{load_datetime.isoformat(" ")}'""")
        # Move create_date and update_date to the end of the list of column names for ease of handling
        # during record insert
        insert_col_name_list = [
            col_name for col_name in TRANSACTION_NORMALIZED_COLUMNS if col_name not in special_columns
        ]
        insert_col_name_list.extend(special_columns)
        insert_col_names = ", ".join(insert_col_name_list)

        # On insert, all values except for create_date and update_date will come from the subquery
        insert_value_list = insert_col_name_list[:-2]
        insert_value_list.extend([f"""'{load_datetime.isoformat(" ")}'"""] * 2)
        insert_values = ", ".join(insert_value_list)

        sql = f"""
        MERGE INTO int.transaction_normalized
        USING (
            {self.source_subquery_sql()}
        ) AS source_subquery
        ON transaction_normalized.transaction_unique_id = source_subquery.transaction_unique_id
            AND transaction_normalized.hash = source_subquery.hash
        WHEN NOT MATCHED
            THEN INSERT
                ({insert_col_names})
                VALUES ({insert_values})
        WHEN NOT MATCHED BY SOURCE AND {'NOT' if self.normalization_type == 'fabs' else ''} transaction_normalized.is_fpds
            THEN DELETE
        """

        return sql

    def populate_transaction_normalized_ids(self) -> None:
        """Assign sequential surrogate ids to newly inserted rows (id IS NULL)."""
        target = DeltaTable.forName(self.spark, "int.transaction_normalized").alias("t")
        tn = self.spark.table("int.transaction_normalized")
        needs_ids = tn.filter(tn.id.isNull())
        if not needs_ids.isEmpty():
            max_id = tn.agg(sf.max("id")).collect()[0][0]
            max_id = max_id if max_id else 0
            # Single-partition window: acceptable here because only the new rows are numbered.
            w = Window.orderBy(needs_ids.transaction_unique_id)
            with_ids = needs_ids.withColumn("id", (max_id + sf.row_number().over(w)).cast("LONG")).alias("s")
            (
                target.merge(with_ids, "t.transaction_unique_id = s.transaction_unique_id AND t.hash = s.hash")
                .whenMatchedUpdateAll()
                .execute()
            )

    def link_transactions_to_normalized(self) -> None:
        """Copy transaction_normalized.id into transaction_fabs/fpds.transaction_id where missing or stale."""
        tn = self.spark.table("int.transaction_normalized")
        tablename = f"int.transaction_{self.normalization_type}"
        id_col = "detached_award_proc_unique" if self.normalization_type == "fpds" else "afa_generated_unique"
        target = DeltaTable.forName(self.spark, tablename).alias("t")
        source = self.spark.table(tablename)
        needs_ids = (
            source.join(
                tn,
                on=(
                    (tn.transaction_unique_id == source[id_col])
                    & (tn.hash == source.hash)
                    & (source.transaction_id.isNull() | (source.transaction_id != tn.id))
                ),
                how="inner",
            )
            .select(tn.id, source[id_col], source.hash)
            .alias("s")
        )
        (
            target.merge(needs_ids, f"t.{id_col} = s.{id_col} AND t.hash = s.hash")
            .whenMatchedUpdate(set={"t.transaction_id": "s.id"})
            .execute()
        )

    def populate_award_ids(self) -> None:
        """Mint award_ids for unique_award_keys that have no matching row in int.awards.

        NOTE(review): rows whose key matches an *existing* award are not updated here --
        presumably the awards ETL links those; confirm.
        """
        awards = self.spark.table("int.awards")
        max_id = awards.agg(sf.max("id")).collect()[0][0]
        max_id = max_id if max_id else 0
        target = DeltaTable.forName(self.spark, "int.transaction_normalized").alias("t")
        source = self.spark.table("int.transaction_normalized")
        needs_ids = (
            source.join(awards, awards.generated_unique_award_id == source.unique_award_key, how="left")
            .filter(awards.id.isNull())
            .select(source.unique_award_key)
            .distinct()
        )
        w = Window.orderBy(needs_ids.unique_award_key)
        with_ids = needs_ids.withColumn("award_id", (max_id + sf.row_number().over(w)).cast("LONG")).alias("s")
        (
            # Plain string here -- the original used a placeholder-free f-string.
            target.merge(with_ids, "t.unique_award_key = s.unique_award_key")
            .whenMatchedUpdate(set={"t.award_id": "s.award_id"})
            .execute()
        )

    def load_transactions(self) -> None:
        """Run the base MERGE, then fill in award ids, surrogate ids, and back-links."""
        super().load_transactions()
        self.populate_award_ids()
        self.populate_transaction_normalized_ids()
        self.link_transactions_to_normalized()


class FABSNormalizedDeltaTransactionLoader(NormalizedMixin, AbstractDeltaTransactionLoader):
    """Normalizes raw.published_fabs rows into int.transaction_normalized."""

    def __init__(self, spark: "SparkSession", spark_s3_bucket: str) -> None:
        super().__init__(spark=spark, etl_level="normalized", spark_s3_bucket=spark_s3_bucket)
        self.id_col = "transaction_unique_id"
        self.source_table = "raw.published_fabs"
        self.to_normalized_col_info = FABS_TO_NORMALIZED_COLUMN_INFO
        self.normalization_type = "fabs"

    @property
    def select_columns(self) -> list[str]:
        action_date_col = next(
            c for c in FABS_TO_NORMALIZED_COLUMN_INFO if c.dest_name == "action_date" and c.source == "action_date"
        )
        parse_action_date_sql_snippet = self.handle_column(action_date_col, is_result_aliased=False)
        select_cols = [
            "CAST(NULL AS LONG) AS id",
            "CAST(NULL AS LONG) AS award_id",
            "awarding_agency.id AS awarding_agency_id",
            # Federal fiscal year: October or later belongs to the next calendar year.
            f"""CASE WHEN month({parse_action_date_sql_snippet}) > 9
                    THEN year({parse_action_date_sql_snippet}) + 1
                    ELSE year({parse_action_date_sql_snippet})
                END AS fiscal_year""",
            "funding_agency.id AS funding_agency_id",
        ]
        select_cols.extend(
            [
                # business_categories
                f"get_business_categories_fabs({self.source_table}.business_types) AS business_categories",
                # funding_amount
                # In theory, this should be equal to
                #     CAST(COALESCE({bronze_table_name}.federal_action_obligation, 0)
                #          + COALESCE({bronze_table_name}.non_federal_funding_amount, 0)
                #          AS NUMERIC(23, 2))
                # However, for some historical records, this isn't true.
                f"""
                    CAST({self.source_table}.total_funding_amount AS NUMERIC(23, 2)) AS funding_amount
                """,
            ]
        )

        for col in FABS_TO_NORMALIZED_COLUMN_INFO:
            select_cols.append(self.handle_column(col))
        return select_cols


class FPDSNormalizedDeltaTransactionLoader(NormalizedMixin, AbstractDeltaTransactionLoader):
    """Normalizes raw.detached_award_procurement rows into int.transaction_normalized."""

    def __init__(self, spark: "SparkSession", spark_s3_bucket: str) -> None:
        super().__init__(spark=spark, etl_level="normalized", spark_s3_bucket=spark_s3_bucket)
        self.id_col = "transaction_unique_id"
        self.source_table = "raw.detached_award_procurement"
        self.to_normalized_col_info = DAP_TO_NORMALIZED_COLUMN_INFO
        self.normalization_type = "fpds"

    @property
    def select_columns(self) -> list[str]:
        action_date_col = next(
            c for c in DAP_TO_NORMALIZED_COLUMN_INFO if c.dest_name == "action_date" and c.source == "action_date"
        )
        parse_action_date_sql_snippet = self.handle_column(action_date_col, is_result_aliased=False)
        select_cols = [
            "CAST(NULL AS LONG) AS id",
            "CAST(NULL AS LONG) AS award_id",
            "awarding_agency.id AS awarding_agency_id",
            # Federal fiscal year: October or later belongs to the next calendar year.
            f"""CASE WHEN month({parse_action_date_sql_snippet}) > 9
                    THEN year({parse_action_date_sql_snippet}) + 1
                    ELSE year({parse_action_date_sql_snippet})
                END AS fiscal_year""",
            "funding_agency.id AS funding_agency_id",
        ]
        fpds_business_category_columns = copy.copy(fpds_boolean_columns)
        # Add a couple of non-boolean columns that are needed in the business category logic
        fpds_business_category_columns.extend(["contracting_officers_deter", "domestic_or_foreign_entity"])
        named_struct_text = ", ".join(f"'{col}', {self.source_table}.{col}" for col in fpds_business_category_columns)
        select_cols.extend(
            [
                # business_categories
                f"get_business_categories_fpds(named_struct({named_struct_text})) AS business_categories",
                # type
                f"""
                CASE WHEN {self.source_table}.pulled_from <> 'IDV' THEN {self.source_table}.contract_award_type
                     WHEN {self.source_table}.idv_type = 'B' AND {self.source_table}.type_of_idc IS NOT NULL
                        THEN 'IDV_B_' || {self.source_table}.type_of_idc
                     WHEN {self.source_table}.idv_type = 'B'
                          AND {self.source_table}.type_of_idc_description = 'INDEFINITE DELIVERY / REQUIREMENTS'
                        THEN 'IDV_B_A'
                     WHEN {self.source_table}.idv_type = 'B'
                          AND {self.source_table}.type_of_idc_description =
                            'INDEFINITE DELIVERY / INDEFINITE QUANTITY'
                        THEN 'IDV_B_B'
                     WHEN {self.source_table}.idv_type = 'B'
                          AND {self.source_table}.type_of_idc_description =
                            'INDEFINITE DELIVERY / DEFINITE QUANTITY'
                        THEN 'IDV_B_C'
                     ELSE 'IDV_' || {self.source_table}.idv_type
                END AS type
                """,
                # type_description
                f"""
                CASE WHEN {self.source_table}.pulled_from <> 'IDV'
                        THEN {self.source_table}.contract_award_type_desc
                     WHEN {self.source_table}.idv_type = 'B'
                          AND {self.source_table}.type_of_idc_description IS NOT NULL
                          AND ucase({self.source_table}.type_of_idc_description) <> 'NAN'
                        THEN {self.source_table}.type_of_idc_description
                     WHEN {self.source_table}.idv_type = 'B'
                        THEN 'INDEFINITE DELIVERY CONTRACT'
                     ELSE {self.source_table}.idv_type_description
                END AS type_description
                """,
            ]
        )
        for col in DAP_TO_NORMALIZED_COLUMN_INFO:
            select_cols.append(self.handle_column(col))
        return select_cols
v["delta"] for k, v in DETACHED_AWARD_PROCUREMENT_COLUMNS.items()}, + **DELTA_ONLY_COLUMNS, +} DETACHED_AWARD_PROCUREMENT_POSTGRES_COLUMNS = {k: v["postgres"] for k, v in DETACHED_AWARD_PROCUREMENT_COLUMNS.items()} detached_award_procurement_create_sql_string = rf""" diff --git a/usaspending_api/transactions/delta_models/published_fabs.py b/usaspending_api/transactions/delta_models/published_fabs.py index 150aec077a..21ac24bd4d 100644 --- a/usaspending_api/transactions/delta_models/published_fabs.py +++ b/usaspending_api/transactions/delta_models/published_fabs.py @@ -101,7 +101,13 @@ "updated_at": {"delta": "TIMESTAMP", "postgres": "TIMESTAMP"}, "uri": {"delta": "STRING", "postgres": "TEXT"}, } -PUBLISHED_FABS_DELTA_COLUMNS = {k: v["delta"] for k, v in PUBLISHED_FABS_COLUMNS.items()} +DELTA_ONLY_COLUMNS = { + "hash": "LONG", +} +PUBLISHED_FABS_DELTA_COLUMNS = { + **{k: v["delta"] for k, v in PUBLISHED_FABS_COLUMNS.items()}, + **DELTA_ONLY_COLUMNS, +} PUBLISHED_FABS_POSTGRES_COLUMNS = {k: v["postgres"] for k, v in PUBLISHED_FABS_COLUMNS.items()} published_fabs_create_sql_string = rf""" diff --git a/usaspending_api/transactions/delta_models/transaction_fabs.py b/usaspending_api/transactions/delta_models/transaction_fabs.py index dc8cce5fe3..d6cc32481c 100644 --- a/usaspending_api/transactions/delta_models/transaction_fabs.py +++ b/usaspending_api/transactions/delta_models/transaction_fabs.py @@ -128,7 +128,7 @@ TransactionColumn("sai_number", "sai_number", "STRING"), TransactionColumn("submission_id", "submission_id", "INTEGER"), TransactionColumn("total_funding_amount", "total_funding_amount", "STRING"), - TransactionColumn("transaction_id", None, "LONG NOT NULL"), + TransactionColumn("transaction_id", None, "LONG"), TransactionColumn("uei", "uei", "STRING"), TransactionColumn("ultimate_parent_legal_enti", "ultimate_parent_legal_enti", "STRING"), TransactionColumn("ultimate_parent_uei", "ultimate_parent_uei", "STRING"), @@ -136,6 +136,7 @@ 
TransactionColumn("unique_award_key", "unique_award_key", "STRING"), TransactionColumn("updated_at", "updated_at", "TIMESTAMP"), TransactionColumn("uri", "uri", "STRING"), + TransactionColumn("hash", "hash", "LONG"), ] TRANSACTION_FABS_COLUMNS = [col.dest_name for col in TRANSACTION_FABS_COLUMN_INFO] @@ -189,4 +190,5 @@ TransactionColumn("type_description", "assistance_type_desc", "STRING"), TransactionColumn("unique_award_key", "unique_award_key", "STRING"), TransactionColumn("usaspending_unique_transaction_id", "NULL", "STRING", "literal"), + TransactionColumn("hash", "hash", "LONG"), ] diff --git a/usaspending_api/transactions/delta_models/transaction_fpds.py b/usaspending_api/transactions/delta_models/transaction_fpds.py index e8f016f29f..bba0ab70ba 100644 --- a/usaspending_api/transactions/delta_models/transaction_fpds.py +++ b/usaspending_api/transactions/delta_models/transaction_fpds.py @@ -307,7 +307,7 @@ TransactionColumn("the_ability_one_program", "the_ability_one_program", "BOOLEAN"), TransactionColumn("total_obligated_amount", "total_obligated_amount", "STRING"), TransactionColumn("township_local_government", "township_local_government", "BOOLEAN"), - TransactionColumn("transaction_id", None, "LONG NOT NULL"), + TransactionColumn("transaction_id", None, "LONG"), TransactionColumn("transaction_number", "transaction_number", "STRING"), TransactionColumn("transit_authority", "transit_authority", "BOOLEAN"), TransactionColumn("tribal_college", "tribal_college", "BOOLEAN"), @@ -344,6 +344,7 @@ TransactionColumn("veterinary_hospital", "veterinary_hospital", "BOOLEAN"), TransactionColumn("woman_owned_business", "woman_owned_business", "BOOLEAN"), TransactionColumn("women_owned_small_business", "women_owned_small_business", "BOOLEAN"), + TransactionColumn("hash", "hash", "LONG"), ] TRANSACTION_FPDS_COLUMNS = [col.dest_name for col in TRANSACTION_FPDS_COLUMN_INFO] @@ -410,4 +411,5 @@ TransactionColumn("transaction_unique_id", "detached_award_proc_unique", 
"STRING"), TransactionColumn("unique_award_key", "unique_award_key", "STRING"), TransactionColumn("usaspending_unique_transaction_id", "NULL", "STRING", "literal"), + TransactionColumn("hash", "hash", "LONG"), ] diff --git a/usaspending_api/transactions/delta_models/transaction_normalized.py b/usaspending_api/transactions/delta_models/transaction_normalized.py index f9261f54f9..c9c073002c 100644 --- a/usaspending_api/transactions/delta_models/transaction_normalized.py +++ b/usaspending_api/transactions/delta_models/transaction_normalized.py @@ -2,7 +2,7 @@ "action_date": "DATE", "action_type": "STRING", "action_type_description": "STRING", - "award_id": "LONG NOT NULL", + "award_id": "LONG", "awarding_agency_id": "INTEGER", "business_categories": "ARRAY", "certified_date": "DATE", @@ -13,7 +13,7 @@ "fiscal_year": "INTEGER", "funding_agency_id": "INTEGER", "funding_amount": "NUMERIC(23, 2)", - "id": "LONG NOT NULL", + "id": "LONG", "indirect_federal_sharing": "NUMERIC(23, 2)", "is_fpds": "BOOLEAN NOT NULL", "last_modified_date": "DATE", @@ -28,6 +28,7 @@ "unique_award_key": "STRING", "update_date": "TIMESTAMP", "usaspending_unique_transaction_id": "STRING", + "hash": "LONG", } transaction_normalized_sql_string = rf"""