
Commit 63bd65b

Upload final model training data (#804)

Authored by Damonamajor with wrridgeway and jeancochrane.

Squashed commit history:

* land_nbhd_rate_unique_by_town_nbhd_class_and_year
* Initial draft
* update docs
* Add run_id to query
* add table_
* Switch to training data
* Update schema.yml
* Update model-training_data.R
* Initial draft
* update docs
* Add run_id to query
* add table_
* Switch to training data
* Update schema.yml
* Update model-training_data.R
* Remove unintended commit
* Correct unique columns
* Error if > 2
* Update etl/scripts-ccao-data-warehouse-us-east-1/model/model-training_data.R (Co-authored-by: William Ridgeway)
* Update etl/scripts-ccao-data-warehouse-us-east-1/model/model-training_data.R (Co-authored-by: William Ridgeway)
* updated py script
* styler
* test2
* Fix ref
* Updated push
* updates
* Another attempt
* Resolve SSL errors and use Spark DataFrames for `model.training_data` (#821)
* Remove default file_format
* Add unique key
* possible functional version
* Functional version
* Remove old script
* Remove unique_key
* Commenting
* Update docs.md
* Update dbt/models/model/schema.yml (Co-authored-by: Jean Cochrane)
* Update dbt/models/model/model.training_data.py (Co-authored-by: Jean Cochrane)
* Update dbt/models/model/model.training_data.py (Co-authored-by: Jean Cochrane)
* lintr
* Update dbt/models/model/docs.md (Co-authored-by: Jean Cochrane)
* Update dbt/models/model/model.training_data.py (Co-authored-by: Jean Cochrane)
* Update dbt/models/model/model.training_data.py (Co-authored-by: Jean Cochrane)
* Update dbt/models/model/model.training_data.py (Co-authored-by: Jean Cochrane)
* update to training_data
* update to training_data
* Update schema
* update schema
* push attempt
* update model
* update naming
* update naming
* Update dbt/models/model/schema.yml (Co-authored-by: Jean Cochrane)
* Update dbt/models/model/schema.yml (Co-authored-by: Jean Cochrane)

Co-authored-by: William Ridgeway <10358980+wrridgeway@users.noreply.github.com>
Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>

1 parent 4ad229f · commit 63bd65b

File tree

3 files changed: +106 −0 lines changed


dbt/models/model/docs.md

Lines changed: 13 additions & 0 deletions

@@ -173,6 +173,19 @@ Wall time of each stage (train, assess, etc.) for each model run (`run_id`).
 **Primary Key**: `year`, `run_id`
 {% enddocs %}
 
+# training_data
+
+{% docs table_training_data %}
+
+A table containing the training data from the final model runs.
+
+We update this table once per assessment year after choosing the final model
+runs for the year. As such, only final model run IDs should be present in this
+table.
+
+**Primary Key**: `run_id`, `meta_card_num`, `meta_sale_document_num`
+{% enddocs %}
+
 # vw_card_res_input
 
 {% docs view_vw_card_res_input %}
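The docs above state that only final model run IDs should be present in `model.training_data`. As a minimal illustration of that constraint (not CCAO code; `metadata_rows` and the sample run IDs are hypothetical stand-ins for `model.metadata`), filtering run metadata down to final runs might look like:

```python
def final_run_ids(metadata_rows):
    """Return the run_ids of rows whose run_type is 'final'.

    metadata_rows is a hypothetical stand-in for model.metadata:
    a list of dicts with at least 'run_id' and 'run_type' keys.
    """
    return {row["run_id"] for row in metadata_rows if row["run_type"] == "final"}


# Hypothetical sample metadata: two final runs and one test run
rows = [
    {"run_id": "2024-02-06-example-a", "run_type": "final"},
    {"run_id": "2024-01-10-example-b", "run_type": "test"},
    {"run_id": "2023-03-14-example-c", "run_type": "final"},
]
```

Only the IDs surviving this filter should ever appear in the table's `run_id` column.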
dbt/models/model/model.training_data.py

Lines changed: 80 additions & 0 deletions

import functools

from pyspark.sql.functions import lit


def model(dbt, session):
    dbt.config(
        materialized="incremental",
        incremental_strategy="insert_overwrite",
        partitioned_by=["assessment_year", "run_id", "meta_township_code"],
        on_schema_change="append_new_columns",
    )

    # Build the base metadata DataFrame of final model runs
    base_query = """
        SELECT
            run_id,
            year,
            assessment_year,
            dvc_md5_training_data
        FROM model.metadata
        WHERE run_type = 'final'
    """
    metadata_df = session.sql(base_query)

    if dbt.is_incremental:
        # Anti-join out any run_ids already present in the target
        existing = (
            session.table(f"{dbt.this.schema}.{dbt.this.identifier}")
            .select("run_id")
            .distinct()
        )
        metadata_df = metadata_df.join(existing, on="run_id", how="left_anti")

        # If there's nothing new, return an *empty* DataFrame
        if metadata_df.limit(1).count() == 0:
            print(">>> no new run_id found; skipping incremental update")
            # This returns zero rows but preserves the full target schema
            return session.table(
                f"{dbt.this.schema}.{dbt.this.identifier}"
            ).limit(0)

    # Collect the remaining metadata locally
    metadata = metadata_df.toPandas()

    bucket = "ccao-data-dvc-us-east-1"
    all_dfs = []

    for _, row in metadata.iterrows():
        run_id = row["run_id"]
        year = int(row["year"])
        h = row["dvc_md5_training_data"]

        # DVC cache layout: runs after 2023 nest objects under files/md5/
        prefix = "" if year <= 2023 else "files/md5/"
        key = f"{prefix}{h[:2]}/{h[2:]}"
        s3p = f"{bucket}/{key}"

        print(f">>> reading all columns for run {run_id!r}")
        print(f"    → S3 key = {s3p}")
        df = session.read.parquet(f"s3://{s3p}")

        # Coerce booleans to avoid mismatched types across years
        if "ccao_is_active_exe_homeowner" in df.columns:
            df = df.withColumn(
                "ccao_is_active_exe_homeowner",
                df["ccao_is_active_exe_homeowner"].cast("boolean"),
            )

        # Add run_id and assessment_year partition columns
        df = df.withColumn("run_id", lit(run_id)).withColumn(
            "assessment_year", lit(row["assessment_year"])
        )

        all_dfs.append(df)
        print(f"Processed run_id={run_id}, rows={df.count()}")

    # Union all the new runs together, aligning columns by name
    return functools.reduce(
        lambda x, y: x.unionByName(y, allowMissingColumns=True), all_dfs
    )
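The loop above derives each run's S3 location from its DVC MD5 hash, switching cache layouts at 2023 (DVC 3.x moved cache objects under a `files/md5/` prefix). That path logic can be factored into a small helper; this is a sketch of the same expression with a hypothetical function name, not part of the committed model:

```python
def dvc_cache_key(md5_hash: str, year: int) -> str:
    """Build the S3 key for a DVC-tracked training data object.

    Runs through 2023 use the legacy two-level cache layout
    (ab/cdef...); later runs use the DVC 3.x layout, which nests
    objects under files/md5/.
    """
    prefix = "" if year <= 2023 else "files/md5/"
    return f"{prefix}{md5_hash[:2]}/{md5_hash[2:]}"
```

Under this layout, the model reads `s3://ccao-data-dvc-us-east-1/<key>` for each run.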

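The Python model ends with `unionByName(..., allowMissingColumns=True)`, which aligns columns by name and null-fills any column a given year's file lacks. The same alignment semantics can be sketched in plain Python over lists of row dicts (an illustration of the behavior, not Spark's implementation):

```python
def union_by_name(*tables):
    """Concatenate tables (lists of row dicts), aligning columns by
    name and filling columns missing from a table with None."""
    columns = []
    for table in tables:
        for row in table:
            for col in row:
                if col not in columns:
                    columns.append(col)
    return [
        {col: row.get(col) for col in columns}
        for table in tables
        for row in table
    ]
```

This is why `on_schema_change="append_new_columns"` pairs well with the union: new columns in later runs simply appear as nulls for earlier runs.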
dbt/models/model/schema.yml

Lines changed: 13 additions & 0 deletions

@@ -230,6 +230,19 @@ models:
         description: |
           Any notes or caveats associated with the model run
+  - name: model.training_data
+    description: '{{ doc("table_training_data") }}'
+    config:
+      tags:
+        - load_manual
+    tests:
+      - unique_combination_of_columns:
+          name: model_training_data_unique_card_doc_number_run_id
+          combination_of_columns:
+            - run_id
+            - meta_sale_document_num
+            - meta_card_num
+
   - name: model.vw_pin_shared_input
     description: '{{ doc("view_vw_pin_shared_input") }}'
     config:
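The `unique_combination_of_columns` test above asserts that no two rows share the same (`run_id`, `meta_sale_document_num`, `meta_card_num`) triple, matching the table's documented primary key. The check it performs can be sketched in plain Python (an illustration of the test's semantics, not dbt's implementation):

```python
from collections import Counter


def duplicated_combinations(rows, key_columns):
    """Return key tuples that appear more than once in rows."""
    counts = Counter(tuple(row[col] for col in key_columns) for row in rows)
    return [key for key, n in counts.items() if n > 1]
```

The test passes only when this function returns an empty list for the full table.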
