10 | 10 | import marimo |
11 | 11 |
12 | 12 | __generated_with = "0.15.2" |
13 | | -app = marimo.App(width="full", app_title="SEC 10-K Data Review") |
| 13 | +app = marimo.App(width="columns", app_title="SEC 10-K Data Review") |
14 | 14 |
15 | 15 |
16 | | -@app.cell |
| 16 | +@app.cell(column=0) |
17 | 17 | def _(): |
18 | 18 |     import os |
19 | 19 |     from pathlib import Path |
@@ -59,7 +59,72 @@ def get_pudl(table_name: str) -> pl.DataFrame: |
59 | 59 |     return (get_pudl,) |
60 | 60 |
61 | 61 |
62 | | -@app.cell(hide_code=True) |
| 62 | +@app.cell |
| 63 | +def _(pl): |
| 64 | +    def clean_industry_data(df: pl.DataFrame) -> pl.DataFrame: |
| 65 | +        # Step 1: Clean industry_name_sic |
| 66 | +        # Find the most common name for each industry_id_sic |
| 67 | +        canonical_names = ( |
| 68 | +            df.filter(pl.col("industry_name_sic").is_not_null()) |
| 69 | +            .group_by("industry_id_sic") |
| 70 | +            .agg(common_name=pl.col("industry_name_sic").mode()) |
| 71 | +        ) |
| 72 | +
| 73 | +        # Keep only IDs whose most common name is unambiguous (a single mode) |
| 74 | +        unique_canonical_names = ( |
| 75 | +            canonical_names.filter( |
| 76 | +                pl.col("common_name").list.len() == 1 |
| 77 | +            ).with_columns(pl.col("common_name").list.first()) |
| 78 | +        ) |
| 79 | +
| 80 | +        # Build id -> name and name -> id lookups |
| 81 | +        name_mapping = { |
| 82 | +            row["industry_id_sic"]: row["common_name"] |
| 83 | +            for row in unique_canonical_names.iter_rows(named=True) |
| 84 | +        } |
| 85 | +        id_mapping = {name: code for code, name in name_mapping.items()} |
| 86 | +        # Fill in canonical names where the name is missing but the ID is known |
| 87 | +        df = df.with_columns( |
| 88 | +            pl.when(pl.col("industry_name_sic").is_null()) |
| 89 | +            .then(pl.col("industry_id_sic").replace_strict(name_mapping, default=None)) |
| 90 | +            .otherwise(pl.col("industry_name_sic")) |
| 91 | +            .alias("cleaned_industry_name_sic") |
| 92 | +        ) |
| 93 | +
| 94 | +        # Step 2: Fill in industry_id_sic using cleaned names |
| 95 | +        df = df.with_columns( |
| 96 | +            pl.when( |
| 97 | +                pl.col("industry_id_sic").is_null() |
| 98 | +                & pl.col("cleaned_industry_name_sic").is_not_null() |
| 99 | +            ) |
| 100 | +            .then(pl.col("cleaned_industry_name_sic").replace_strict(id_mapping, default=None)) |
| 101 | +            .otherwise(pl.col("industry_id_sic")) |
| 102 | +            .alias("cleaned_industry_id_sic") |
| 103 | +        ) |
| 104 | +
| 105 | +        # Step 3: Handle remaining nulls per central_index_key, using consistency before and after |
| 106 | +        def fill_nulls(group: pl.DataFrame) -> pl.DataFrame: |
| 107 | +            # Sort by report_date |
| 108 | +            group = group.sort("report_date") |
| 109 | +
| 110 | +            # Forward fill missing IDs and names |
| 111 | +            group = group.with_columns( |
| 112 | +                ffill_id=pl.col("cleaned_industry_id_sic").fill_null(strategy="forward"), |
| 113 | +                ffill_name=pl.col("cleaned_industry_name_sic").fill_null(strategy="forward"), |
| 114 | +            ) |
| 115 | +
| 116 | +            # Backward fill whatever is still missing |
| 117 | +            return group.with_columns( |
| 118 | +                pl.col("ffill_id").fill_null(strategy="backward"), |
| 119 | +                pl.col("ffill_name").fill_null(strategy="backward"), |
| 120 | +            ) |
| 121 | +
| 122 | +        return df.group_by("central_index_key").map_groups(fill_nulls) |
| 123 | +
| 124 | +    return |
| 125 | + |
| 126 | + |
| 127 | +@app.cell(column=1, hide_code=True) |
63 | 128 | def _(mo): |
64 | 129 |     mo.md( |
65 | 130 |         r""" |