Skip to content

Commit cc0981b

Browse files
committed
Simplify & update environment; Add industry name/id cleaning
1 parent 6d262ba commit cc0981b

File tree

3 files changed

+402
-3041
lines changed

3 files changed

+402
-3041
lines changed

marimo/sec10k-data-review.py

Lines changed: 68 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
import marimo
1111

1212
__generated_with = "0.15.2"
13-
app = marimo.App(width="full", app_title="SEC 10-K Data Review")
13+
app = marimo.App(width="columns", app_title="SEC 10-K Data Review")
1414

1515

16-
@app.cell
16+
@app.cell(column=0)
1717
def _():
1818
import os
1919
from pathlib import Path
@@ -59,7 +59,72 @@ def get_pudl(table_name: str) -> pl.DataFrame:
5959
return (get_pudl,)
6060

6161

62-
@app.cell(hide_code=True)
62+
@app.cell
def _(pl):
    def clean_industry_data(df: pl.DataFrame) -> pl.DataFrame:
        """Fill gaps in the SIC industry ID and name columns of ``df``.

        The input must contain the columns ``industry_id_sic``,
        ``industry_name_sic``, ``central_index_key`` and ``report_date``.
        The original two industry columns are left untouched; the result
        carries two additional columns, ``cleaned_industry_name_sic`` and
        ``cleaned_industry_id_sic``, built in three steps:

        1. A missing name is filled with the single most frequent name
           observed for the row's ID.
        2. A missing ID is filled with the single most frequent ID observed
           for the row's (cleaned) name.
        3. Values that are still missing are forward- and then
           backward-filled across rows sharing a ``central_index_key``,
           ordered by ``report_date``.

        IDs/names whose most frequent counterpart is tied are treated as
        ambiguous and are not used for filling in steps 1 and 2.

        Args:
            df: Frame holding the four columns listed above.

        Returns:
            A new frame sorted by ``central_index_key`` and ``report_date``
            with the two ``cleaned_*`` columns appended.
        """

        def unambiguous_mode(frame: pl.DataFrame, key: str, value: str) -> pl.DataFrame:
            """Map each ``key`` to its single most frequent non-null ``value``.

            Keys whose top frequency is shared by several values are dropped,
            so the result has at most one row per key.
            """
            pair_counts = (
                frame.filter(pl.col(key).is_not_null() & pl.col(value).is_not_null())
                .group_by(key, value)
                .agg(n=pl.len())
            )
            most_frequent = pair_counts.filter(
                pl.col("n") == pl.col("n").max().over(key)
            )
            # More than one surviving row per key means the mode is tied.
            return most_frequent.filter(pl.len().over(key) == 1).select(key, value)

        # Step 1: fill missing names from the canonical ID -> name mapping.
        id_to_name = unambiguous_mode(df, "industry_id_sic", "industry_name_sic")
        df = df.with_columns(
            cleaned_industry_name_sic=pl.coalesce(
                pl.col("industry_name_sic"),
                pl.col("industry_id_sic").replace_strict(
                    id_to_name["industry_id_sic"],
                    id_to_name["industry_name_sic"],
                    default=None,
                ),
            )
        )

        # Step 2: fill missing IDs from the canonical name -> ID mapping.
        # This is the reverse lookup; the ID -> name mapping cannot be reused.
        name_to_id = unambiguous_mode(df, "industry_name_sic", "industry_id_sic")
        df = df.with_columns(
            cleaned_industry_id_sic=pl.coalesce(
                pl.col("industry_id_sic"),
                pl.col("cleaned_industry_name_sic").replace_strict(
                    name_to_id["industry_name_sic"],
                    name_to_id["industry_id_sic"],
                    default=None,
                ),
            )
        )

        # Step 3: within each filer, carry the nearest known value forward in
        # time, then backward for leading gaps.
        has_filer = pl.col("central_index_key").is_not_null()
        filled_columns = [
            # Rows without a filer key would otherwise form one shared window
            # and receive values from unrelated companies.
            pl.when(has_filer)
            .then(
                pl.col(column)
                .fill_null(strategy="forward")
                .fill_null(strategy="backward")
                .over("central_index_key")
            )
            .otherwise(pl.col(column))
            .alias(column)
            for column in ("cleaned_industry_id_sic", "cleaned_industry_name_sic")
        ]
        return df.sort(
            "central_index_key", "report_date", maintain_order=True
        ).with_columns(filled_columns)

    return
125+
126+
127+
@app.cell(column=1, hide_code=True)
63128
def _(mo):
64129
mo.md(
65130
r"""

0 commit comments

Comments
 (0)