Skip to content

Commit b805916

Browse files
committed
Update notebook and remove unused dependencies.
1 parent 53e1ccf commit b805916

File tree

3 files changed

+509
-468
lines changed

3 files changed

+509
-468
lines changed

marimo/sec10k-data-review.py

Lines changed: 38 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -60,67 +60,7 @@ def get_pudl(table_name: str) -> pl.DataFrame:
6060

6161

6262
@app.cell
63-
def _(pl):
64-
def clean_industry_data(df: pl.DataFrame) -> pl.DataFrame:
65-
# Step 1: Clean industry_name_sic
66-
# Find the most common name for each industry_id_sic
67-
canonical_names = (
68-
df.filter(pl.col("industry_name_sic").is_not_null())
69-
.group_by("industry_id_sic")
70-
.agg(common_name=pl.col("industry_name_sic").mode())
71-
)
72-
73-
# Identify the most common name and ensure it's unique per ID
74-
unique_canonical_names = (
75-
canonical_names.group_by("industry_id_sic")
76-
.agg(count=pl.count())
77-
.filter(pl.col("count") == 1)
78-
)
79-
80-
# Create a mapping dictionary
81-
name_mapping = {
82-
row["industry_id_sic"]: row["common_name"]
83-
for row in unique_canonical_names.rows()
84-
}
85-
86-
# Fill in canonical names where applicable
87-
df = df.with_columns(
88-
pl.when(pl.col("industry_name_sic").is_null())
89-
.then(pl.col("industry_id_sic").map(name_mapping))
90-
.otherwise(pl.col("industry_name_sic"))
91-
.alias("cleaned_industry_name_sic")
92-
)
93-
94-
# Step 2: Fill in industry_id_sic using cleaned names
95-
df = df.with_columns(
96-
pl.when(
97-
pl.col("industry_id_sic").is_null()
98-
& pl.col("cleaned_industry_name_sic").is_not_null()
99-
)
100-
.then(pl.col("cleaned_industry_name_sic").map(name_mapping))
101-
.otherwise(pl.col("industry_id_sic"))
102-
.alias("cleaned_industry_id_sic")
103-
)
104-
105-
# Step 3: Handle nulls based on central_index_key and consistency before and after
106-
def fill_nulls(group: pl.DataFrame) -> pl.DataFrame:
107-
# Sort by report_date
108-
group = group.sort("report_date")
109-
110-
# Forward fill for missing IDs and names
111-
group = group.with_columns(
112-
ffill_id=pl.col("industry_id_sic").fill_null(strategy="forward"),
113-
ffill_name=pl.col("industry_name_sic").fill_null(strategy="forward"),
114-
)
115-
116-
# Backward fill for missing IDs and names
117-
return group.with_columns(
118-
pl.col("ffill_id").fill_null(strategy="backward"),
119-
pl.col("ffill_name").fill_null(strategy="backward"),
120-
)
121-
122-
return df.groupby("central_index_key").agg(fill_nulls(pl.all()))
123-
63+
def _():
12464
return
12565

12666

@@ -139,7 +79,6 @@ def _(mo):
13979
- Filtering all the industry names for "electric" and "power" shows a number of other industries that are not related to electricity generation (mostly electronics, etc.).
14080
- So SIC codes 4911 and 4931 seem to be the main ones we expect to link to EIA Utilities.
14181
- And then there are a number of smaller industries with cogeneration that often match, but they don't have as many companies in them and probably aren't responsible for much generation.
142-
- Note: this kind of analysis would be easier if we cleaned up the SIC names & IDs so that they're more consistent & complete.
14382
"""
14483
)
14584
return
@@ -156,39 +95,62 @@ def _(companies, pl):
15695
(
15796
companies.filter(pl.col("utility_id_eia").is_not_null())
15897
.select(["industry_id_sic", "industry_name_sic"])
159-
.group_by(["industry_id_sic", "industry_name_sic"])
160-
.agg(count=pl.len())
98+
.group_by(pl.col("industry_id_sic"))
99+
.agg(
100+
industry_name_sic=pl.first("industry_name_sic"),
101+
count=pl.len(),
102+
)
161103
.sort("count", descending=True)
162-
.head(20)
104+
.head(25)
163105
)
164106
return
165107

166108

167109
@app.cell
168110
def _(companies, pl):
169111
electricity_sics = (
170-
companies.group_by(sic=pl.col("industry_id_sic"))
171-
.agg(fraction_with_utility_id=pl.col("utility_id_eia").is_not_null().mean())
112+
companies.group_by(pl.col("industry_id_sic"))
113+
.agg(
114+
industry_name_sic=pl.first("industry_name_sic"),
115+
fraction_with_utility_id=pl.col("utility_id_eia")
116+
.is_not_null()
117+
.mean()
118+
.round(3),
119+
)
172120
.sort("fraction_with_utility_id", descending=True)
173-
.head(20)
174121
)
175-
electricity_sics
122+
electricity_sics.head(25)
176123
return (electricity_sics,)
177124

178125

126+
@app.cell
127+
def _(electricity_sics, plt):
128+
plt.xticks(rotation=90, size=5)
129+
plt.xlabel("Industry ID (SIC)")
130+
plt.ylabel("Fraction of companies with Utility ID (EIA)")
131+
plt.bar(
132+
electricity_sics.head(100)["industry_id_sic"],
133+
electricity_sics.head(100)["fraction_with_utility_id"],
134+
)
135+
return
136+
137+
179138
@app.cell
180139
def _(companies, electricity_sics, pl):
181140
majority_electric = (
182141
electricity_sics.filter(pl.col("fraction_with_utility_id") > 0.5)
183-
.select("sic")
142+
.select(pl.col("industry_id_sic"))
184143
.to_series()
185144
.to_list()
186145
)
187146
(
188147
companies.filter(pl.col("industry_id_sic").is_in(majority_electric))
189148
.select(["industry_id_sic", "industry_name_sic"])
190-
.group_by(["industry_id_sic", "industry_name_sic"])
191-
.agg(count=pl.len())
149+
.group_by(pl.col("industry_id_sic"))
150+
.agg(
151+
count=pl.len(),
152+
industry_name_sic=pl.first("industry_name_sic"),
153+
)
192154
.sort("count", descending=True)
193155
)
194156
return (majority_electric,)
@@ -204,13 +166,14 @@ def _(companies, majority_electric, pl, plt):
204166
.group_by(["year", "industry_id_sic"])
205167
.agg(
206168
fraction_with_utility_id=pl.col("utility_id_eia").is_not_null().mean(),
169+
industry_name_sic=pl.first("industry_name_sic"),
207170
)
208171
.sort("year")
209172
)
210173

211-
for sic in util_ids_by_year["industry_id_sic"].unique():
212-
df = util_ids_by_year.filter(pl.col("industry_id_sic") == sic)
213-
plt.plot(df["year"], df["fraction_with_utility_id"], label=sic)
174+
for industry_name in util_ids_by_year["industry_name_sic"].unique():
175+
df = util_ids_by_year.filter(pl.col("industry_name_sic") == industry_name)
176+
plt.plot(df["year"], df["fraction_with_utility_id"], label=industry_name)
214177

215178
plt.legend()
216179
return

0 commit comments

Comments
 (0)