@@ -60,67 +60,7 @@ def get_pudl(table_name: str) -> pl.DataFrame:
6060
6161
@app.cell
def _():
    # Placeholder cell. The nested `clean_industry_data` helper that used to be
    # defined here was never returned from the cell (so no other cell could
    # call it) and could not run as written: it indexed the tuples produced by
    # `DataFrame.rows()` by column name, read a `common_name` column that its
    # own second aggregation had already dropped, and handed a polars
    # expression to a function that expected a DataFrame.
    return
12565
12666
@@ -139,7 +79,6 @@ def _(mo):
13979 - Filtering all the industry names for "electric" and "power" shows a bunch of other industries that are not related to electricity generation. Mostly electronics, etc.
14080 - So the 4911 and 4931 seem to be the main ones we expect to link to EIA Utilities.
14181 - And then there's a number of smaller industries with cogeneration that often match, but don't have as many companies in them, and probably aren't responsible for much generation.
142- - Note: this kind of analysis would be easier if we cleaned up the SIC names & IDs so that they're more consistent & complete.
14382 """
14483 )
14584 return
@@ -156,39 +95,62 @@ def _(companies, pl):
15695 (
15796 companies .filter (pl .col ("utility_id_eia" ).is_not_null ())
15897 .select (["industry_id_sic" , "industry_name_sic" ])
159- .group_by (["industry_id_sic" , "industry_name_sic" ])
160- .agg (count = pl .len ())
98+ .group_by (pl .col ("industry_id_sic" ))
99+ .agg (
100+ industry_name_sic = pl .first ("industry_name_sic" ),
101+ count = pl .len (),
102+ )
161103 .sort ("count" , descending = True )
162- .head (20 )
104+ .head (25 )
163105 )
164106 return
165107
166108
@app.cell
def _(companies, pl):
    # For every SIC industry ID: the first industry name seen for that ID and
    # the fraction of `companies` rows with a non-null EIA utility ID (rounded
    # to three decimals), ordered from the highest fraction to the lowest.
    _has_utility_id = pl.col("utility_id_eia").is_not_null()
    _per_industry = companies.group_by(pl.col("industry_id_sic")).agg(
        industry_name_sic=pl.first("industry_name_sic"),
        fraction_with_utility_id=_has_utility_id.mean().round(3),
    )
    electricity_sics = _per_industry.sort(
        "fraction_with_utility_id", descending=True
    )
    # Only the top of the ranking is displayed; the full frame is exported.
    electricity_sics.head(25)
    return (electricity_sics,)
177124
178125
@app.cell
def _(electricity_sics, plt):
    # Bar chart of the utility-ID fraction for the first 100 rows of
    # `electricity_sics`, one bar per SIC industry ID.
    _top = electricity_sics.head(100)
    # Small, vertical tick labels so that up to 100 IDs fit on the x axis.
    plt.xticks(rotation=90, size=5)
    plt.xlabel("Industry ID (SIC)")
    plt.ylabel("Fraction of companies with Utility ID (EIA)")
    plt.bar(_top["industry_id_sic"], _top["fraction_with_utility_id"])
    return
136+
137+
@app.cell
def _(companies, electricity_sics, pl):
    # SIC industry IDs whose fraction of rows with an EIA utility ID is
    # strictly greater than 0.5.
    _above_half = electricity_sics.filter(pl.col("fraction_with_utility_id") > 0.5)
    majority_electric = _above_half["industry_id_sic"].to_list()

    # Displayed output: number of `companies` rows per majority-electric
    # industry, with the first industry name seen for each ID, largest first.
    _in_majority = pl.col("industry_id_sic").is_in(majority_electric)
    (
        companies.filter(_in_majority)
        .select(["industry_id_sic", "industry_name_sic"])
        .group_by(pl.col("industry_id_sic"))
        .agg(
            count=pl.len(),
            industry_name_sic=pl.first("industry_name_sic"),
        )
        .sort("count", descending=True)
    )
    return (majority_electric,)
@@ -204,13 +166,14 @@ def _(companies, majority_electric, pl, plt):
204166 .group_by (["year" , "industry_id_sic" ])
205167 .agg (
206168 fraction_with_utility_id = pl .col ("utility_id_eia" ).is_not_null ().mean (),
169+ industry_name_sic = pl .first ("industry_name_sic" ),
207170 )
208171 .sort ("year" )
209172 )
210173
211- for sic in util_ids_by_year ["industry_id_sic " ].unique ():
212- df = util_ids_by_year .filter (pl .col ("industry_id_sic " ) == sic )
213- plt .plot (df ["year" ], df ["fraction_with_utility_id" ], label = sic )
174+ for industry_name in util_ids_by_year ["industry_name_sic " ].unique ():
175+ df = util_ids_by_year .filter (pl .col ("industry_name_sic " ) == industry_name )
176+ plt .plot (df ["year" ], df ["fraction_with_utility_id" ], label = industry_name )
214177
215178 plt .legend ()
216179 return
0 commit comments