# pg_to_switch.py
import os
import sys
import math
from datetime import datetime as dt
import ast
import itertools
from statistics import mode
import collections
import shlex
from pathlib import Path
from typing import List, Optional
from typing_extensions import Annotated
import pandas as pd
import numpy as np
import scipy
import sqlalchemy as sa
import typer
from powergenome.resource_clusters import ResourceGroup
from powergenome.fuels import fuel_cost_table
from powergenome.generators import GeneratorClusters, create_plant_gen_id
from powergenome.util import (
    build_scenario_settings,
    init_pudl_connection,
    load_settings,
    check_settings,
    snake_case_col,
)
from powergenome.time_reduction import kmeans_time_clustering
from powergenome.eia_opendata import fetch_fuel_prices
from powergenome.eia_opendata import add_user_fuel_prices
import geopandas as gpd
from powergenome.generators import *
from powergenome.external_data import (
    make_demand_response_profiles,
    make_generator_variability,
    load_demand_segments,
)
from powergenome.GenX import (
    add_misc_gen_values,
    hydro_energy_to_power,
    add_co2_costs_to_o_m,
    create_policy_req,
    set_must_run_generation,
    min_cap_req,
)
from powergenome.co2_pipeline_cost import merge_co2_pipeline_costs
from conversion_functions import (
    switch_fuel_cost_table,
    switch_fuels,
    gen_info_table,
    hydro_time_tables,
    load_zones_table,
    fuel_market_tables,
    timeseries,
    timeseries_full,
    graph_timestamp_map_table,
    graph_timestamp_map_kmeans,
    loads_table,
    tx_cost_transform,
    variable_capacity_factors_table,
    transmission_lines_table,
    balancing_areas,
    ts_tp_pg_kmeans,
    hydro_timepoints_pg_kmeans,
    hydro_timeseries_pg_kmeans,
    hydro_system_module_tables,
    variable_cf_pg_kmeans,
    load_pg_kmeans,
    first_key,
    first_value,
    final_key,
    final_value,
    km_per_mile,
)
from powergenome.load_profiles import (
    make_load_curves,
    add_load_growth,
    make_final_load_curves,
    make_distributed_gen_profiles,
)

if not sys.warnoptions:
    import warnings

    warnings.simplefilter("ignore")


def fuel_files(
    fuel_prices: pd.DataFrame,
    planning_years: List[int],
    regions: List[str],
    fuel_region_map: dict[str, List[str]],
    fuel_emission_factors: dict[str, float],
    out_folder: Path,
):
    fuel_cost = switch_fuel_cost_table(
        fuel_region_map,
        fuel_prices,
        regions,
        scenario=["reference", "user"],
        year_list=planning_years,
    )

    fuels_table = switch_fuels(fuel_prices, fuel_emission_factors)
    fuels_table.loc[len(fuels_table.index)] = [
        "Fuel",
        0,
        0,
    ]  # add a dummy fuel for regional_fuel_market

    ### edit by RR
    IPM_regions = regions
    load_zones = load_zones_table(IPM_regions, zone_ccs_distance_km=0)
    # add the dummy load zone
    load_zones.loc[len(load_zones.index)] = [
        "loadzone",
        0,
        load_zones["zone_dbid"].max() + 1,
    ]
    load_zones.to_csv(out_folder / "load_zones.csv", index=False)

    regional_fuel_markets = pd.DataFrame(
        {"regional_fuel_market": "loadzone-Fuel", "fuel": "Fuel"}, index=[0]
    )

    ### edited by RR: column renamed from "fuel" to "rfm"
    zone_regional_fm = pd.DataFrame(
        {"load_zone": "loadzone", "rfm": "loadzone-Fuel"}, index=[0]
    )

    # Create dummy values based on one load zone in REAM's input file.
    # Note: regional_fuel_market should align with the regional_fuel_market table.
    # TODO --RR
    fuel_supply_curves20 = pd.DataFrame(
        {
            "period": [2020, 2020, 2020, 2020, 2020, 2020],
            "tier": [1, 2, 3, 4, 5, 6],
            "unit_cost": [1.9, 4.0, 487.5, 563.7, 637.8, 816.7],
            "max_avail_at_cost": [651929, 3845638, 3871799, 3882177, 3889953, 3920836],
        }
    )
    fuel_supply_curves20.insert(0, "regional_fuel_market", "loadzone-Fuel")
    fuel_supply_curves30 = fuel_supply_curves20.copy()
    fuel_supply_curves30["period"] = 2030
    fuel_supply_curves40 = fuel_supply_curves20.copy()
    fuel_supply_curves40["period"] = 2040
    fuel_supply_curves50 = fuel_supply_curves20.copy()
    fuel_supply_curves50["period"] = 2050
    fuel_supply_curves = pd.concat(
        [
            fuel_supply_curves20,
            fuel_supply_curves30,
            fuel_supply_curves40,
            fuel_supply_curves50,
        ]
    )

    regional_fuel_markets.to_csv(out_folder / "regional_fuel_markets.csv", index=False)
    zone_regional_fm.to_csv(
        out_folder / "zone_to_regional_fuel_market.csv", index=False
    )
    fuel_supply_curves.to_csv(out_folder / "fuel_supply_curves.csv", index=False)
    ###

    fuel_cost.to_csv(out_folder / "fuel_cost.csv", index=False)
    fuels_table.to_csv(out_folder / "fuels.csv", index=False)
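
# Reading aid (not executed): how the dummy fuel-market files written by
# fuel_files() link together. The key values are the literals used above.
#
#   regional_fuel_markets.csv:        regional_fuel_market="loadzone-Fuel", fuel="Fuel"
#   zone_to_regional_fuel_market.csv: load_zone="loadzone", rfm="loadzone-Fuel"
#   fuel_supply_curves.csv:           one six-tier supply curve per period
#                                     (2020-2050) for "loadzone-Fuel"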


def generator_and_load_files(
    gc: GeneratorClusters,
    all_fuel_prices,
    pudl_engine: sa.engine.Engine,
    scen_settings_dict: dict[int, dict],
    out_folder: Path,
    pg_engine: sa.engine.Engine,
    hydro_variability_new: pd.DataFrame,
):
    """
    Steps:
    * use PowerGenome functions to define all_gen (unchanged across years),
      with parameters for all generators
    * rename columns in all_gen to match Switch conventions
    * split all_gen into existing_gen_units (exploded by vintage) and new_gens
    """
    # TODO: maybe move all the arguments into an `options` dict that can be
    # passed to all the functions, so we don't have to worry about which
    # functions need which arguments
    out_folder.mkdir(parents=True, exist_ok=True)

    first_year_settings = first_value(scen_settings_dict)

    # Get tables of generators, organized by model_year or build_year
    # (model_year shows generators active in a particular model year, used for
    # gathering operational data like variable capacity factors; build_year
    # shows gens built in a particular year, used to gather construction data
    # like capital cost and capacity built).
    gens_by_model_year, gens_by_build_year = gen_tables(
        gc, pudl_engine, scen_settings_dict
    )

    #########
    # create Switch input files from these tables

    gen_build_costs_file(gens_by_build_year, out_folder)

    # This uses gens_by_model_year to increase the chance that a generator
    # cluster that exists in an early year will also be modeled in a later
    # year, even if it retires, so we can reuse that info in chained models
    # where we turn off age-based retirement. We have to send the fuel prices
    # so it can check which gens use a real fuel and which don't, because
    # PowerGenome gives a heat rate for all of them.
    gen_info_file(first_year_settings, gens_by_model_year, all_fuel_prices, out_folder)

    # balancing_tables(first_year_settings, pudl_engine, all_gen_units, out_folder)
    gen_build_predetermined_file(gens_by_build_year, out_folder)

    operational_files(
        scen_settings_dict,
        pg_engine,
        hydro_variability_new,
        gens_by_model_year,
        out_folder,
    )
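
# A sketch of the scen_settings_dict shape assumed throughout this module,
# inferred from how it is used (model-year keys mapping to per-year settings
# dicts built by build_scenario_settings); the years and values shown here
# are hypothetical:
#
#   scen_settings_dict = {
#       2030: {"model_year": 2030, "model_first_planning_year": 2024, ...},
#       2040: {"model_year": 2040, "model_first_planning_year": 2031, ...},
#   }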


def operational_files(
    scen_settings_dict,
    pg_engine,
    hydro_variability_new,
    gens_by_model_year,
    out_folder,
):
    """
    Create all files describing time-varying operation of the system, i.e.,
    loads, hydro, variable capacity factors for renewables, etc.
    """
    # will hold all years of each type of data
    output = collections.defaultdict(list)

    timepoint_start = 1

    for model_year, year_settings in scen_settings_dict.items():
        period_all_gen = gens_by_model_year.query("model_year == @model_year")

        print("Gathering generator variability data.")
        period_all_gen_variability = make_generator_variability(period_all_gen)
        period_all_gen_variability.columns = period_all_gen["Resource"]

        if "gen_is_baseload" in period_all_gen.columns:
            period_all_gen_variability = set_must_run_generation(
                period_all_gen_variability,
                period_all_gen.loc[
                    period_all_gen["gen_is_baseload"] == True, "Resource"
                ].to_list(),
            )

        # TODO: is this needed? can it be eliminated by improvements upstream?
        # ####### added by Rangrang; need to discuss further the treatment of
        # hydro in MIS_D_MS. Change the variability of hydro generators in
        # MIS_D_MS: their profiles were missing and were filled with 1, which
        # does not make sense, since all variable resources should have a
        # variable capacity factor between 0 and 1.
        hydro_variability_new = hydro_variability_new.iloc[:8760]
        MIS_D_MS_hydro = [
            col
            for col in period_all_gen_variability.columns
            if "MIS_D_MS" in col
            if "hydro" in col
        ]
        for col in MIS_D_MS_hydro:
            period_all_gen_variability[col] = hydro_variability_new["MIS_D_MS"]

        period_lc = make_final_load_curves(pg_engine, year_settings)

        cluster_time = year_settings.get("reduce_time_domain") is True

        # do time clustering/sampling
        if cluster_time:
            assert "time_domain_periods" in year_settings
            assert "time_domain_days_per_period" in year_settings

            # results is a dict with keys "resource_profiles" (gen_variability),
            # "load_profiles", "time_series_mapping" (maps clusters sequentially
            # to potential periods in year), "ClusterWeights", etc. See PG for
            # full details.
            print(f"Beginning clustering of timeseries ({model_year}).")
            results, representative_point, weights = kmeans_time_clustering(
                resource_profiles=period_all_gen_variability,
                load_profiles=period_lc,
                days_in_group=year_settings["time_domain_days_per_period"],
                num_clusters=year_settings["time_domain_periods"],
                include_peak_day=year_settings.get("include_peak_day", True),
                load_weight=year_settings.get("demand_weight_factor", 1),
                variable_resources_only=year_settings.get(
                    "variable_resources_only", True
                ),
            )
            print("Finished clustering timeseries.")

            period_lc_sampled = results["load_profiles"]
            period_variability_sampled = results["resource_profiles"]
        else:
            # no time-domain reduction: use the full load curves below
            period_lc_sampled = period_lc

        #######
        # Omit existing generators that have no active capacity this period.
        # In some cases, PowerGenome may include generators that are post-
        # retirement (or maybe pre-construction?), to make sure the same sample
        # weeks are selected for every period. Here we filter those out because
        # Switch will not accept time-varying data for generators that cannot
        # be used.
        period_all_gen = period_all_gen.query("Existing_Cap_MW.notna() or new_build")
        period_all_gen_variability = period_all_gen_variability.loc[
            :, period_all_gen["Resource"]
        ]
        if cluster_time:
            period_variability_sampled = period_variability_sampled.loc[
                :, period_all_gen["Resource"]
            ]

        # timeseries_df and timepoints_df
        if cluster_time:
            timeseries_df, timepoints_df = ts_tp_pg_kmeans(
                representative_point["slot"],
                weights,
                year_settings["time_domain_days_per_period"],
                year_settings["model_year"],
                year_settings["model_first_planning_year"],
            )
            timepoints_df["timepoint_id"] = range(
                timepoint_start, timepoint_start + len(timepoints_df)
            )
            timepoint_start = timepoints_df["timepoint_id"].max() + 1
        else:
            if year_settings.get("full_time_domain") is True:
                timeseries_df, timepoints_df, timestamp_interval = timeseries_full(
                    period_lc_sampled,
                    year_settings["model_year"],
                    year_settings["model_first_planning_year"],
                    settings=year_settings,
                )
            else:
                timeseries_df, timepoints_df, timestamp_interval = timeseries(
                    period_lc_sampled,
                    year_settings["model_year"],
                    year_settings["model_first_planning_year"],
                    settings=year_settings,
                )
            timepoints_df["timepoint_id"] = range(
                timepoint_start, timepoint_start + len(timepoints_df)
            )
            timepoint_start = timepoints_df["timepoint_id"].max() + 1

            # create lists and a dictionary for later use
            timepoints_timestamp = timepoints_df["timestamp"].to_list()
            timepoints_tp_id = timepoints_df["timepoint_id"].to_list()
            timepoints_dict = dict(
                zip(timepoints_timestamp, timepoints_tp_id)
            )  # {timestamp: timepoint_id}

        output["timeseries.csv"].append(timeseries_df)
        output["timepoints.csv"].append(timepoints_df)

        # hydro timepoint data
        if cluster_time:
            hydro_timepoints_df = hydro_timepoints_pg_kmeans(timepoints_df)
            hydro_timeseries_table = hydro_timeseries_pg_kmeans(
                period_all_gen,
                period_variability_sampled.loc[
                    :, period_all_gen.loc[period_all_gen["HYDRO"] == 1, "Resource"]
                ],
                hydro_timepoints_df,
            )
        else:
            hydro_timepoints_df, hydro_timeseries_table = hydro_time_tables(
                period_all_gen,
                period_all_gen_variability,
                timepoints_df,
                year_settings["model_year"],
            )
        output["hydro_timepoints.csv"].append(hydro_timepoints_df)
        output["hydro_timeseries.csv"].append(hydro_timeseries_table)

        # hydro network data
        if cluster_time:
            (
                water_nodes,
                water_connections,
                reservoirs,
                hydro_pj,
                water_node_tp_flows,
            ) = hydro_system_module_tables(
                period_all_gen,
                period_variability_sampled.loc[
                    :, period_all_gen.loc[period_all_gen["HYDRO"] == 1, "Resource"]
                ],
                hydro_timepoints_df,
                flow_per_mw=1.02,
            )
        else:
            (
                water_nodes,
                water_connections,
                reservoirs,
                hydro_pj,
                water_node_tp_flows,
            ) = hydro_system_module_tables(
                period_all_gen,
                period_all_gen_variability.loc[
                    :, period_all_gen.loc[period_all_gen["HYDRO"] == 1, "Resource"]
                ],
                timepoints_df,
                flow_per_mw=1.02,
            )
        output["water_nodes.csv"].append(water_nodes)
        output["water_connections.csv"].append(water_connections)
        output["reservoirs.csv"].append(reservoirs)
        output["hydro_generation_projects.csv"].append(hydro_pj)
        output["water_node_tp_flows.csv"].append(water_node_tp_flows)

        # loads
        if cluster_time:
            loads = load_pg_kmeans(period_lc_sampled, timepoints_df)
            timepoints_tp_id = timepoints_df["timepoint_id"].to_list()
            # dummy load zone for the fuel_cost / regional_fuel_market workaround
            dummy_df = pd.DataFrame({"TIMEPOINT": timepoints_tp_id})
            dummy_df.insert(0, "LOAD_ZONE", "loadzone")
            dummy_df.insert(2, "zone_demand_mw", 0)
            loads = pd.concat([loads, dummy_df], ignore_index=True)
        else:
            loads, loads_with_year_hour = loads_table(
                period_lc_sampled,
                timepoints_timestamp,
                timepoints_dict,
                year_settings["model_year"],
            )
            # dummy load zone for the fuel_cost / regional_fuel_market workaround
            dummy_df = pd.DataFrame({"TIMEPOINT": timepoints_tp_id})
            dummy_df.insert(0, "LOAD_ZONE", "loadzone")
            dummy_df.insert(2, "zone_demand_mw", 0)
            loads = pd.concat([loads, dummy_df], ignore_index=True)

            # year_hour is used by vcf below
            year_hour = loads_with_year_hour["year_hour"].to_list()
        output["loads.csv"].append(loads)

        # capacity factors for variable generators
        if cluster_time:
            vcf = variable_cf_pg_kmeans(
                period_all_gen, period_variability_sampled, timepoints_df
            )
        else:
            vcf = variable_capacity_factors_table(
                period_all_gen_variability,
                year_hour,
                timepoints_dict,
                period_all_gen,
                year_settings["model_year"],
            )
        output["variable_capacity_factors.csv"].append(vcf)

        # timestamp map for graphs
        if cluster_time:
            graph_timestamp_map = graph_timestamp_map_kmeans(timepoints_df)
        else:
            graph_timestamp_map = graph_timestamp_map_table(
                timeseries_df, timestamp_interval
            )
        output["graph_timestamp_map.csv"].append(graph_timestamp_map)

    # drop_duplicates isn't enough for some files, because they may have
    # different capacities calculated in different years (!)
    aggregation_rules = {
        "reservoirs.csv": {"res_min_vol": "min", "res_max_vol": "max"},
        "water_connections.csv": {"wc_capacity": "max"},
    }
    for file, agg_rule in aggregation_rules.items():
        df = pd.concat(output[file])
        group_cols = df.columns.difference(agg_rule.keys())
        df = df.groupby(group_cols.to_list()).agg(agg_rule).reset_index()
        output[file] = [df]

    # Write to CSV files (remove any remaining duplicate rows, e.g., based on
    # the same generator reported in different model years).
    for file, dfs in output.items():
        pd.concat(dfs).drop_duplicates().to_csv(out_folder / file, index=False)
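
# Note on timepoint numbering in operational_files(): timepoint_start carries
# across the model-year loop, so timepoint IDs are globally unique and
# consecutive across periods. For example (hypothetical sizes), if the first
# period yields 336 timepoints they get IDs 1-336, and the next period starts
# at 337.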


def gen_build_costs_file(gens_by_build_year, out_folder):
    """
    Input:
    * gens_by_build_year: from gen_tables, based on gc.create_all_gens
    * out_folder: directory to store the output

    Output columns:
    * GENERATION_PROJECT: Resource
    * build_year: the build years from gens_by_build_year
    * gen_overnight_cost: PG capex_mw times regional_cost_multiplier
    * gen_fixed_om: PG Fixed_OM_Cost_per_MWyr_mean for all generators
    * gen_storage_energy_overnight_cost: PG capex_mwh times regional_cost_multiplier
    * gen_storage_energy_fixed_om: PG Fixed_OM_Cost_per_MWhyr for all generators
    """
    defs = {
        "GENERATION_PROJECT": "Resource",
        "BUILD_YEAR": "build_year",
        "gen_overnight_cost": "capex_mw * regional_cost_multiplier",
        "gen_fixed_om": "Fixed_OM_Cost_per_MWyr_mean",
        "gen_storage_energy_overnight_cost": "capex_mwh * regional_cost_multiplier",
        "gen_storage_energy_fixed_om": "Fixed_OM_Cost_per_MWhyr",
    }
    gen_build_costs = pd.DataFrame(
        {col: gens_by_build_year.eval(expr) for col, expr in defs.items()}
    )
    gen_build_costs.to_csv(out_folder / "gen_build_costs.csv", index=False, na_rep=".")
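
# A minimal standalone sketch of the DataFrame.eval() pattern used above
# (toy data, not pipeline data); each value in `defs` is evaluated as an
# expression over the columns of the source frame:
#
#   df = pd.DataFrame({"capex_mw": [100.0], "regional_cost_multiplier": [1.5]})
#   df.eval("capex_mw * regional_cost_multiplier")
#   # -> 0    150.0
#   #    dtype: float64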


def gen_build_predetermined_file(gens_by_build_year, out_folder):
    """
    Output columns:
    * GENERATION_PROJECT: Resource from gens_by_build_year
    * build_year: from gens_by_build_year
    * build_gen_predetermined: based on capacity_mw from gens_by_build_year
    * build_gen_energy_predetermined: based on capacity_mwh from gens_by_build_year
    """
    # write the relevant columns out for Switch
    gbp_cols = {
        "Resource": "GENERATION_PROJECT",
        "build_year": "build_year",
        "capacity_mw": "build_gen_predetermined",
        "capacity_mwh": "build_gen_energy_predetermined",
    }
    gbp = gens_by_build_year.loc[gens_by_build_year["existing"], gbp_cols.keys()]
    gbp = gbp.rename(columns=gbp_cols)
    gbp.to_csv(out_folder / "gen_build_predetermined.csv", index=False, na_rep=".")


def gen_info_file(
    settings,
    gens_by_model_year: pd.DataFrame,
    fuel_prices: pd.DataFrame,
    out_folder: Path,
):
    # consolidate to one row per generator cluster (we assume data is the same
    # for all rows)
    gens = gens_by_model_year.drop_duplicates(subset="Resource")

    set_retirement_age(gens, settings)

    gen_info = gen_info_table(
        gens,
        settings.get("transmission_investment_cost")["spur"]["capex_mw_mile"],
    )

    graph_tech_colors_data = {
        "gen_type": [
            "Biomass",
            "Coal",
            "Naturalgas",
            "Geothermal",
            "Hydro",
            "Nuclear",
            "Oil",
            "Solar",
            "Storage",
            "Waste",
            "Wave",
            "Wind",
            "Other",
        ],
        "color": [
            "green",
            "saddlebrown",
            "gray",
            "red",
            "royalblue",
            "blueviolet",
            "orange",
            "gold",
            "aquamarine",
            "black",
            "blue",
            "deepskyblue",
            "white",
        ],
    }
    graph_tech_colors_table = pd.DataFrame(graph_tech_colors_data)
    graph_tech_colors_table.insert(0, "map_name", "default")

    graph_tech_types_table = gen_info.drop_duplicates(subset="gen_tech")
    graph_tech_types_table["map_name"] = "default"
    graph_tech_types_table["energy_source"] = graph_tech_types_table[
        "gen_energy_source"
    ]

    cols = ["map_name", "gen_type", "gen_tech", "energy_source"]
    graph_tech_types_table = graph_tech_types_table[cols]

    # drop the heat rate that PowerGenome provides for many non-fuel-using
    # generators
    fuels = fuel_prices["fuel"].unique()
    fuels = [fuel.capitalize() for fuel in fuels]
    non_fuel_table = graph_tech_types_table[
        ~graph_tech_types_table["energy_source"].isin(fuels)
    ]
    non_fuel_energy_table = (
        non_fuel_table[["energy_source"]].drop_duplicates().sort_values("energy_source")
    )
    gen_info.loc[
        gen_info["gen_energy_source"].isin(non_fuel_energy_table["energy_source"]),
        "gen_full_load_heat_rate",
    ] = None

    graph_tech_colors_table.to_csv(out_folder / "graph_tech_colors.csv", index=False)
    graph_tech_types_table.to_csv(out_folder / "graph_tech_types.csv", index=False)
    non_fuel_energy_table.to_csv(
        out_folder / "non_fuel_energy_sources.csv", index=False
    )

    # identify generators participating in ESR or minimum capacity programs,
    # then drop those columns
    ESR_col = [col for col in gen_info.columns if col.startswith("ESR")]
    ESR_generators = gen_info[["GENERATION_PROJECT"] + ESR_col]
    min_cap_col = [col for col in gen_info.columns if col.startswith("MinCapTag")]
    min_cap_gens = gen_info[["GENERATION_PROJECT"] + min_cap_col]
    gen_info = gen_info.drop(columns=ESR_col + min_cap_col)
    gen_info.to_csv(out_folder / "gen_info.csv", index=False, na_rep=".")

    # Save deviations from the mean O&M cost in gen_om_by_period.csv, to allow
    # variation by study period.
    om_cols = [
        "Fixed_OM_Cost_per_MWyr",
        "Var_OM_Cost_per_MWh",
        "Fixed_OM_Cost_per_MWhyr",
    ]
    # drop existing generators that are retired by this time
    gen_om_by_period = gens_by_model_year.query("Existing_Cap_MW.notna() or new_build")
    # calculate difference from the mean
    gen_om_by_period[om_cols] -= gen_om_by_period[[c + "_mean" for c in om_cols]].values
    # ignore tiny differences from the mean
    gen_om_by_period[om_cols] = gen_om_by_period[om_cols].mask(
        gen_om_by_period[om_cols].abs() <= 1e-9, 0
    )
    # drop zeros (not essential, but helpful for seeing only the ones with
    # adjustments)
    gen_om_by_period[om_cols] = gen_om_by_period[om_cols].replace({0: float("nan")})
    gen_om_by_period = gen_om_by_period.dropna(subset=om_cols, how="all")
    # filter columns
    gen_om_by_period = gen_om_by_period[["Resource", "model_year"] + om_cols]
    gen_om_by_period.columns = [
        "GENERATION_PROJECT",
        "PERIOD",
        "gen_fixed_om_by_period",
        "gen_variable_om_by_period",
        "gen_storage_energy_fixed_om_by_period",
    ]
    gen_om_by_period.to_csv(
        out_folder / "gen_om_by_period.csv", index=False, na_rep="."
    )

    ################
    # ESR and min_cap programs

    # create esr_generators.csv: list of generators participating in ESR
    # (RPS/CES) programs
    ESR_generators_long = pd.melt(
        ESR_generators, id_vars=["GENERATION_PROJECT"], value_vars=ESR_col
    )
    ESR_generators_long = ESR_generators_long[ESR_generators_long["value"] == 1].rename(
        columns={"variable": "ESR_PROGRAM", "GENERATION_PROJECT": "ESR_GEN"}
    )
    ESR_generators_long = ESR_generators_long[["ESR_PROGRAM", "ESR_GEN"]]
    ESR_generators_long.to_csv(out_folder / "esr_generators.csv", index=False)

    # make min_cap_generators.csv, showing generators that can help satisfy
    # minimum capacity rules
    min_cap_generators_long = pd.melt(
        min_cap_gens, id_vars=["GENERATION_PROJECT"], value_vars=min_cap_col
    )
    min_cap_generators_long = min_cap_generators_long[
        min_cap_generators_long["value"] == 1
    ].rename(
        columns={"variable": "MIN_CAP_PROGRAM", "GENERATION_PROJECT": "MIN_CAP_GEN"}
    )
    min_cap_generators_long = min_cap_generators_long[
        ["MIN_CAP_PROGRAM", "MIN_CAP_GEN"]
    ]
    min_cap_generators_long.to_csv(out_folder / "min_cap_generators.csv", index=False)
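
# Wide-to-long sketch of the pd.melt() pattern used for esr_generators.csv and
# min_cap_generators.csv above (toy flag matrix; program and generator names
# are hypothetical):
#
#   flags = pd.DataFrame(
#       {"GENERATION_PROJECT": ["gen_a", "gen_b"], "ESR_1": [1, 0], "ESR_2": [1, 1]}
#   )
#   long = pd.melt(flags, id_vars=["GENERATION_PROJECT"], value_vars=["ESR_1", "ESR_2"])
#   long[long["value"] == 1]
#   # -> GENERATION_PROJECT variable  value
#   #    gen_a              ESR_1         1
#   #    gen_a              ESR_2         1
#   #    gen_b              ESR_2         1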


###############################################################
def gen_tables(gc, pudl_engine, scen_settings_dict):
    """
    Return dataframes showing all generator clusters that can be operated in
    each model_year and that can be built in each build_year. gens_by_model_year
    has one row for every model_year when the generator or unit can be operated.
    gens_by_build_year has one row for every build_year when the generators can
    be built or were built.

    These dataframes each show both new and existing generators. They contain
    all the data from gc.create_all_generators() (and gc.units_model after
    running this) plus some extra data.

    The "existing" column identifies generators that have a scheduled
    construction plan in the past or near future; for these, capacity_mw and
    possibly capacity_mwh will also have values. The "new_build" column
    identifies generators that can be built during the study period.

    This is the main place where generator data is read from PowerGenome, so it
    is also the best place to filter or adapt the data as needed before use
    elsewhere.

    Note: this changes all the generator-related attributes of gc.
    """
    # We save and restore gc.settings, but calling gc.create_all_generators()
    # has unknown side effects on gc, including updating all the
    # generator-related attributes.
    orig_gc_settings = gc.settings
    gen_dfs = []
    unit_dfs = []

    """
    # for testing:
    year_settings = first_value(scen_settings_dict)
    """
    for year_settings in scen_settings_dict.values():
        """
        # for testing:
        gen_df = gc.all_resources.copy()
        """
        gc.settings = year_settings
        gen_df = gc.create_all_generators().copy()

        # identify existing and new-build for reference later
        # (these could overlap in principle, but don't as of Feb. 2024)
        if gc.current_gens:
            gen_df["existing"] = gen_df["Resource"].isin(
                gc.existing_resources["Resource"]
            )
        else:  # must all be new
            gen_df["existing"] = False
        gen_df["new_build"] = gen_df["Resource"].isin(gc.new_resources["Resource"])

        # clean up some resource and technology labels
        gen_df["Resource"] = gen_df["Resource"].str.rstrip("_")
        gen_df["technology"] = gen_df["technology"].str.rstrip("_")

        # gather some extra data from PowerGenome
        gen_df = add_misc_gen_values(gen_df, year_settings)
        gen_df = hydro_energy_to_power(
            gen_df,
            year_settings.get("hydro_factor"),
            year_settings.get("regional_hydro_factor", {}),
        )
        gen_df = add_co2_costs_to_o_m(gen_df)

        # Apply capacity derating if needed (e.g., to get the right average
        # output for small hydro). Switch restricts output via
        # gen_forced_outage_rate, so we supersede the previous forced outage
        # rate; we assume the capacity_factor is based on historical output,
        # including the effect of forced outages.
        if year_settings.get("derate_capacity"):
            derate = gen_df["technology"].isin(year_settings.get("derate_techs", []))
            gen_df.loc[derate, "gen_forced_outage_rate"] = 1 - gen_df.loc[
                derate, "capacity_factor"
            ].fillna(1).clip(0, 1)

        # If running an operation model, only consider existing projects. This
        # is rarely used; normally we set up operation models based on solved
        # capacity-planning models, but if specified, we drop the option for
        # new gens.
        if year_settings.get("operation_model"):
            gen_df = gen_df.loc[gen_df["existing"], :]
            # make sure new_build is turned off for any that overlap
            gen_df["new_build"] = False

        # in greenfield scenarios, Existing_Cap_MW might be omitted
        if "Existing_Cap_MW" not in gen_df.columns:
            gen_df["Existing_Cap_MW"] = float("nan")

        # identify storage gens for the next few steps
        storage_gens = gen_df["STOR"].astype(bool)

        # Use $0 as capex and fixed O&M for existing plants (our settings don't
        # have all of these for existing plants as of Mar 2024)
        for c in ["capex_mw", "Fixed_OM_Cost_per_MWyr"]:
            gen_df[c] = gen_df[c].fillna(0)
        for c in ["capex_mwh", "Fixed_OM_Cost_per_MWhyr"]:
            gen_df.loc[storage_gens, c] = gen_df.loc[storage_gens, c].fillna(0)

        # use 1 as regional_cost_multiplier if not specified (i.e., for existing gens)
        gen_df["regional_cost_multiplier"] = gen_df["regional_cost_multiplier"].fillna(
            1
        )

        # Remove storage-related params for non-storage gens (we get a lot of
        # these as of Mar 2024)
        gen_df.loc[
            ~storage_gens,
            ["Existing_Cap_MWh", "capex_mwh", "Fixed_OM_Cost_per_MWhyr"],
        ] = None

        # record which model year these generators could be used in
        gen_df["model_year"] = year_settings["model_year"]
        gen_dfs.append(gen_df)

        # find build_year, capacity_mw and capacity_mwh for existing generating
        # units online in this model_year for each gen cluster
        eia_unit_info = eia_build_info(gc)
        unit_df = gen_df.merge(eia_unit_info, on="Resource", how="left")
        unit_dfs.append(unit_df)

    gc.settings = orig_gc_settings

    gens_by_model_year = pd.concat(gen_dfs, ignore_index=True)
    units_by_model_year = pd.concat(unit_dfs, ignore_index=True)

    # Set the same info as eia_build_info() (build_year, capacity_mw and
    # capacity_mwh) for generic generators (Resources in the "existing" list
    # that didn't get matching record(s) from the eia_unit_info, currently only
    # distributed generation). We do this after the loop so we can infer a
    # sequence of capacity additions that results in the available capacity
    # reported for each model year.
    generic = units_by_model_year["existing"] & units_by_model_year["build_year"].isna()
    generic_units = units_by_model_year[generic].drop(
        columns=["plant_gen_id", "build_year", "capacity_mw", "capacity_mwh"]
    )
    generic_units = generic_units.merge(
        generic_gen_build_info(generic_units, first_value(scen_settings_dict)),
        on="Resource",
        how="left",
    )
    units_by_model_year = (
        pd.concat([units_by_model_year[~generic], generic_units])
        .sort_values(["Resource", "model_year", "build_year"])
        .reset_index()
    )
    assert (
        units_by_model_year.query("existing")["build_year"].notna().all()
    ), "Some existing generating units have no build_year assigned."

    # In PowerGenome, Fixed_OM_Cost_per_MWyr, Var_OM_Cost_per_MWh and
    # Fixed_OM_Cost_per_MWhyr vary by model year, not build year. So we
    # calculate averages across model years to use for all build years, but
    # also leave the per-model-year values to use as an ancillary input. (The
    # averages are Fixed_OM_Cost_per_MWyr_mean, Var_OM_Cost_per_MWh_mean and
    # Fixed_OM_Cost_per_MWhyr_mean.)
    for col in [
        "Fixed_OM_Cost_per_MWyr",
        "Var_OM_Cost_per_MWh",
        "Fixed_OM_Cost_per_MWhyr",
    ]:
        mean = gens_by_model_year.groupby("Resource")[col].mean()
        gens_by_model_year[col + "_mean"] = gens_by_model_year["Resource"].map(mean)

    # create by_build_year tables from these

    # Merge the repeated records for existing gens from different model years.
    # We take the first row found for non-numeric columns and the mean for
    # numeric columns, since values may vary across model years (e.g.,
    # capacity_mw for a single unit can change between model years due to
    # derating by the cluster average capacity factor, since the cluster makeup
    # changes over time; fixed O&M rises over time for some plants).
    numeric_cols = unit_df.select_dtypes(include="number").columns.drop("build_year")
    dup_rules = {
        c: "mean" if c in numeric_cols else "first" for c in units_by_model_year.columns
    }
    unit_info = units_by_model_year.groupby(
        ["Resource", "plant_gen_id", "build_year"], as_index=False
    ).agg(dup_rules)
    # average model_year is not meaningful
    unit_info = unit_info.drop(columns=["model_year"])

    # Aggregate by build_year. This could in theory be done with
    # groupby()[columns].sum(), but then pandas 1.4.4 sometimes drops
    # capacity_mw for this dataframe (it seems to happen with columns where
    # one's name is a shorter version of the other's, and can't be reproduced
    # if you save the table as .csv and read it back in).
    build_year_info = unit_info.groupby(["Resource", "build_year"], as_index=False).agg(
        {"capacity_mw": "sum", "capacity_mwh": "sum"}
    )

    # Existing gen clusters are duplicated across model years, so we first
    # consolidate to one row per resource, then replicate data for each
    # resource/build_year combo.
    existing_gens = gens_by_model_year.query("existing").drop_duplicates(
        subset="Resource", keep="first"
    )
    # turn off "new_build" flag if set; those will be duplicated in
    # new_gens_by_build_year
    existing_gens["new_build"] = False
    existing_gens_by_build_year = existing_gens.merge(
        build_year_info, on="Resource", how="left"
    )

    # Create a dataframe showing when the new generators can be built,
    # consolidated by build year instead of model year. This is simple, since
    # for new gens the build_year is the same as the model_year (and that's
    # the year for which costs are shown).
    new_gens_by_build_year = gens_by_model_year.query("new_build")
    new_gens_by_build_year["build_year"] = new_gens_by_build_year["model_year"]
    # turn off "existing" flag if set; those are duplicated in
    # existing_gens_by_build_year
    new_gens_by_build_year["existing"] = False

    gens_by_build_year = pd.concat(
        [existing_gens_by_build_year, new_gens_by_build_year], ignore_index=True
    )

    # Remove storage-related params (always 0?) for non-storage gens
    # (could be done in the loop above, and in theory the sums would come out
    # as NaNs, but in practice they sometimes come out as 0 instead)
    gens_by_build_year.loc[
        gens_by_build_year["STOR"] == 0,
        [
            "Existing_Cap_MWh",
            "capex_mwh",
            "Fixed_OM_Cost_per_MWhyr",
            "capacity_mwh",
        ],
    ] = None

    assert (
        gens_by_build_year["new_build"] & gens_by_build_year["Existing_Cap_MW"].notna()
    ).sum() == 0, "Some new-build generators have Existing_Cap_MW assigned."

    return gens_by_model_year, gens_by_build_year
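
# Sketch of the de-duplication rule built in gen_tables() (toy data): for a
# unit that appears in several model years, numeric columns are averaged and
# all others keep the first value.
#
#   df = pd.DataFrame(
#       {"Resource": ["gen_a", "gen_a"], "capacity_mw": [10.0, 11.0],
#        "technology": ["hydro", "hydro"]}
#   )
#   df.groupby("Resource", as_index=False).agg(
#       {"capacity_mw": "mean", "technology": "first"}
#   )
#   # -> Resource  capacity_mw technology
#   #    gen_a            10.5      hydro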


def set_retirement_age(df, settings):
    # Set retirement age (500 years if not specified). This uses the same logic
    # as powergenome.generators.label_retirement_year(), which doesn't seem to
    # get called for a lot of the generators we are using.
    # Note: in the economic-retirement cases, retirement_ages is set as ~ in the
    # .yml files, which comes back as None instead of a missing entry or empty
    # dict, so we work around that.
    retirement_ages = settings.get("retirement_ages") or {}
    df["retirement_age"] = df["technology"].map(retirement_ages).fillna(500)
    return df
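
# Minimal illustration of the map/fillna pattern in set_retirement_age()
# (toy values, not from any settings file):
#
#   pd.Series(["Coal", "Solar"]).map({"Coal": 60}).fillna(500)
#   # -> 0     60.0
#   #    1    500.0
#   #    dtype: float64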


def eia_build_info(gc: GeneratorClusters):
    """
    Return a dataframe showing Resource, plant_gen_id, build_year, capacity_mw
    and capacity_mwh for all EIA generating units that were aggregated for the
    previous call to gc.create_all_generators().

    Note: capacity_mw will be de-rated according to the unit's average capacity
    factor if specified in gc.settings (typical for small hydro, geothermal,
    possibly biomass).

    Inputs:
    - gc: GeneratorClusters object previously used to call
      gc.create_all_generators
    """
    units = gc.all_units.copy()

    if "Resource" not in units.columns:
        # PowerGenome before March 2024: construct a resource ID the same way
        # PowerGenome does when "extra_outputs" is set
        units["Resource"] = (
            units["model_region"]
            + "_"
            + snake_case_col(units["technology"])
            + "_"
            + units["cluster"].astype(str)
        )

    # assign a unique ID for each unit, for de-duplication later
    units = create_plant_gen_id(units)

    # Set retirement age (units has a retirement_age column, but it's not
    # completely filled in). We do this temporarily here so we can
    # back-calculate the right in-service year to assign.
    set_retirement_age(units, gc.settings)

    # drop any with no retirement year (generally only ~1 planned builds that
    # don't have an online date, so PG couldn't assign a retirement date)
    units = units.query("retirement_year.notna()")

    # Use object attribute -- set in main() -- to determine if PG bug should be replicated