66from typing import Dict , List , Tuple
77import urllib .parse
88
9- # Functions to compute fields for simulations ###########################################
9+ # Functions to compute fields for simulations #################################
1010def get_data_size_and_hpss (hpss_path : str ) -> Tuple [str , str ]:
1111 """Get the data size in TB"""
1212 is_symlink : bool = check_if_symlink (hpss_path )
@@ -139,7 +139,7 @@ def get_run_script_reproduction(model_version: str, simulation_name: str) -> str
139139 run_script_reproduction = ""
140140 return run_script_reproduction
141141
142- # Define simulations and their grouping ###########################################
142+ # Define simulations and their grouping #######################################
143143class Simulation (object ):
144144 def __init__ (self , simulation_dict ):
145145 self .model_version = simulation_dict ["model_version" ]
@@ -183,10 +183,20 @@ def __init__(self, simulation_dict):
183183 if not self .run_script_original :
184184 self .run_script_original = "N/A"
185185
def get_row(self, output_file, minimal_content: bool = False) -> List[str]:
    """Return this simulation's cells for one row of the table in `output_file`.

    The kind of row is chosen from `output_file`:

    * a "reproduction" file gets
      [simulation_name, machine, checksum, run_script_reproduction,
      run_script_original];
    * a "simulation" file gets [simulation_name, data_size, esgf, hpss].

    The file name (last path component) is inspected first, and
    "reproduction" is tested before "simulation". Testing the whole path for
    "simulation" first misclassifies reproduction tables that live under a
    directory such as ``reproducing_simulations/``.

    Parameters
    ----------
    output_file : str
        Path of the file the row is destined for.
    minimal_content : bool
        Only affects simulation rows. When True, the ESGF cell is reduced to
        the URL captured from its link markup (if the markup matches) and a
        leading "(symlink) " marker is removed from the HPSS cell.

    Raises
    ------
    RuntimeError
        If neither "reproduction" nor "simulation" occurs in `output_file`.
    """
    # Last path component, accepting both "/" and "\\" separators.
    file_name = output_file.replace("\\", "/").rsplit("/", 1)[-1]

    # Prefer the file name; fall back to the whole path so that callers that
    # only encode the kind in a directory name keep working.
    kind = None
    for candidate in (file_name, output_file):
        if "reproduction" in candidate:
            kind = "reproduction"
            break
        if "simulation" in candidate:
            kind = "simulation"
            break

    if kind == "reproduction":
        return [
            self.simulation_name,
            self.machine,
            self.checksum,
            self.run_script_reproduction,
            self.run_script_original,
        ]

    if kind == "simulation":
        row = [self.simulation_name, self.data_size, self.esgf, self.hpss]
        if minimal_content:
            # re.match returns None when the cell is not link markup; in that
            # case the cell is left untouched.
            match_object = re.match("`.*<(.*)>`_", self.esgf)
            if match_object:
                row[2] = match_object.group(1)  # Extract URL from the esgf link
            if self.hpss.startswith("(symlink) "):
                # Remove the symlink prefix from the HPSS path: the csv
                # output is read by a computer, which has no use for it.
                row[3] = row[3].replace("(symlink) ", "")
        return row

    raise RuntimeError(f"Invalid output_file={output_file}")
@@ -223,7 +233,7 @@ def __init__(self, name):
223233 def append (self , group ):
224234 self .groups .update ([(group .name , group )])
225235
226- # Construct simulations ###########################################
236+ # Construct simulations #######################################################
227237
228238def read_simulations (csv_file ):
229239 # model_version > group > resolution > category > simulation_name,
@@ -284,7 +294,18 @@ def read_simulations(csv_file):
284294 c .simulations .update ([(s .simulation_name , s )])
285295 return versions
286296
287- # Construct table display of simulations ###########################################
297+ # Construct output csv ########################################################
298+
def construct_output_csv(resolutions: OrderedDict[str, Category], header_cells: List[str], output_file: str):
    """Write a csv file at `output_file`: a header row, then one row per simulation.

    `header_cells` is written first. Every simulation found by walking
    resolutions -> categories -> simulations (in the mappings' iteration
    order) then contributes the row returned by
    `simulation.get_row(output_file, minimal_content=True)`.

    The file is opened for writing, so existing content at `output_file`
    is replaced.
    """
    # newline="" leaves line-ending handling to the csv writer.
    with open(output_file, "w", newline="", encoding="utf-8") as csv_handle:
        table = csv.writer(csv_handle)
        table.writerow(header_cells)
        # Rows are produced lazily and written in traversal order.
        table.writerows(
            entry.get_row(output_file, minimal_content=True)
            for res in resolutions.values()
            for cat in res.categories.values()
            for entry in cat.simulations.values()
        )
307+
308+ # Construct table display of simulations ######################################
288309def pad_cells (cells : List [str ], col_divider : str , cell_paddings : List [int ]) -> str :
289310 string = col_divider
290311 for i in range (len (cells )):
@@ -328,19 +349,25 @@ def generate_table(page_type: str, resolutions: OrderedDict[str, Category], head
328349def construct_pages (csv_file : str , model_version : str , group_name : str , include_reproduction_scripts : bool = False ):
329350 versions : OrderedDict [str , ModelVersion ] = read_simulations (csv_file )
330351 resolutions : OrderedDict [str , Category ] = versions [model_version ].groups [group_name ].resolutions
352+ header_cells : List [str ] = ["Simulation" , "Data Size (TB)" , "ESGF Links" , "HPSS Path" ]
353+ construct_output_csv (resolutions , header_cells , f"../machine_readable_data/{ model_version } _{ group_name } _simulations.csv" )
354+ print (f"csv of the simulations will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/{ model_version } _{ group_name } _simulations.csv" )
331355 # TODO: add proper subdirs and index.rst files in docs/
332356 generate_table (
333357 f"{ model_version } { group_name } simulation table" ,
334358 resolutions ,
335- [ "Simulation" , "Data Size (TB)" , "ESGF Links" , "HPSS Path" ] ,
359+ header_cells ,
336360 f"../docs/source/{ model_version } /{ group_name } /simulation_data/simulation_table.rst" ,
337361 [85 , 15 , 400 , 140 ]
338362 )
339363 if include_reproduction_scripts :
364+ header_cells_reproduction : List [str ] = ["Simulation" , "Machine" , "10 day checksum" , "Reproduction Script" , "Original Script (requires significant changes to run!!)" ,]
365+ construct_output_csv (resolutions , header_cells_reproduction , f"../machine_readable_data/{ model_version } _{ group_name } _reproductions.csv" )
366+ print (f"csv of the reproductions will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/{ model_version } _{ group_name } _reproductions.csv" )
340367 generate_table (
341368 f"{ model_version } { group_name } reproduction table" ,
342369 resolutions ,
343- [ "Simulation" , "Machine" , "10 day checksum" , "Reproduction Script" , "Original Script (requires significant changes to run!!)" ,] ,
370+ header_cells_reproduction ,
344371 f"../docs/source/{ model_version } /{ group_name } /reproducing_simulations/reproduction_table.rst" ,
345372 # TODO: The script boxes have to be 200 characters wide to fit in the links...
346373 # This is unfortunate because the actual displayed text is quite short.
@@ -356,4 +383,4 @@ def construct_pages(csv_file: str, model_version: str, group_name: str, include_
356383 # Sources for v1 data
357384 # https://acme-climate.atlassian.net/wiki/spaces/ED/pages/4495441922/V1+Simulation+backfill+WIP
358385 # https://acme-climate.atlassian.net/wiki/spaces/DOC/pages/1271169273/v1+High+Res+Coupled+Run+Output+HPSS+Archive
359- construct_pages ("simulations_v1_water_cycle.csv" , "v1" , "WaterCycle" )
386+ construct_pages ("input/ simulations_v1_water_cycle.csv" , "v1" , "WaterCycle" )
0 commit comments