Skip to content

Commit e1bfdc6

Browse files
committed
Produce csv files for machine readability
1 parent 5c5dfc3 commit e1bfdc6

File tree

7 files changed

+133
-13
lines changed

7 files changed

+133
-13
lines changed

machine_readable_data/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This directory contains CSV files whose contents match the tables on the e3sm_data_docs web pages.
2+
These CSV files are intended to be machine-readable, whereas the web pages are intended to be human-readable.

machine_readable_data/v1_WaterCycle_simulations.csv

Lines changed: 91 additions & 0 deletions
Large diffs are not rendered by default.

utils/generate_html.bash

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
pr_num=60
2-
try_num=11
1+
pr_num=61
2+
try_num=5
33

44
# Chrysalis
55
#destination_dir=/lcrc/group/e3sm/public_html/diagnostic_output/$USER/data_docs_${pr_num}

utils/generate_tables.py

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from typing import Dict, List, Tuple
77
import urllib.parse
88

9-
# Functions to compute fields for simulations ###########################################
9+
# Functions to compute fields for simulations #################################
1010
def get_data_size_and_hpss(hpss_path: str) -> Tuple[str, str]:
1111
"""Get the data size in TB"""
1212
is_symlink: bool = check_if_symlink(hpss_path)
@@ -139,7 +139,7 @@ def get_run_script_reproduction(model_version: str, simulation_name: str) -> str
139139
run_script_reproduction = ""
140140
return run_script_reproduction
141141

142-
# Define simulations and their grouping ###########################################
142+
# Define simulations and their grouping #######################################
143143
class Simulation(object):
144144
def __init__(self, simulation_dict):
145145
self.model_version = simulation_dict["model_version"]
@@ -183,10 +183,20 @@ def __init__(self, simulation_dict):
183183
if not self.run_script_original:
184184
self.run_script_original = "N/A"
185185

186-
def get_row(self, output_file):
187-
if output_file.endswith("simulation_table.rst"):
188-
return [self.simulation_name, self.data_size, self.esgf, self.hpss]
189-
elif output_file.endswith("reproduction_table.rst"):
186+
def get_row(self, output_file, minimal_content: bool = False) -> List[str]:
187+
if "simulation" in output_file:
188+
row = [self.simulation_name, self.data_size, self.esgf, self.hpss]
189+
if minimal_content:
190+
match_object: re.Match = re.match("`.*<(.*)>`_", self.esgf)
191+
if match_object:
192+
row[2] = match_object.group(1) # Extract URL from the esgf link
193+
if self.hpss.startswith("(symlink) "):
194+
# Remove the "(symlink) " prefix from the HPSS path:
195+
# the csv output is meant to be read by a computer,
196+
# so it should contain only the path itself.
197+
row[3] = row[3].replace("(symlink) ", "")
198+
return row
199+
elif "reproduction" in output_file:
190200
return [self.simulation_name, self.machine, self.checksum, self.run_script_reproduction, self.run_script_original]
191201
else:
192202
raise RuntimeError(f"Invalid output_file={output_file}")
@@ -223,7 +233,7 @@ def __init__(self, name):
223233
def append(self, group):
224234
self.groups.update([(group.name, group)])
225235

226-
# Construct simulations ###########################################
236+
# Construct simulations #######################################################
227237

228238
def read_simulations(csv_file):
229239
# model_version > group > resolution > category > simulation_name,
@@ -284,7 +294,18 @@ def read_simulations(csv_file):
284294
c.simulations.update([(s.simulation_name, s)])
285295
return versions
286296

287-
# Construct table display of simulations ###########################################
297+
# Construct output csv ########################################################
298+
299+
def construct_output_csv(resolutions: OrderedDict[str, Category], header_cells: List[str], output_file: str):
300+
with open(output_file, 'w', newline='', encoding='utf-8') as f:
301+
writer = csv.writer(f)
302+
writer.writerow(header_cells)
303+
for resolution in resolutions.values():
304+
for category in resolution.categories.values():
305+
for simulation in category.simulations.values():
306+
writer.writerow(simulation.get_row(output_file, minimal_content=True))
307+
308+
# Construct table display of simulations ######################################
288309
def pad_cells(cells: List[str], col_divider: str, cell_paddings: List[int]) -> str:
289310
string = col_divider
290311
for i in range(len(cells)):
@@ -328,19 +349,25 @@ def generate_table(page_type: str, resolutions: OrderedDict[str, Category], head
328349
def construct_pages(csv_file: str, model_version: str, group_name: str, include_reproduction_scripts: bool = False):
329350
versions: OrderedDict[str, ModelVersion] = read_simulations(csv_file)
330351
resolutions: OrderedDict[str, Category] = versions[model_version].groups[group_name].resolutions
352+
header_cells: List[str] = ["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path"]
353+
construct_output_csv(resolutions, header_cells, f"../machine_readable_data/{model_version}_{group_name}_simulations.csv")
354+
print(f"csv of the simulations will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/{model_version}_{group_name}_simulations.csv")
331355
# TODO: add proper subdirs and index.rst files in docs/
332356
generate_table(
333357
f"{model_version} {group_name} simulation table",
334358
resolutions,
335-
["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path"],
359+
header_cells,
336360
f"../docs/source/{model_version}/{group_name}/simulation_data/simulation_table.rst",
337361
[85, 15, 400, 140]
338362
)
339363
if include_reproduction_scripts:
364+
header_cells_reproduction: List[str] = ["Simulation", "Machine", "10 day checksum", "Reproduction Script", "Original Script (requires significant changes to run!!)",]
365+
construct_output_csv(resolutions, header_cells_reproduction, f"../machine_readable_data/{model_version}_{group_name}_reproductions.csv")
366+
print(f"csv of the reproductions will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/{model_version}_{group_name}_reproductions.csv")
340367
generate_table(
341368
f"{model_version} {group_name} reproduction table",
342369
resolutions,
343-
["Simulation", "Machine", "10 day checksum", "Reproduction Script", "Original Script (requires significant changes to run!!)",],
370+
header_cells_reproduction,
344371
f"../docs/source/{model_version}/{group_name}/reproducing_simulations/reproduction_table.rst",
345372
# TODO: The script boxes have to be 200 characters wide to fit in the links...
346373
# This is unfortunate because the actual displayed text is quite short.
@@ -356,4 +383,4 @@ def construct_pages(csv_file: str, model_version: str, group_name: str, include_
356383
# Sources for v1 data
357384
# https://acme-climate.atlassian.net/wiki/spaces/ED/pages/4495441922/V1+Simulation+backfill+WIP
358385
# https://acme-climate.atlassian.net/wiki/spaces/DOC/pages/1271169273/v1+High+Res+Coupled+Run+Output+HPSS+Archive
359-
construct_pages("simulations_v1_water_cycle.csv", "v1", "WaterCycle")
386+
construct_pages("input/simulations_v1_water_cycle.csv", "v1", "WaterCycle")
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)