Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions machine_readable_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
This directory contains the CSV files whose contents match the tables rendered on the e3sm_data_docs web pages.
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The csv files are placed in this directory. That means they will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/. If you approve of the PR's output, I can produce the csv files for v2 and v2.1 too, if needed.

These csv files are meant for machine readability whereas the web pages are meant for human readability.
91 changes: 91 additions & 0 deletions machine_readable_data/v1_WaterCycle_simulations.csv

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions utils/generate_html.bash
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pr_num=60
try_num=11
pr_num=61
try_num=5

# Chrysalis
#destination_dir=/lcrc/group/e3sm/public_html/diagnostic_output/$USER/data_docs_${pr_num}
Expand Down
49 changes: 38 additions & 11 deletions utils/generate_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Dict, List, Tuple
import urllib.parse

# Functions to compute fields for simulations ###########################################
# Functions to compute fields for simulations #################################
def get_data_size_and_hpss(hpss_path: str) -> Tuple[str, str]:
"""Get the data size in TB"""
is_symlink: bool = check_if_symlink(hpss_path)
Expand Down Expand Up @@ -139,7 +139,7 @@ def get_run_script_reproduction(model_version: str, simulation_name: str) -> str
run_script_reproduction = ""
return run_script_reproduction

# Define simulations and their grouping ###########################################
# Define simulations and their grouping #######################################
class Simulation(object):
def __init__(self, simulation_dict):
self.model_version = simulation_dict["model_version"]
Expand Down Expand Up @@ -183,10 +183,20 @@ def __init__(self, simulation_dict):
if not self.run_script_original:
self.run_script_original = "N/A"

def get_row(self, output_file):
if output_file.endswith("simulation_table.rst"):
return [self.simulation_name, self.data_size, self.esgf, self.hpss]
elif output_file.endswith("reproduction_table.rst"):
def get_row(self, output_file, minimal_content: bool = False) -> List[str]:
    """Return this simulation's table row for the given output file.

    The row layout is chosen from the output file name: names containing
    "simulation" get the data/ESGF/HPSS columns, names containing
    "reproduction" get the machine/checksum/script columns.

    When minimal_content is True (used for the machine-readable csv output),
    human-oriented markup is stripped: the bare URL is extracted from the
    RST-style ESGF link and the "(symlink) " prefix is removed from the
    HPSS path.
    """
    if "simulation" in output_file:
        cells = [self.simulation_name, self.data_size, self.esgf, self.hpss]
        if minimal_content:
            # Pull the raw URL out of an RST link of the form `text <url>`_
            url_match = re.match("`.*<(.*)>`_", self.esgf)
            if url_match is not None:
                cells[2] = url_match.group(1)
            # A computer reading the csv wants the plain HPSS path,
            # not the "(symlink) " annotation shown on the web page.
            if self.hpss.startswith("(symlink) "):
                cells[3] = cells[3].replace("(symlink) ", "")
        return cells
    if "reproduction" in output_file:
        return [self.simulation_name, self.machine, self.checksum, self.run_script_reproduction, self.run_script_original]
    raise RuntimeError(f"Invalid output_file={output_file}")
Expand Down Expand Up @@ -223,7 +233,7 @@ def __init__(self, name):
def append(self, group):
    """Register *group* in this container, keyed by its name.

    NOTE(review): assumes self.groups is a dict-like mapping — TODO confirm
    against the (not shown here) __init__.
    """
    self.groups[group.name] = group

# Construct simulations ###########################################
# Construct simulations #######################################################

def read_simulations(csv_file):
# model_version > group > resolution > category > simulation_name,
Expand Down Expand Up @@ -284,7 +294,18 @@ def read_simulations(csv_file):
c.simulations.update([(s.simulation_name, s)])
return versions

# Construct table display of simulations ###########################################
# Construct output csv ########################################################

def construct_output_csv(resolutions: OrderedDict[str, Category], header_cells: List[str], output_file: str):
    """Write a machine-readable csv: one header row, then one row per simulation.

    Walks resolution -> category -> simulation and asks each simulation for
    its minimal-content row (bare URLs and paths rather than RST markup),
    since the csv is meant for computers rather than the rendered web pages.
    """
    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        writer.writerow(header_cells)
        rows = (
            simulation.get_row(output_file, minimal_content=True)
            for resolution in resolutions.values()
            for category in resolution.categories.values()
            for simulation in category.simulations.values()
        )
        writer.writerows(rows)

# Construct table display of simulations ######################################
def pad_cells(cells: List[str], col_divider: str, cell_paddings: List[int]) -> str:
string = col_divider
for i in range(len(cells)):
Expand Down Expand Up @@ -328,19 +349,25 @@ def generate_table(page_type: str, resolutions: OrderedDict[str, Category], head
def construct_pages(csv_file: str, model_version: str, group_name: str, include_reproduction_scripts: bool = False):
versions: OrderedDict[str, ModelVersion] = read_simulations(csv_file)
resolutions: OrderedDict[str, Category] = versions[model_version].groups[group_name].resolutions
header_cells: List[str] = ["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path"]
construct_output_csv(resolutions, header_cells, f"../machine_readable_data/{model_version}_{group_name}_simulations.csv")
print(f"csv of the simulations will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/{model_version}_{group_name}_simulations.csv")
# TODO: add proper subdirs and index.rst files in docs/
generate_table(
Comment on lines +353 to 356
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Really, we're just creating two output types here (1. a csv for a computer to read and 2. a rst and ultimately html page for a human to read) from the same data in RAM. This implementation doesn't construct the HTML from the csv directly.

Copy link
Collaborator

@TonyB9000 TonyB9000 Jul 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@forsyth2 Understood. This will be great. (If the very same process is creating both, there should be little room for inconsistencies.)

The only fields I need are:

  1. A "unique key" to identify the archive (essentially the unique case_id)
  2. The Volume-in-TB field (to avoid download of M TB, when disk free space < M TB)
  3. The precise HPSS path (to feed "zstash --check --hpss=path") when zstash is ready to perform this transfer.

I made a wild guess for v3: It looks like this:

LR:DECK,v3.LR.piControl,1,na,na,/home/g/golaz/E3SMv3.LR/v3.LR.piControl
LR:DECK,v3.LR.1pctCO2_bcdt15m,1,na,na,/home/w/wlin/E3SMv3/v3.LR.1pctCO2_bcdt15m
LR:DECK,v3.LR.abrupt-4xCO2_bcdt15m,1,na,na,/home/w/wlin/E3SMv3/v3.LR.abrupt-4xCO2_bcdt15m
LR:Historical,v3.LR.historical_0051,1,na,na,/home/w/wlin/E3SMv3/v3.LR.historical_0051
LR:Historical,v3.LR.historical_0101,1,na,na,/home/w/wlin/E3SMv3/v3.LR.historical_0101
LR:Historical,v3.LR.historical_0151,1,na,na,/home/w/wlin/E3SMv3/v3.LR.historical_0151
LR:Historical,v3.LR.historical_0201,1,na,na,/home/w/wlin/E3SMv3/v3.LR.historical_0201
LR:Historical,v3.LR.historical_0251,1,na,na,/home/w/wlin/E3SMv3/v3.LR.historical_0251
LR:AMIP,v3.LR.amip_0101,1,na,na,/home/w/wlin/E3SMv3/AMIP/v3.LR.amip_0101
LR:AMIP,v3.LR.amip_0151,1,na,na,/home/w/wlin/E3SMv3/AMIP/v3.LR.amip_0151
LR:AMIP,v3.LR.amip_0201,1,na,na,/home/w/wlin/E3SMv3/AMIP/v3.LR.amip_0201
LR:DAMIP,v3.LR.hist-GHG_0101,1,na,na,/home/g/golaz/E3SMv3.LR/v3.LR.hist-GHG_0101
LR:DAMIP,v3.LR.hist-GHG_0151,1,na,na,/home/g/golaz/E3SMv3.LR/v3.LR.hist-GHG_0151
LR:DAMIP,v3.LR.hist-GHG_0201,1,na,na,/home/g/golaz/E3SMv3.LR/v3.LR.hist-GHG_0201
LR:DAMIP,v3.LR.hist-aer_0101,1,na,na,/home/g/golaz/E3SMv3.LR/v3.LR.hist-aer_0101
LR:DAMIP,v3.LR.hist-aer_0151,1,na,na,/home/g/golaz/E3SMv3.LR/v3.LR.hist-aer_0151
LR:DAMIP,v3.LR.hist-aer_0201,1,na,na,/home/g/golaz/E3SMv3.LR/v3.LR.hist-aer_0201
LR:DAMIP,v3.LR.hist-xGHG-xaer_0101,1,na,na,/home/g/golaz/E3SMv3.LR/v3.LR.hist-xGHG-xaer_0101
LR:DAMIP,v3.LR.hist-xGHG-xaer_0151,1,na,na,/home/g/golaz/E3SMv3.LR/v3.LR.hist-xGHG-xaer_0151
LR:DAMIP,v3.LR.hist-xGHG-xaer_0201,1,na,na,/home/g/golaz/E3SMv3.LR/v3.LR.hist-xGHG-xaer_0201
LR:RFMIP,v3.LR.piClim-control-iceini,1,na,na,/home/k/kaizhang/E3SM/E3SMv3/v3.LR.piClim-histall/v3.LR.piClim-control-iceini
LR:RFMIP,v3.LR.piClim-histall_0101,1,na,na,/home/k/kaizhang/E3SM/E3SMv3/v3.LR.piClim-histall/v3.LR.piClim-histall_0101
LR:RFMIP,v3.LR.piClim-histall_0151,1,na,na,/home/k/kaizhang/E3SM/E3SMv3/v3.LR.piClim-histall/v3.LR.piClim-histall_0151
LR:RFMIP,v3.LR.piClim-histall_0201,1,na,na,/home/k/kaizhang/E3SM/E3SMv3/v3.LR.piClim-histall/v3.LR.piClim-histall_0201
LR:RFMIP,v3.LR.piClim-histGHG_0101,1,na,na,/home/k/kaizhang/E3SM/E3SMv3/v3.LR.piClim-histGHG/v3.LR.piClim-histGHG_0101
LR:RFMIP,v3.LR.piClim-histGHG_0151,1,na,na,/home/k/kaizhang/E3SM/E3SMv3/v3.LR.piClim-histGHG/v3.LR.piClim-histGHG_0151
LR:RFMIP,v3.LR.piClim-histGHG_0201,1,na,na,/home/k/kaizhang/E3SM/E3SMv3/v3.LR.piClim-histGHG/v3.LR.piClim-histGHG_0201
LR:RFMIP,v3.LR.piClim-histaer_0101,1,na,na,/home/k/kaizhang/E3SM/E3SMv3/v3.LR.piClim-histaer/v3.LR.piClim-histaer_0101
LR:RFMIP,v3.LR.piClim-histaer_0151,1,na,na,/home/k/kaizhang/E3SM/E3SMv3/v3.LR.piClim-histaer/v3.LR.piClim-histaer_0151
LR:RFMIP,v3.LR.piClim-histaer_0201,1,na,na,/home/k/kaizhang/E3SM/E3SMv3/v3.LR.piClim-histaer/v3.LR.piClim-histaer_0201

f"{model_version} {group_name} simulation table",
resolutions,
["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path"],
header_cells,
f"../docs/source/{model_version}/{group_name}/simulation_data/simulation_table.rst",
[85, 15, 400, 140]
)
if include_reproduction_scripts:
header_cells_reproduction: List[str] = ["Simulation", "Machine", "10 day checksum", "Reproduction Script", "Original Script (requires significant changes to run!!)",]
construct_output_csv(resolutions, header_cells_reproduction, f"../machine_readable_data/{model_version}_{group_name}_reproductions.csv")
print(f"csv of the reproductions will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/{model_version}_{group_name}_reproductions.csv")
generate_table(
f"{model_version} {group_name} reproduction table",
resolutions,
["Simulation", "Machine", "10 day checksum", "Reproduction Script", "Original Script (requires significant changes to run!!)",],
header_cells_reproduction,
f"../docs/source/{model_version}/{group_name}/reproducing_simulations/reproduction_table.rst",
# TODO: The script boxes have to be 200 characters wide to fit in the links...
# This is unfortunate because the actual displayed text is quite short.
Expand All @@ -356,4 +383,4 @@ def construct_pages(csv_file: str, model_version: str, group_name: str, include_
# Sources for v1 data
# https://acme-climate.atlassian.net/wiki/spaces/ED/pages/4495441922/V1+Simulation+backfill+WIP
# https://acme-climate.atlassian.net/wiki/spaces/DOC/pages/1271169273/v1+High+Res+Coupled+Run+Output+HPSS+Archive
construct_pages("simulations_v1_water_cycle.csv", "v1", "WaterCycle")
construct_pages("input/simulations_v1_water_cycle.csv", "v1", "WaterCycle")
File renamed without changes.