E3SM-Project · forsyth2 · Jul 16, 2025 · Jul 15, 2025 · forsyth2 · Jul 15, 2025
diff --git a/machine_readable_data/README.md b/machine_readable_data/README.md
@@ -0,0 +1,2 @@
+This directory contains CSV files whose contents match the tables on the e3sm_data_docs web pages.
+These CSV files are meant for machine readability, whereas the web pages are meant for human readability.
diff --git a/machine_readable_data/v1_WaterCycle_simulations.csv b/machine_readable_data/v1_WaterCycle_simulations.csv
diff --git a/utils/generate_html.bash b/utils/generate_html.bash
@@ -1,5 +1,5 @@
-pr_num=60
-try_num=11
+pr_num=61
+try_num=5
 
 # Chrysalis
 #destination_dir=/lcrc/group/e3sm/public_html/diagnostic_output/$USER/data_docs_${pr_num}

diff --git a/utils/generate_tables.py b/utils/generate_tables.py
@@ -6,7 +6,7 @@
 from typing import Dict, List, Tuple
 import urllib.parse
 
-# Functions to compute fields for simulations ###########################################
+# Functions to compute fields for simulations #################################
 def get_data_size_and_hpss(hpss_path: str) -> Tuple[str, str]:
         """Get the data size in TB"""
         is_symlink: bool = check_if_symlink(hpss_path)
@@ -139,7 +139,7 @@ def get_run_script_reproduction(model_version: str, simulation_name: str) -> str
         run_script_reproduction = ""
     return run_script_reproduction
 
-# Define simulations and their grouping ###########################################
+# Define simulations and their grouping #######################################
 class Simulation(object):
     def __init__(self, simulation_dict):
         self.model_version = simulation_dict["model_version"]
@@ -183,10 +183,20 @@ def __init__(self, simulation_dict):
         if not self.run_script_original:
             self.run_script_original = "N/A"
 
-    def get_row(self, output_file):
-        if output_file.endswith("simulation_table.rst"):
-            return [self.simulation_name, self.data_size, self.esgf, self.hpss]
-        elif output_file.endswith("reproduction_table.rst"):
+    def get_row(self, output_file, minimal_content: bool = False) -> List[str]:
+        """Return this simulation's cells for one row of the table written to output_file.
+
+        output_file: path of the file being generated; its file name selects the
+            row layout ("simulation" or "reproduction").
+        minimal_content: if True, a simulation row carries only the URL captured
+            from the esgf link (when the link pattern matches) and the HPSS path
+            without its "(symlink) " prefix. Used for the csv output.
+        Raises RuntimeError if the file name matches neither layout.
+        """
+        # Inspect only the file name: the reproduction table is written under
+        # ".../reproducing_simulations/", so searching the whole path for
+        # "simulation" would wrongly select the simulation layout for it.
+        file_name = output_file.rsplit("/", 1)[-1]
+        if "simulation" in file_name:
+            row = [self.simulation_name, self.data_size, self.esgf, self.hpss]
+            if minimal_content:
+                # re.match returns None when esgf is not of the form `text <url>`_
+                match_object = re.match(r"`.*<(.*)>`_", self.esgf)
+                if match_object:
+                    row[2] = match_object.group(1)  # Extract URL from the esgf link
+                if self.hpss.startswith("(symlink) "):
+                    # Remove symlink prefix for the HPSS path
+                    # Since we don't want that in the csv output,
+                    # which a computer reads.
+                    row[3] = row[3].replace("(symlink) ", "")
+            return row
+        elif "reproduction" in file_name:
             return [self.simulation_name, self.machine, self.checksum, self.run_script_reproduction, self.run_script_original]
         else:
             raise RuntimeError(f"Invalid output_file={output_file}")
@@ -223,7 +233,7 @@ def __init__(self, name):
     def append(self, group):
         self.groups.update([(group.name, group)])
 
-# Construct simulations ###########################################
+# Construct simulations #######################################################
 
 def read_simulations(csv_file):
     # model_version > group > resolution > category > simulation_name, 
@@ -284,7 +294,18 @@ def read_simulations(csv_file):
             c.simulations.update([(s.simulation_name, s)])
     return versions
 
-# Construct table display of simulations ###########################################
+# Construct output csv ########################################################
+
+def construct_output_csv(resolutions: OrderedDict[str, Category], header_cells: List[str], output_file: str):
+    """Write header_cells, then one csv row per simulation, to output_file.
+
+    Simulations are visited by walking resolutions -> categories -> simulations
+    in their stored order; each row is that simulation's
+    get_row(output_file, minimal_content=True).
+    """
+    with open(output_file, 'w', newline='', encoding='utf-8') as f:
+        csv_writer = csv.writer(f)
+        csv_writer.writerow(header_cells)
+        # Lazy generator: rows are produced and written one at a time.
+        csv_writer.writerows(
+            sim.get_row(output_file, minimal_content=True)
+            for res in resolutions.values()
+            for cat in res.categories.values()
+            for sim in cat.simulations.values()
+        )
+
+# Construct table display of simulations ######################################
 def pad_cells(cells: List[str], col_divider: str, cell_paddings: List[int]) -> str:
     string = col_divider
     for i in range(len(cells)):
@@ -328,19 +349,25 @@ def generate_table(page_type: str, resolutions: OrderedDict[str, Category], head
 def construct_pages(csv_file: str, model_version: str, group_name: str, include_reproduction_scripts: bool = False):
     versions: OrderedDict[str, ModelVersion] = read_simulations(csv_file)
     resolutions: OrderedDict[str, Category] = versions[model_version].groups[group_name].resolutions
+    header_cells: List[str] = ["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path"]
+    construct_output_csv(resolutions, header_cells, f"../machine_readable_data/{model_version}_{group_name}_simulations.csv")
+    print(f"csv of the simulations will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/{model_version}_{group_name}_simulations.csv")
     # TODO: add proper subdirs and index.rst files in docs/
     generate_table(
         f"{model_version} {group_name} simulation table",
         resolutions,
-        ["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path"],
+        header_cells,
         f"../docs/source/{model_version}/{group_name}/simulation_data/simulation_table.rst",
         [85, 15, 400, 140]
     )
     if include_reproduction_scripts:
+        header_cells_reproduction: List[str] = ["Simulation", "Machine", "10 day checksum", "Reproduction Script", "Original Script (requires significant changes to run!!)",]
+        construct_output_csv(resolutions, header_cells_reproduction, f"../machine_readable_data/{model_version}_{group_name}_reproductions.csv")
+        print(f"csv of the reproductions will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/{model_version}_{group_name}_reproductions.csv")
         generate_table(
             f"{model_version} {group_name} reproduction table",
             resolutions,
-            ["Simulation", "Machine", "10 day checksum", "Reproduction Script", "Original Script (requires significant changes to run!!)",],
+            header_cells_reproduction,
             f"../docs/source/{model_version}/{group_name}/reproducing_simulations/reproduction_table.rst",
             # TODO: The script boxes have to be 200 characters wide to fit in the links...
             # This is unfortunate because the actual displayed text is quite short.
@@ -356,4 +383,4 @@ def construct_pages(csv_file: str, model_version: str, group_name: str, include_
     # Sources for v1 data
     # https://acme-climate.atlassian.net/wiki/spaces/ED/pages/4495441922/V1+Simulation+backfill+WIP
     # https://acme-climate.atlassian.net/wiki/spaces/DOC/pages/1271169273/v1+High+Res+Coupled+Run+Output+HPSS+Archive 
-    construct_pages("simulations_v1_water_cycle.csv", "v1", "WaterCycle")
+    construct_pages("input/simulations_v1_water_cycle.csv", "v1", "WaterCycle")
diff --git a/utils/simulations_v1_water_cycle.csv → utils/input/simulations_v1_water_cycle.csv b/utils/simulations_v1_water_cycle.csv → utils/input/simulations_v1_water_cycle.csv
diff --git a/utils/simulations_v2_1.csv → utils/input/simulations_v2_1.csv b/utils/simulations_v2_1.csv → utils/input/simulations_v2_1.csv
diff --git a/utils/simulations_v2_water_cycle.csv → utils/input/simulations_v2_water_cycle.csv b/utils/simulations_v2_water_cycle.csv → utils/input/simulations_v2_water_cycle.csv
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		This directory contains the csv files that have output matching the tables on the e3sm_data_docs webpage.
Review comment by forsyth2 (Collaborator, Author), Jul 15, 2025: The `csv` files are placed in this directory. That means they will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/. If you approve of the PR's output, I can produce the `csv` files for v2 and v2.1 too, if needed.
		These csv files are meant for machine readability whereas the web pages are meant for human readability.