Skip to content

Commit 2695f8d

Browse files
committed
add HPSS url column to data table
1 parent c88e812 commit 2695f8d

File tree

2 files changed

+27
-3
lines changed

2 files changed

+27
-3
lines changed

docs/source/v2/WaterCycle/simulation_data/index.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ For more information, refer to `zstash usage <https://e3sm-project.github.io/zst
4242
`web interface <https://portal.nersc.gov/archive/home/projects/e3sm/www/WaterCycle/E3SMv2>`_.
4343
Note that this will be slow and inefficient since you'll have to download the tar files.
4444

45+
**Tip for users without NERSC access**: Before downloading large tar files, first download the ``index.db`` file and use the ``sqlite3`` command-line tool to inspect the archive contents: ::
46+
47+
sqlite3 index.db "SELECT tar,name,size from files;" > archive_contents.txt
48+
49+
The resulting ``archive_contents.txt`` lists each file with its containing tar and size, so you can identify which specific tar files hold the data you need before downloading anything.
50+
4551
**v2.LR** simulations data has been archived on NERSC HPSS under: ::
4652

4753
/home/projects/e3sm/www/WaterCycle/E3SMv2/LR

utils/generate_tables.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,9 @@ def __init__(self, simulation_dict):
173173

174174
self.esgf = get_esgf(self.model_version, self.resolution, self.simulation_name, self.experiment, self.ensemble_num, self.link_type, self.node)
175175

176+
# Generate web interface URL from HPSS path
177+
self.web_interface = self.get_web_interface_url()
178+
176179
self.run_script_original = get_run_script_original(self.model_version, self.simulation_name)
177180
self.run_script_reproduction = get_run_script_reproduction(self.model_version, self.simulation_name)
178181

@@ -183,9 +186,20 @@ def __init__(self, simulation_dict):
183186
if not self.run_script_original:
184187
self.run_script_original = "N/A"
185188

189+
def get_web_interface_url(self) -> str:
    """Build an RST hyperlink to the NERSC web-portal mirror of this
    simulation's HPSS archive.

    Returns an empty string when either the HPSS path or the data size
    is missing, so rows without archived data render no link.
    """
    # Guard clause: only emit a link when both fields are populated.
    if not (self.hpss and self.data_size):
        return ""
    # Some HPSS entries carry a "(symlink) " prefix; strip it before
    # mapping the path onto the portal, e.g.
    #   /home/projects/e3sm/www/CoupledSystem/E3SMv3/LR/v3.LR.piControl
    #   -> https://portal.nersc.gov/archive/home/projects/e3sm/www/CoupledSystem/E3SMv3/LR/v3.LR.piControl
    portal_path = self.hpss.replace("(symlink) ", "")
    # Each simulation's full HPSS path yields a distinct portal URL.
    return f"`HPSS URL <https://portal.nersc.gov/archive{portal_path}>`_"
199+
186200
def get_row(self, output_file, minimal_content: bool = False) -> List[str]:
187201
if "simulation" in output_file:
188-
row = [self.simulation_name, self.data_size, self.esgf, self.hpss]
202+
row = [self.simulation_name, self.data_size, self.esgf, self.hpss, self.web_interface]
189203
if minimal_content:
190204
match_object: re.Match = re.match("`.*<(.*)>`_", self.esgf)
191205
if match_object:
@@ -195,6 +209,10 @@ def get_row(self, output_file, minimal_content: bool = False) -> List[str]:
195209
# Since we don't want that in the csv output,
196210
# which a computer reads.
197211
row[3] = row[3].replace("(symlink) ", "")
212+
# Extract web interface URL for CSV
213+
web_match: re.Match = re.match("`.*<(.*)>`_", self.web_interface)
214+
if web_match:
215+
row[4] = web_match.group(1) # Extract URL from the web interface link
198216
return row
199217
elif "reproduction" in output_file:
200218
return [self.simulation_name, self.machine, self.checksum, self.run_script_reproduction, self.run_script_original]
@@ -353,7 +371,7 @@ def generate_table(page_type: str, resolutions: OrderedDict[str, Category], head
353371
def construct_pages(csv_file: str, model_version: str, group_name: str, include_reproduction_scripts: bool = False):
354372
versions: OrderedDict[str, ModelVersion] = read_simulations(csv_file)
355373
resolutions: OrderedDict[str, Category] = versions[model_version].groups[group_name].resolutions
356-
header_cells: List[str] = ["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path"]
374+
header_cells: List[str] = ["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path", "HPSS URL"]
357375
construct_output_csv(resolutions, header_cells, f"../machine_readable_data/{model_version}_{group_name}_simulations.csv")
358376
print(f"csv of the simulations will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/{model_version}_{group_name}_simulations.csv")
359377
# TODO: add proper subdirs and index.rst files in docs/
@@ -362,7 +380,7 @@ def construct_pages(csv_file: str, model_version: str, group_name: str, include_
362380
resolutions,
363381
header_cells,
364382
f"../docs/source/{model_version}/{group_name}/simulation_data/simulation_table.rst",
365-
[85, 15, 400, 140]
383+
[65, 15, 300, 120, 80]
366384
)
367385
if include_reproduction_scripts:
368386
header_cells_reproduction: List[str] = ["Simulation", "Machine", "10 day checksum", "Reproduction Script", "Original Script (requires significant changes to run!!)",]

0 commit comments

Comments
 (0)