Skip to content

Commit 2695f8d

Browse files
committed
add HPSS url column to data table
1 parent c88e812 commit 2695f8d

File tree

2 files changed

+27
-3
lines changed

2 files changed

+27
-3
lines changed

docs/source/v2/WaterCycle/simulation_data/index.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ For more information, refer to `zstash usage <https://e3sm-project.github.io/zst
4242
`web interface <https://portal.nersc.gov/archive/home/projects/e3sm/www/WaterCycle/E3SMv2>`_.
4343
Note that this will be slow and inefficient since you'll have to download the tar files.
4444

45+
**Tip for users without NERSC access**: Before downloading large tar files, first download the ``index.db`` file and use the ``sqlite3`` command-line tool to inspect the archive contents: ::
46+
47+
sqlite3 index.db "SELECT tar,name,size from files;" > archive_contents.txt
48+
49+
The resulting ``archive_contents.txt`` lists each file with its containing tar and size, so you can identify which specific tar files hold the data you need before downloading anything.
50+
4551
**v2.LR** simulations data has been archived on NERSC HPSS under: ::
4652

4753
/home/projects/e3sm/www/WaterCycle/E3SMv2/LR

utils/generate_tables.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,9 @@ def __init__(self, simulation_dict):
173173

174174
self.esgf = get_esgf(self.model_version, self.resolution, self.simulation_name, self.experiment, self.ensemble_num, self.link_type, self.node)
175175

176+
# Generate web interface URL from HPSS path
177+
self.web_interface = self.get_web_interface_url()
178+
176179
self.run_script_original = get_run_script_original(self.model_version, self.simulation_name)
177180
self.run_script_reproduction = get_run_script_reproduction(self.model_version, self.simulation_name)
178181

@@ -183,9 +186,20 @@ def __init__(self, simulation_dict):
183186
if not self.run_script_original:
184187
self.run_script_original = "N/A"
185188

189+
def get_web_interface_url(self) -> str:
    """Build an RST hyperlink to the NERSC web-portal mirror of this
    simulation's HPSS archive.

    Returns an empty string when either the HPSS path or the data size
    is missing, so rows without archived data render no link.
    """
    # Guard clause: only emit a link when both fields are populated.
    if not (self.hpss and self.data_size):
        return ""
    # Some HPSS entries carry a "(symlink) " prefix; strip it before
    # mapping the path onto the portal, e.g.
    #   /home/projects/e3sm/www/CoupledSystem/E3SMv3/LR/v3.LR.piControl
    #   -> https://portal.nersc.gov/archive/home/projects/e3sm/www/CoupledSystem/E3SMv3/LR/v3.LR.piControl
    portal_path = self.hpss.replace("(symlink) ", "")
    # Each simulation's full HPSS path yields a distinct portal URL.
    return f"`HPSS URL <https://portal.nersc.gov/archive{portal_path}>`_"
199+
186200
def get_row(self, output_file, minimal_content: bool = False) -> List[str]:
187201
if "simulation" in output_file:
188-
row = [self.simulation_name, self.data_size, self.esgf, self.hpss]
202+
row = [self.simulation_name, self.data_size, self.esgf, self.hpss, self.web_interface]
189203
if minimal_content:
190204
match_object: re.Match = re.match("`.*<(.*)>`_", self.esgf)
191205
if match_object:
@@ -195,6 +209,10 @@ def get_row(self, output_file, minimal_content: bool = False) -> List[str]:
195209
# Since we don't want that in the csv output,
196210
# which a computer reads.
197211
row[3] = row[3].replace("(symlink) ", "")
212+
# Extract web interface URL for CSV
213+
web_match: re.Match = re.match("`.*<(.*)>`_", self.web_interface)
214+
if web_match:
215+
row[4] = web_match.group(1) # Extract URL from the web interface link
198216
return row
199217
elif "reproduction" in output_file:
200218
return [self.simulation_name, self.machine, self.checksum, self.run_script_reproduction, self.run_script_original]
@@ -353,7 +371,7 @@ def generate_table(page_type: str, resolutions: OrderedDict[str, Category], head
353371
def construct_pages(csv_file: str, model_version: str, group_name: str, include_reproduction_scripts: bool = False):
354372
versions: OrderedDict[str, ModelVersion] = read_simulations(csv_file)
355373
resolutions: OrderedDict[str, Category] = versions[model_version].groups[group_name].resolutions
356-
header_cells: List[str] = ["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path"]
374+
header_cells: List[str] = ["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path", "HPSS URL"]
357375
construct_output_csv(resolutions, header_cells, f"../machine_readable_data/{model_version}_{group_name}_simulations.csv")
358376
print(f"csv of the simulations will be available at https://github.com/E3SM-Project/e3sm_data_docs/blob/main/machine_readable_data/{model_version}_{group_name}_simulations.csv")
359377
# TODO: add proper subdirs and index.rst files in docs/
@@ -362,7 +380,7 @@ def construct_pages(csv_file: str, model_version: str, group_name: str, include_
362380
resolutions,
363381
header_cells,
364382
f"../docs/source/{model_version}/{group_name}/simulation_data/simulation_table.rst",
365-
[85, 15, 400, 140]
383+
[65, 15, 300, 120, 80]
366384
)
367385
if include_reproduction_scripts:
368386
header_cells_reproduction: List[str] = ["Simulation", "Machine", "10 day checksum", "Reproduction Script", "Original Script (requires significant changes to run!!)",]

0 commit comments

Comments
 (0)