Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ conda-build/

*~
utils/out.txt
utils/out_*.txt
utils/simulation_table.txt
utils/reproduction_table.txt
utils/test_reproduction_scripts.o*
340 changes: 211 additions & 129 deletions docs/source/v1/WaterCycle/simulation_data/simulation_table.rst

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions utils/generate_html.bash
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
pr_num=59
pr_num=60
try_num=9

# Chrysalis
#destination_dir=/lcrc/group/e3sm/public_html/diagnostic_output/$USER/data_docs_${pr_num}
#web_page="https://web.lcrc.anl.gov/public/e3sm/diagnostic_output/$USER/data_docs_${pr_num}/html/"
# Perlmutter
destination_dir=/global/cfs/cdirs/e3sm/www/$USER/data_docs_${pr_num}
web_page="https://portal.nersc.gov/cfs/e3sm/$USER/data_docs_${pr_num}/html/"
destination_dir=/global/cfs/cdirs/e3sm/www/$USER/data_docs_${pr_num}_try${try_num}
web_page="https://portal.nersc.gov/cfs/e3sm/$USER/data_docs_${pr_num}_try${try_num}/html/"

python generate_tables.py
if [ $? != 0 ]; then
Expand Down
116 changes: 82 additions & 34 deletions utils/generate_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,24 @@
import requests
from collections import OrderedDict
from typing import Dict, List, Tuple
import urllib.parse

# Functions to compute fields for simulations ###########################################
def get_data_size_and_hpss(hpss_path: str) -> Tuple[str, str]:
"""Get the data size in TB"""
output = "out.txt"
is_symlink: bool = check_if_symlink(hpss_path)
output = "out_du.txt"
if os.path.exists(output):
os.remove(output)
try:
os.system(f'(hsi "du {hpss_path}") 2>&1 | tee {output}')
if is_symlink:
# The `/*` expands symlinks on HSI!
# This will actually work fine even if it's not a symlink,
# but we needed to check for symlinks anyway to note "(symlink)" by the HPSS path,
# so we might as well handle the cases separately here.
os.system(f'(hsi "du {hpss_path}/*") 2>&1 | tee {output}')
else:
os.system(f'(hsi "du {hpss_path}") 2>&1 | tee {output}')
except Exception as e:
print(f"hsi failed: {e}")
return ("", "")
Expand All @@ -31,37 +40,85 @@ def get_data_size_and_hpss(hpss_path: str) -> Tuple[str, str]:
data_size = int(num_bytes)/1e12
if data_size > 0:
data_size = f"{data_size:.0f}"
hpss = hpss_path
if is_symlink:
hpss = f"(symlink) {hpss_path}"
else:
hpss = hpss_path
else:
data_size = ""
hpss = ""
return (data_size, hpss)

def get_esgf(source_id: str, model_version: str, experiment: str, ensemble_num: str, link_type: str, node: str) -> str:
def check_if_symlink(hpss_path: str) -> bool:
    """Report whether HSI lists ``hpss_path`` as a symbolic link.

    Runs ``hsi "ls <hpss_path>"``, echoes the combined stdout/stderr to the
    console, saves it to ``out_symlink_check.txt`` in the current directory
    (any previous copy is removed first), and looks for the entry name
    followed by ``@`` in the listing.

    Parameters:
        hpss_path: Path on HPSS to inspect. A trailing ``/`` is ignored when
            deriving the entry name.

    Returns:
        True if some line of the listing contains ``<name>@``; False
        otherwise, including when ``hsi`` cannot be launched or when no
        entry name can be derived from ``hpss_path``.
    """
    # Local import so this function does not depend on the file's import block.
    import subprocess

    output: str = "out_symlink_check.txt"
    if os.path.exists(output):
        os.remove(output)

    # Strip trailing slashes first: os.path.basename("a/b/") is "", which
    # would reduce the marker to a bare "@" and match unrelated lines.
    name: str = os.path.basename(hpss_path.rstrip("/"))
    if not name:
        return False

    try:
        # Argument list instead of a shell string: the path is never parsed
        # by a shell, and a missing `hsi` raises OSError instead of being
        # silently swallowed by a `| tee` pipeline.
        completed = subprocess.run(
            ["hsi", f"ls {hpss_path}"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # mirror the former `2>&1`
            text=True,
            errors="replace",
            check=False,
        )
    except OSError as e:
        print(f"hsi failed: {e}")
        return False

    listing: str = completed.stdout or ""
    # Keep the former `tee` behavior: show the output and keep a copy on disk.
    print(listing, end="")
    with open(output, "w") as f:
        f.write(listing)

    # Symlinks on HSI/HPSS end in `@`. Use a literal substring test rather
    # than a regex so that characters such as `.` in the name are not
    # treated as wildcards.
    marker: str = f"{name}@"
    return any(marker in line for line in listing.splitlines())


def get_esgf(model_version: str, resolution: str, simulation_name: str, experiment: str, ensemble_num: str, link_type: str, node: str) -> str:
esgf: str
if link_type == "none":
esgf = ""
elif node == "cels.anl":
esgf = f"`CMIP <https://esgf-node.{node}.gov/search/?project=CMIP6&activeFacets=%7B%22source_id%22%3A%22{source_id}%22%2C%22experiment_id%22%3A%22{experiment}%22%2C%22variant_label%22%3A%22r{ensemble_num}i1p1f1%22%7D>`_"
elif experiment and ensemble_num:
# See https://github.com/E3SM-Project/CMIP6-Metadata/pull/9#issuecomment-1246086256 for the table of ensemble numbers
# Note that `[1:]`` removes `v` from `model_version`
esgf_native: str = f"`Native <https://esgf-node.{node}.gov/search/e3sm/?model_version={model_version[1:]}_0&experiment={experiment}&ensemble_member=ens{ensemble_num}>`_"
if experiment == 'hist-all-xGHG-xaer':
experiment_id = 'hist-nat'
elif model_version == "v1":
v1_institution_id: str
variant_suffix: str
if simulation_name.startswith("LE_"):
v1_institution_id = "UCSB"
variant_suffix = "i2p2f1"
else:
experiment_id = experiment
esgf_cmip: str = f"`CMIP <https://esgf-node.{node}.gov/search/cmip6/?source_id={source_id}&experiment_id={experiment_id}&variant_label=r{ensemble_num}i1p1f1>`_"
if link_type == "cmip":
esgf = esgf_cmip
elif link_type == "native":
esgf = esgf_native
elif link_type == "both":
esgf = esgf_cmip + ', ' + esgf_native
else:
raise ValueError(f"Invalid link_type={link_type}")
v1_institution_id = "E3SM-Project"
variant_suffix = "i1p1f1"
human_readable_active_facets: str = f'{{"institution_id":"{v1_institution_id}","source_id":"E3SM-1-0","experiment_id":"{experiment}","variant_label":"r{ensemble_num}{variant_suffix}"}}'
url_active_facets: str = urllib.parse.quote(human_readable_active_facets)
esgf = f"`CMIP <https://esgf-node.{node}.gov/search?project=CMIP6&activeFacets={url_active_facets}>`_"
else:
esgf = ""
# v2, v2.1
# Determine source_id
if (len(model_version) == 4) and (model_version[2] == "."):
source_id = f"E3SM-{model_version[1]}-{model_version[3]}"
elif (len (model_version) == 2):
if resolution == "NARRM":
source_id = f"E3SM-{model_version[1]}-0-{resolution}"
else:
source_id = f"E3SM-{model_version[1]}-0"
else:
raise RuntimeError(f"Invalid model-version={model_version}")
# Determine esgf
if node == "cels.anl": # v2.1 only
human_readable_active_facets = f'{{"source_id":"{source_id}","experiment_id":"{experiment}","variant_label":"r{ensemble_num}i1p1f1"}}'
url_active_facets: str = urllib.parse.quote(human_readable_active_facets)
esgf = f"`CMIP <https://esgf-node.{node}.gov/search/?project=CMIP6&activeFacets={url_active_facets}>`_"
elif experiment and ensemble_num:
# See https://github.com/E3SM-Project/CMIP6-Metadata/pull/9#issuecomment-1246086256 for the table of ensemble numbers
# Note that `[1:]`` removes `v` from `model_version`
esgf_native: str = f"`Native <https://esgf-node.{node}.gov/search/e3sm/?model_version={model_version[1:]}_0&experiment={experiment}&ensemble_member=ens{ensemble_num}>`_"
if experiment == 'hist-all-xGHG-xaer':
experiment_id = 'hist-nat'
else:
experiment_id = experiment
esgf_cmip: str = f"`CMIP <https://esgf-node.{node}.gov/search/cmip6/?source_id={source_id}&experiment_id={experiment_id}&variant_label=r{ensemble_num}i1p1f1>`_"
if link_type == "cmip":
esgf = esgf_cmip
elif link_type == "native":
esgf = esgf_native
elif link_type == "both":
esgf = esgf_cmip + ', ' + esgf_native
else:
raise ValueError(f"Invalid link_type={link_type}")
else:
esgf = ""
return esgf

def get_run_script_original(model_version: str, simulation_name: str) -> str:
Expand Down Expand Up @@ -114,16 +171,7 @@ def __init__(self, simulation_dict):
hpss_path = f"/home/projects/e3sm/www/{self.group}/E3SM{self.model_version}/{self.resolution}/{self.simulation_name}"
self.data_size, self.hpss = get_data_size_and_hpss(hpss_path)

if (len(self.model_version) == 4) and (self.model_version[2] == "."):
source_id = f"E3SM-{self.model_version[1]}-{self.model_version[3]}"
elif (len (self.model_version) == 2):
if self.resolution == "NARRM":
source_id = f"E3SM-{self.model_version[1]}-0-{self.resolution}"
else:
source_id = f"E3SM-{self.model_version[1]}-0"
else:
raise RuntimeError(f"Invalid model-version={self.model_version}")
self.esgf = get_esgf(source_id, self.model_version, self.experiment, self.ensemble_num, self.link_type, self.node)
self.esgf = get_esgf(self.model_version, self.resolution, self.simulation_name, self.experiment, self.ensemble_num, self.link_type, self.node)

self.run_script_original = get_run_script_original(self.model_version, self.simulation_name)
self.run_script_reproduction = get_run_script_reproduction(self.model_version, self.simulation_name)
Expand Down Expand Up @@ -286,7 +334,7 @@ def construct_pages(csv_file: str, model_version: str, group_name: str, include_
resolutions,
["Simulation", "Data Size (TB)", "ESGF Links", "HPSS Path"],
f"../docs/source/{model_version}/{group_name}/simulation_data/simulation_table.rst",
[85, 15, 400, 130]
[85, 15, 400, 140]
)
if include_reproduction_scripts:
generate_table(
Expand Down
18 changes: 18 additions & 0 deletions utils/make_symlinks.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# This will be a problem if these simulations are ever removed from the publication archives!
for i in $(seq 1 20); do
hsi ln -s /home/projects/e3sm/www/publication-archives/pub_archive_E3SM_1_0_LE_historical_ens$i /home/projects/e3sm/www/WaterCycle/E3SMv1/LR/LE_historical_ens$i
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

HSI/HPSS adds a @ to the end of its symlinks, but that may just be a visual indicator. In any case, HPSS paths and data sizes aren't being displayed on https://portal.nersc.gov/cfs/e3sm/forsyth/data_docs_60_try2/html/v1/WaterCycle/simulation_data/simulation_table.html

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some of the other data sets are showing a size of 0, but these don't show a size at all, which makes me think the path isn't being found.

That said, they do show up in my output logs:

1	/home/projects/e3sm/www/WaterCycle/E3SMv1/LR/LE_historical_ens11
-----------------------
0	total 512-byte blocks, 0 Files (0 bytes)

So, it seems to read it as an empty path. I wonder if symlinks show zero size?

This one shows up as 0 in the table:

341850452	2	/home/projects/e3sm/www/WaterCycle/E3SMv1/HR/cori-haswell.20190513.F2010LRtunedHR.plus4K.noCNT.ne30_oECv3/
-----------------------
341850452	total 512-byte blocks, 2 Files (175,027,431,424 bytes)

So, it must be because 175x10^9 bytes is basically 0 TB (0.175x10^12). Indeed, this 113x10^12 one shows up as 113:

221651622324	820	/home/projects/e3sm/www/WaterCycle/E3SMv1/HR/20211021-maint-1.0-tro.A_WCYCLSSP585_CMIP6_HR.ne120_oRRS18v3_ICG.unc12-3rd-attempt/
-----------------------
221651622324	total 512-byte blocks, 820 Files (113,485,630,629,888 bytes)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@forsyth2 Could you double-check the file size of /home/projects/e3sm/www/publication-archives/pub_archive_E3SM_1_0_LE_historical_ens$i? Hopefully there was no corruption during the zstash archive or transfer.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@chengzhuzhang It's definitely an issue with the symlinks; I'm discussing with NERSC support. The original paths are fine, e.g.,:

hsi du /home/projects/e3sm/www/publication-archives/pub_archive_E3SM_1_0_LE_historical_ens1
# 49970007900	95	/home/projects/e3sm/www/publication-archives/pub_archive_E3SM_1_0_LE_historical_ens1/
# -----------------------
# 49970007900	total 512-byte blocks, 95 Files (25,584,644,044,800 bytes)

done

for i in $(seq 1 20); do
hsi ln -s /home/projects/e3sm/www/publication-archives/pub_archive_E3SM_1_0_LE_ssp370_ens$i /home/projects/e3sm/www/WaterCycle/E3SMv1/LR/LE_ssp370_ens$i
done

# Symlink last remaining large simulation
# This will be a problem if ndk ever deletes the source!
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@chengzhuzhang I meant to include this in the self-review I just posted. The symlinks are fine as long as we are guaranteed that people don't delete the source directories like /home/projects/e3sm/www/publication-archives/ or /home/n/ndk/2019/theta.20190910.branch_noCNT.n825def.unc06.A_WCYCL1950S_CMIP6_HR.ne120_oRRS18v3_ICG. Is that something we can be sure of?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so. Tagging the directory owners @TonyB9000 and @ndkeen: please make sure not to delete the above directories.

hsi ln -s /home/n/ndk/2019/theta.20190910.branch_noCNT.n825def.unc06.A_WCYCL1950S_CMIP6_HR.ne120_oRRS18v3_ICG /home/projects/e3sm/www/WaterCycle/E3SMv1/LR/theta.20190910.branch_noCNT.n825def.unc06.A_WCYCL1950S_CMIP6_HR.ne120_oRRS18v3_ICG

# Note:
# It seems impossible to do a recursive remove with HSI/on HPSS.
# > rm -rf E3SM_1_0_LE_historical_ens1@ # Trying to remove mislabeled directory
# Unknown option or missing argument: 'r' ignored
# Unknown option or missing argument: 'f' ignored
7 changes: 7 additions & 0 deletions utils/print_ensemble_rows.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Print one comma-separated row per ensemble member (1 through 20) for the
# v1 WaterCycle LR large-ensemble simulations: first the historical runs,
# then the ssp370 runs.
# NOTE(review): presumably these rows are meant to be pasted into the
# simulations CSV consumed by generate_tables.py -- confirm the column order.

# Historical ensemble members.
i=1
while [ "$i" -le 20 ]; do
    printf '%s\n' "v1, WaterCycle, LR, LargeEnsemble, LE_historical_ens$i, , , historical-large-ensemble, $i, none, ,"
    i=$((i + 1))
done

# ssp370 ensemble members.
i=1
while [ "$i" -le 20 ]; do
    printf '%s\n' "v1, WaterCycle, LR, LargeEnsemble, LE_ssp370_ens$i, , , ssp370-large-ensemble, $i, none, ,"
    i=$((i + 1))
done
Loading