Skip to content

Commit f0499ba

Browse files
committed
Add more columns to data-finding table
1 parent 22e7100 commit f0499ba

File tree

1 file changed

+73
-47
lines changed

1 file changed

+73
-47
lines changed

utils/find_data.py

Lines changed: 73 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,44 @@
55

66
# Run with `time python find_data.py > out_find_data.txt`
77

8+
# Standard HPSS `/home/projects/e3sm/www/WaterCycle/E3SMv1/` paths from
# https://docs.e3sm.org/e3sm_data_docs/_build/html/v1/WaterCycle/simulation_data/simulation_table.html
CENTRALIZED_PREFIXES: List[str] = [
    "/home/projects/e3sm/www/WaterCycle/E3SMv1/LR/",
    "/home/projects/e3sm/www/WaterCycle/E3SMv1/HR/",
]
# User-specific paths from
# https://e3sm.atlassian.net/wiki/spaces/ED/pages/4495441922/V1+Simulation+backfill+WIP
#
# NOTE: every entry needs its own trailing comma. Adjacent string literals
# are concatenated by Python, so a missing comma silently merges two prefixes
# into a single path that matches nothing.
USER_PREFIXES: List[str] = [
    "/home/b/beharrop/E3SM_simulations/",
    "/home/c/chengzhu/",
    "/home/d/dcomeau/cryosphere_simulations/",
    "/home/g/golaz/2018/E3SM_simulations/",
    "/home/g/golaz/2018/E3SM_simulations/repaired/",
    "/home/g/golaz/2019/E3SM_simulations/",
    "/home/g/golaz/2019/E3SM_simulations/repaired/",
    "/home/j/jinyun/CBGCv1/",
    "/home/j/jonbob/",
    "/home/m/maltrud/E3SM/",
    "/home/n/ndk/2019/",
    "/home/n/ndk/2020/",
    "/home/n/ndk/2021/",
    "/home/projects/m3412/",
    "/home/s/shix/E3SM/",
    "/home/t/tang30/2018/E3SM_simulations/",
    "/home/t/tang30/2018/E3SM_simulations/repaired/",
    "/home/t/tang30/2019/E3SM_simulations/",
    "/home/t/tang30/2019/E3SM_simulations/repaired/",
    "/home/q/qzhu/CBGCv1/",
    "/home/z/zshaheen/2018/E3SM_simulations/",
    "/home/z/zshaheen/2018/E3SM_simulations/repaired/",
]
# Publication paths `/home/projects/e3sm/www/publication-archives/` noted in
# https://github.com/E3SM-Project/e3sm_data_docs/pull/59#issuecomment-3063668732
# This entry has no trailing slash: it is a directory-name prefix, not a
# directory.
PUBLICATION_PREFIXES: List[str] = [
    "/home/projects/e3sm/www/publication-archives/pub_archive_E3SM_1_0_",
]
45+
846
class DataPathsContainer(object):
947
def __init__(self, simulation_name: str):
1048
self.simulation_name: str = simulation_name
@@ -39,55 +77,15 @@ def read_simulations(csv_file: str) -> List[DataPathsContainer]:
3977
return containers
4078

4179
def search_for_all_appearances(containers: List[DataPathsContainer]) -> None:
    """Try every known HPSS prefix for every simulation container.

    Each container is passed to `add_hpss_paths` once per prefix. Prefixes
    are tried in the order CENTRALIZED_PREFIXES, USER_PREFIXES,
    PUBLICATION_PREFIXES. A one-line summary of the number of
    prefix/simulation combinations is printed before the search starts.

    Args:
        containers: The simulations to search for.
    """
    # One combined list replaces three identical loops; the per-container
    # call order (centralized, then user, then publication) is unchanged.
    all_prefixes: List[str] = [
        *CENTRALIZED_PREFIXES,
        *USER_PREFIXES,
        *PUBLICATION_PREFIXES,
    ]
    num_prefixes: int = len(all_prefixes)
    num_simulations: int = len(containers)
    print(f"Trying {num_prefixes} paths for {num_simulations} simulations ({num_prefixes * num_simulations} combinations)")
    for container in containers:
        for prefix in all_prefixes:
            add_hpss_paths(container, prefix)
9290

9391
def add_hpss_paths(container: DataPathsContainer, hpss_prefix: str):
@@ -114,13 +112,41 @@ def construct_markdown_table(containers: List[DataPathsContainer]):
114112
output_file: str = "out_simulation_paths.md"
115113
with open(output_file, "w") as f:
116114
f.write("# Data Paths\n\n")
117-
f.write("| Simulation name | Real paths | Symlink paths | Num hard copies | Num total copies |\n")
118-
f.write("| --- | --- | --- | --- | --- |\n")
115+
f.write("| Simulation name | Physically present in this many places | Accessible from this many places | Physically in publication archives? | Accessible from centralized location? | Real paths | Symlink paths |\n")
116+
f.write("| --- | --- | --- | --- | --- | --- | --- |\n")
119117
for container in containers:
118+
physically_in_publication_archive: bool = False
119+
accessible_from_central_location: bool = False
120120
symlinks: List[str] = []
121121
for (src, dst) in container.symlink_paths:
122122
symlinks.append(f"{src} -> {dst}")
123-
f.write(f"| {container.simulation_name} | {container.real_paths} | {symlinks} | {container.get_num_hard_copies()} | {container.get_num_total_copies()} | \n")
123+
if not physically_in_publication_archive:
124+
physically_in_publication_archive = check_for_prefix(dst, PUBLICATION_PREFIXES)
125+
if not accessible_from_central_location:
126+
accessible_from_central_location = check_for_prefix(src, CENTRALIZED_PREFIXES)
127+
if not physically_in_publication_archive:
128+
for path in container.real_paths:
129+
physically_in_publication_archive = check_for_prefix(path, PUBLICATION_PREFIXES)
130+
if physically_in_publication_archive:
131+
break
132+
if not accessible_from_central_location:
133+
for path in container.real_paths:
134+
accessible_from_central_location = check_for_prefix(path, CENTRALIZED_PREFIXES)
135+
if accessible_from_central_location:
136+
break
137+
f.write(f"| {container.simulation_name} | {container.get_num_hard_copies()} | {container.get_num_total_copies()} | {bool2str(physically_in_publication_archive)} | {bool2str(accessible_from_central_location)} | {container.real_paths} | {symlinks} |\n")
138+
139+
def check_for_prefix(path: str, prefix_list: List[str]) -> bool:
    """Return True if *path* starts with at least one prefix in *prefix_list*.

    Args:
        path: The path to test.
        prefix_list: Candidate prefixes; an empty list never matches.

    Returns:
        True when any prefix matches the start of *path*, otherwise False.
    """
    # str.startswith accepts a tuple of candidates and returns False for an
    # empty tuple, which matches the behavior of looping over the list.
    return path.startswith(tuple(prefix_list))
144+
145+
def bool2str(val: bool) -> str:
    """Render a boolean as table-cell text.

    Returns:
        "Yes" when *val* is truthy, otherwise the empty string.
    """
    return "Yes" if val else ""
124150

125151
if __name__ == "__main__":
126152
containers: List[DataPathsContainer] = read_simulations("input/simulations_v1_water_cycle.csv")

0 commit comments

Comments
 (0)