Skip to content

Commit d022ffd

Browse files
Add bulk download scripts
1 parent 95b26a7 commit d022ffd

15 files changed

+302
-0
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Dataset
2+
3+
The volumetric data files (CHGCAR, AECCAR0, AECCAR2, ELFCAR) and DOSCAR were downloaded from the Materials Project on 10-22-2025 (database version v2025.09.25)
4+
5+
## Data Generation Steps
6+
7+
To regenerate this data:
8+
9+
0. In a fresh directory, do the following:
10+
1. Run `fetch_deprecated_task_ids.py` and `map_material_to_task_ids.py`
11+
2. Run each of the `download_*.sh` files. This can be done concurrently. It is best to send each one to the background with `nohup <script.sh> &`
12+
3. Run `download_task_docs.py`
13+
4. Run `remove_deprecated_data.py`
14+
5. Run `remove_missing_task_docs.py`
15+
6. Run `remove_lcharg_false_chgcars.py`
16+
7. Run `remove_nscf_task_ids.py`
17+
8. Run `remove_missing_task_to_material_mapping_chgcars.py`
18+
9. Run `functional_to_task_ids.py`
19+
20+
## Trash
21+
22+
The `trash` folder contains all invalid volumetric data files.
23+
24+
## Metadata
25+
26+
The `metadata` folder contains information about the dataset that is used for filtering:
27+
28+
- `deprecated_material_ids.csv`: A list of all Material IDs that are deprecated according to the Materials Project.
29+
- `deprecated_task_ids.csv`: A list of all Task IDs associated with deprecated Material IDs.
30+
- `material_id_to_task_ids.json.gz`: A mapping of Material IDs to Task IDs.
31+
- `task_id_to_material_id.json.gz`: A mapping of Task IDs to Material IDs.
32+
- `chgcars_functional_to_task_ids.json.gz`: A mapping of functional to chgcars Task IDs.
33+
- `elfcars_functional_to_task_ids.json.gz`: A mapping of functional to elfcars Task IDs.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Download all AECCAR0 volumetric files from the Materials Project public S3 bucket.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/aeccar0s aeccar0s
# Remove hidden (dot-prefixed) files left in the download directory.
# The original `rm aeccar0s/.*` also expands to `.` and `..`, which rm refuses
# with an error; these two globs match only real dotfiles, and -f keeps the
# script quiet when there are none.
rm -f aeccar0s/.[!.]* aeccar0s/..?*
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Download all AECCAR2 volumetric files from the Materials Project public S3 bucket.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/aeccar2s aeccar2s
# Remove hidden (dot-prefixed) files left in the download directory.
# The original `rm aeccar2s/.*` also expands to `.` and `..`, which rm refuses
# with an error; these two globs match only real dotfiles, and -f keeps the
# script quiet when there are none.
rm -f aeccar2s/.[!.]* aeccar2s/..?*
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Download all CHGCAR volumetric files from the Materials Project public S3 bucket.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/chgcars chgcars
# Remove hidden (dot-prefixed) files left in the download directory.
# The original `rm chgcars/.*` also expands to `.` and `..`, which rm refuses
# with an error; these two globs match only real dotfiles, and -f keeps the
# script quiet when there are none.
rm -f chgcars/.[!.]* chgcars/..?*
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Download all DOSCAR (density of states) files from the Materials Project public S3 bucket.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/dos dos
# Remove hidden (dot-prefixed) files left in the download directory.
# The original `rm dos/.*` also expands to `.` and `..`, which rm refuses
# with an error; these two globs match only real dotfiles, and -f keeps the
# script quiet when there are none.
rm -f dos/.[!.]* dos/..?*
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Download all ELFCAR volumetric files from the Materials Project public S3 bucket.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/elfcars elfcars
# Remove hidden (dot-prefixed) files left in the download directory.
# The original `rm elfcars/.*` also expands to `.` and `..`, which rm refuses
# with an error; these two globs match only real dotfiles, and -f keeps the
# script quiet when there are none.
rm -f elfcars/.[!.]* elfcars/..?*
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
"""
2+
Downloads task documents corresponding to a list of task IDs
3+
"""
4+
5+
import os
6+
7+
from emmet.core.utils import jsanitize
8+
from monty.serialization import dumpfn
9+
from mp_api.client.routes.materials.tasks import TaskRester
10+
11+
os.makedirs("task_docs", exist_ok=True)
12+
task_ids_to_fetch = [
13+
f.split(".")[0]
14+
for f in os.listdir("elfcars")
15+
+ os.listdir("chgcars")
16+
+ os.listdir("aeccar0s")
17+
+ os.listdir("aeccar2s")
18+
]
19+
fetched_task_ids = [f.split(".")[0] for f in os.listdir("task_docs")]
20+
task_ids = list(set(task_ids_to_fetch) - set(fetched_task_ids))
21+
batch_size = 10000
22+
with TaskRester() as tpr:
23+
for i in range(0, len(task_ids), batch_size):
24+
docs = tpr.search(task_ids[i : i + batch_size])
25+
for doc in docs:
26+
dumpfn(jsanitize(doc), f"task_docs/{doc.task_id}.json.gz")
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""
2+
Writes out a CSV file of all deprecated material IDs and task IDs
3+
"""
4+
5+
import csv
6+
import os
7+
8+
from mp_api.client import MPRester
9+
10+
with MPRester() as mpr:
11+
docs = mpr.materials.summary.search(
12+
deprecated=True, fields=["material_id", "task_ids"]
13+
)
14+
15+
deprecated_task_ids = []
16+
deprecated_material_ids = []
17+
for doc in docs:
18+
deprecated_task_ids.extend([str(task_id) for task_id in doc.task_ids])
19+
deprecated_material_ids.append(doc.material_id)
20+
deprecated_task_ids.sort()
21+
deprecated_material_ids.sort()
22+
23+
os.makedirs("metadata", exist_ok=True)
24+
with open("metadata/deprecated_task_ids.csv", "w") as f:
25+
writer = csv.writer(f)
26+
writer.writerow(deprecated_task_ids)
27+
28+
with open("metadata/deprecated_material_ids.csv", "w") as f:
29+
writer = csv.writer(f)
30+
writer.writerow(deprecated_material_ids)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import os
2+
import sys
3+
from collections import defaultdict
4+
from monty.serialization import loadfn, dumpfn
5+
from multiprocessing import Pool, cpu_count
6+
7+
def get_run_type(task_id):
    """Return ``(task_id, run_type)`` for a task document stored on disk.

    The run type (DFT functional label) is read from the most recent
    calculation, ``calcs_reversed[0]``, of ``task_docs/<task_id>.json.gz``.
    """
    task_doc = loadfn(f"task_docs/{task_id}.json.gz")
    return task_id, task_doc["calcs_reversed"][0]["run_type"]
11+
12+
if __name__ == "__main__":
13+
folder = sys.argv[1]
14+
task_ids = [f.split(".")[0] for f in os.listdir(folder)]
15+
16+
nproc = cpu_count() #min(8, cpu_count())
17+
print(f"Using {nproc} parallel workers...")
18+
19+
functional_to_task = defaultdict(list)
20+
21+
with Pool(nproc) as pool:
22+
for counter, (task_id, run_type) in enumerate(pool.imap_unordered(get_run_type, task_ids), 1):
23+
print(counter)
24+
functional_to_task[run_type].append(task_id)
25+
26+
dumpfn(functional_to_task, f"metadata/{folder}_functional_to_task_ids.json.gz")
27+
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""
2+
Maps material_id to task_id and vice versa
3+
"""
4+
5+
import os
6+
7+
from monty.serialization import dumpfn
8+
from mp_api.client import MPRester
9+
from tqdm import tqdm
10+
11+
task_to_mat_id_map = {}
12+
mat_to_task_id_map = {}
13+
14+
15+
with MPRester() as mpr:
16+
docs = mpr.materials.summary.search(
17+
deprecated=False, fields=["material_id", "task_ids"]
18+
)
19+
20+
for doc in tqdm(docs):
21+
mp_id = str(doc.material_id)
22+
mat_to_task_id_map[mp_id] = []
23+
for task_id in doc.task_ids:
24+
task_id = str(task_id)
25+
task_to_mat_id_map[task_id] = mp_id
26+
mat_to_task_id_map[mp_id].append(task_id)
27+
28+
os.makedirs("metadata", exist_ok=True)
29+
dumpfn(task_to_mat_id_map, "metadata/task_id_to_material_id.json.gz")
30+
dumpfn(mat_to_task_id_map, "metadata/material_id_to_task_ids.json.gz")

0 commit comments

Comments
 (0)