Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions src/electrai/data/MP/s3_download/bulk_download/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Dataset

The volumetric data files (CHGCAR, AECCAR0, AECCAR2, ELFCAR) and DOSCAR were downloaded from the Materials Project on 10-22-2025 (database version v2025.09.25)

## Data Generation Steps

To regenerate this data:

0. In a fresh directory, do the following:
1. Run `fetch_deprecated_task_ids.py` and `map_material_to_task_ids.py`
2. Run each of the `download_*.sh` files. This can be done concurrently. It is best to send each one to the background with `nohup ./<script.sh> &`
3. Run `download_task_docs.py`
4. Run `remove_deprecated_data.py`
5. Run `remove_missing_task_docs.py`
6. Run `remove_lcharg_false_chgcars.py`
7. Run `remove_nscf_task_ids.py`
8. Run `remove_missing_task_to_material_mapping_chgcars.py`
9. Run `functional_to_task_ids.py`

## Trash

The `trash` folder contains all invalid volumetric data files.

## Metadata

The `metadata` folder contains information about the dataset that is used for filtering:

- `deprecated_material_ids.csv`: A list of all Material IDs that are deprecated according to the Materials Project.
- `deprecated_task_ids.csv`: A list of all Task IDs associated with deprecated Material IDs.
- `material_id_to_task_ids.json.gz`: A mapping of Material IDs to Task IDs.
- `task_id_to_material_id.json.gz`: A mapping of Task IDs to Material IDs.
- `chgcars_functional_to_task_ids.json.gz`: A mapping from functional (run type) to the Task IDs of the downloaded CHGCAR files.
- `elfcars_functional_to_task_ids.json.gz`: A mapping from functional (run type) to the Task IDs of the downloaded ELFCAR files.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Bulk-download all parsed CHGCAR files from the public Materials Project
# S3 bucket (no AWS credentials needed) into ./chgcars.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/chgcars chgcars
# Remove hidden bookkeeping files left by the sync. A bare `.*` glob also
# matches the `.` and `..` directory entries, making rm print errors and
# exit nonzero; `.[!.]*` and `..?*` together match only real dotfiles,
# and -f keeps the script quiet when none exist.
rm -f chgcars/.[!.]* chgcars/..?*
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Bulk-download all parsed ELFCAR files from the public Materials Project
# S3 bucket (no AWS credentials needed) into ./elfcars.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/elfcars elfcars
# Remove hidden bookkeeping files left by the sync. A bare `.*` glob also
# matches the `.` and `..` directory entries, making rm print errors and
# exit nonzero; `.[!.]*` and `..?*` together match only real dotfiles,
# and -f keeps the script quiet when none exist.
rm -f elfcars/.[!.]* elfcars/..?*
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Downloads task documents corresponding to a list of task IDs
"""

from __future__ import annotations

import os

from emmet.core.utils import jsanitize
from monty.serialization import dumpfn
from mp_api.client.routes.materials.tasks import TaskRester

# Find every task ID that has a downloaded volumetric file (ELFCAR or
# CHGCAR) but no task document yet, then fetch the missing documents
# from the Materials Project in batches.
os.makedirs("task_docs", exist_ok=True)

have_volumetric = {
    name.split(".")[0] for name in os.listdir("elfcars") + os.listdir("chgcars")
}
already_fetched = {name.split(".")[0] for name in os.listdir("task_docs")}
task_ids = list(have_volumetric - already_fetched)

BATCH_SIZE = 10000
with TaskRester() as tpr:
    for start in range(0, len(task_ids), BATCH_SIZE):
        # One gzipped JSON document per task, named after its task_id,
        # so already-written docs are skipped on the next run.
        for doc in tpr.search(task_ids[start : start + BATCH_SIZE]):
            dumpfn(jsanitize(doc), f"task_docs/{doc.task_id}.json.gz")
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Writes out a CSV file of all deprecated material IDs and task IDs
"""

from __future__ import annotations

import csv
import os

from mp_api.client import MPRester

# Pull every deprecated material from the Materials Project along with
# the task IDs that produced it.
with MPRester() as mpr:
    docs = mpr.materials.summary.search(
        deprecated=True, fields=["material_id", "task_ids"]
    )

deprecated_task_ids = []
deprecated_material_ids = []
for doc in docs:
    deprecated_material_ids.append(doc.material_id)
    deprecated_task_ids.extend(str(tid) for tid in doc.task_ids)
deprecated_task_ids.sort()
deprecated_material_ids.sort()

# Each CSV stores all IDs on one row; the downstream filtering scripts
# read the file row-by-row and flatten, so this format round-trips.
os.makedirs("metadata", exist_ok=True)
with open("metadata/deprecated_task_ids.csv", "w") as f:
    csv.writer(f).writerow(deprecated_task_ids)

with open("metadata/deprecated_material_ids.csv", "w") as f:
    csv.writer(f).writerow(deprecated_material_ids)
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from __future__ import annotations

import os
import sys
from collections import defaultdict
from multiprocessing import Pool, cpu_count

from monty.serialization import dumpfn, loadfn


def get_run_type(task_id):
    """Return ``(task_id, run_type)`` for a task document on disk.

    The run type (functional) is read from the first entry of
    ``calcs_reversed`` (the most recent calculation). The task_id is
    echoed back so results from ``imap_unordered`` can be matched to
    their inputs.
    """
    document = loadfn(f"task_docs/{task_id}.json.gz")
    return task_id, document["calcs_reversed"][0]["run_type"]


if __name__ == "__main__":
    # Folder of volumetric files ("chgcars" or "elfcars") whose task IDs
    # should be grouped by functional.
    folder = sys.argv[1]
    task_ids = [name.split(".")[0] for name in os.listdir(folder)]

    nproc = cpu_count()  # min(8, cpu_count())
    print(f"Using {nproc} parallel workers...")

    functional_to_task = defaultdict(list)

    # Classify tasks in parallel; order does not matter since results
    # are grouped into a dict keyed by run type.
    with Pool(nproc) as pool:
        results = pool.imap_unordered(get_run_type, task_ids)
        for counter, (task_id, run_type) in enumerate(results, 1):
            print(counter)
            functional_to_task[run_type].append(task_id)

    dumpfn(functional_to_task, f"metadata/{folder}_functional_to_task_ids.json.gz")
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Maps material_id to task_id and vice versa
"""

from __future__ import annotations

import os

from monty.serialization import dumpfn
from mp_api.client import MPRester
from tqdm import tqdm

# Build bidirectional lookups between material IDs and task IDs for all
# non-deprecated materials in the Materials Project.
task_to_mat_id_map = {}
mat_to_task_id_map = {}


with MPRester() as mpr:
    docs = mpr.materials.summary.search(
        deprecated=False, fields=["material_id", "task_ids"]
    )

for doc in tqdm(docs):
    material_id = str(doc.material_id)
    task_ids = [str(tid) for tid in doc.task_ids]
    mat_to_task_id_map[material_id] = task_ids
    for tid in task_ids:
        task_to_mat_id_map[tid] = material_id

os.makedirs("metadata", exist_ok=True)
dumpfn(task_to_mat_id_map, "metadata/task_id_to_material_id.json.gz")
dumpfn(mat_to_task_id_map, "metadata/material_id_to_task_ids.json.gz")
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""
Filter out all CHGCARs associated with a deprecated ID
"""

from __future__ import annotations

import csv
import os
from shutil import move

from tqdm import tqdm

# Load the deprecated task IDs (the CSV stores them as rows; typically a
# single row holding every ID, flattened here).
task_ids = []
with open("metadata/deprecated_task_ids.csv") as f:
    for row in csv.reader(f):
        task_ids.extend(row)

# Create the trash destinations once, up front — the original code
# re-ran makedirs on every iteration of the task loop.
folders = ("elfcars", "chgcars")
for folder in folders:
    os.makedirs(f"trash/deprecated/{folder}", exist_ok=True)

# Move any volumetric file belonging to a deprecated task into trash.
for task_id in tqdm(task_ids):
    for folder in folders:
        src = f"{folder}/{task_id}.json.gz"
        if os.path.exists(src):
            move(src, f"trash/deprecated/{folder}/{task_id}.json.gz")
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""
Filter out all CHGCARs used as inputs and not written out as outputs
"""

from __future__ import annotations

import os
from shutil import move

from monty.serialization import loadfn
from tqdm import tqdm

# Task IDs of every downloaded CHGCAR.
task_ids = [name.split(".")[0] for name in os.listdir("chgcars")]

# Collect CHGCARs whose calculation did not request charge-density
# output (INCAR LCHARG absent or exactly False): those files were run
# inputs rather than outputs. Tasks without a task document are left
# alone — a separate script handles them.
files_to_remove = []
for task_id in tqdm(task_ids):
    doc_path = f"task_docs/{task_id}.json.gz"
    if not os.path.exists(doc_path):
        continue
    incar = loadfn(doc_path)["input"]["incar"]
    if incar.get("LCHARG", False) is False:
        files_to_remove.append(task_id)

os.makedirs("trash/lcharg_false/chgcars", exist_ok=True)
for task_id in files_to_remove:
    move(
        f"chgcars/{task_id}.json.gz",
        f"trash/lcharg_false/chgcars/{task_id}.json.gz",
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""
Filter out all volumetric files missing task data.
"""

from __future__ import annotations

import os
from shutil import move

from tqdm import tqdm

# Move any volumetric file whose task document was never fetched into
# trash: without a task doc it cannot be validated downstream.
with_docs = {name.split(".")[0] for name in os.listdir("task_docs")}
for folder in ("elfcars", "chgcars"):
    trash_dir = f"trash/missing_task_data/{folder}"
    os.makedirs(trash_dir, exist_ok=True)
    present = {name.split(".")[0] for name in os.listdir(folder)}
    for task_id in tqdm(list(present - with_docs)):
        move(f"{folder}/{task_id}.json.gz", f"{trash_dir}/{task_id}.json.gz")
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Filter out all task IDs within chgcars with no associated material ID.
"""

from __future__ import annotations

import gzip
import json
import os
from shutil import move

from tqdm import tqdm

# Move volumetric files whose task ID has no associated material ID into
# trash — they cannot be traced back to a material.

# Load the task->material mapping once; it is identical for both folders
# (the original reloaded it on every loop iteration).
with gzip.open("metadata/task_id_to_material_id.json.gz", "rt") as file:
    task_to_material = json.load(file)

for folder in ("elfcars", "chgcars"):
    # The trash destination must match the folder being processed and
    # tolerate pre-existing directories. The original hardcoded
    # "chgcars" (so elfcars moves raised FileNotFoundError) and omitted
    # exist_ok=True (so the second iteration raised FileExistsError).
    os.makedirs(f"trash/missing_task_material_mapping/{folder}", exist_ok=True)

    task_ids = [name.split(".")[0] for name in os.listdir(folder)]
    unmapped = [tid for tid in tqdm(task_ids) if tid not in task_to_material]

    for task_id in unmapped:
        move(
            f"{folder}/{task_id}.json.gz",
            f"trash/missing_task_material_mapping/{folder}/{task_id}.json.gz",
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
Filter out all CHGCARs written out during an NSCF run.
"""

from __future__ import annotations

import os
from shutil import move

from monty.serialization import loadfn
from tqdm import tqdm

folders = ("elfcars", "chgcars")

# Union of task IDs across both volumetric folders.
task_id_set = set()
for folder in folders:
    task_id_set.update(name.split(".")[0] for name in os.listdir(folder))
task_ids = list(task_id_set)

# ICHARG >= 10 marks a non-self-consistent (NSCF) run: the charge
# density was a fixed input, so the written volumetric data is not a
# converged output. Tasks lacking a task document are skipped here.
files_to_remove = []
for task_id in tqdm(task_ids):
    doc_path = f"task_docs/{task_id}.json.gz"
    if not os.path.exists(doc_path):
        continue
    incar = loadfn(doc_path)["input"]["incar"]
    if incar.get("ICHARG", 0) >= 10:
        files_to_remove.append(task_id)

# A flagged task may have a file in either folder; move whichever exist.
for file_type in folders:
    os.makedirs(f"trash/nscf/{file_type}", exist_ok=True)
    for task_id in files_to_remove:
        src = f"{file_type}/{task_id}.json.gz"
        if os.path.exists(src):
            move(src, f"trash/nscf/{file_type}/{task_id}.json.gz")