Skip to content

Commit d022ffd

Browse files
Add bulk download scripts
1 parent 95b26a7 commit d022ffd

15 files changed

+302
-0
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Dataset
2+
3+
The volumetric data files (CHGCAR, AECCAR0, AECCAR2, ELFCAR) and DOSCAR were downloaded from the Materials Project on 10-22-2025 (database version v2025.09.25)
4+
5+
## Data Generation Steps
6+
7+
To regenerate this data:
8+
9+
0. In a fresh directory, do the following:
10+
1. Run `fetch_deprecated_task_ids.py` and `map_material_to_task_ids.py`
11+
2. Run each of the `download_*.sh` files. This can be done concurrently. It is best to send each one to the background with `nohup <script.sh> &`
12+
3. Run `download_task_docs.py`
13+
4. Run `remove_deprecated_data.py`
14+
5. Run `remove_missing_task_docs.py`
15+
6. Run `remove_lcharg_false_chgcars.py`
16+
7. Run `remove_nscf_task_ids.py`
17+
8. Run `remove_missing_task_to_material_mapping_chgcars.py`
18+
9. Run `functional_to_task_ids.py`
19+
20+
## Trash
21+
22+
The `trash` folder contains all invalid volumetric data files.
23+
24+
## Metadata
25+
26+
The `metadata` folder contains information about the dataset that is used for filtering:
27+
28+
- `deprecated_material_ids.csv`: A list of all Material IDs that are deprecated according to the Materials Project.
29+
- `deprecated_task_ids.csv`: A list of all Task IDs associated with deprecated Material IDs.
30+
- `material_id_to_task_ids.json.gz`: A mapping of Material IDs to Task IDs.
31+
- `task_id_to_material_id.json.gz`: A mapping of Task IDs to Material IDs.
32+
- `chgcars_functional_to_task_ids.json.gz`: A mapping of functional to chgcars Task IDs.
33+
- `elfcars_functional_to_task_ids.json.gz`: A mapping of functional to elfcars Task IDs.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Download all AECCAR0 volumetric files from the Materials Project public S3 bucket.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/aeccar0s aeccar0s
# Remove hidden (dot-prefixed) files left in the download directory.
# The original `rm aeccar0s/.*` also expands to `.` and `..`, which rm refuses
# with an error; these two globs match only real dotfiles, and -f keeps the
# script quiet when there are none.
rm -f aeccar0s/.[!.]* aeccar0s/..?*
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Download all AECCAR2 volumetric files from the Materials Project public S3 bucket.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/aeccar2s aeccar2s
# Remove hidden (dot-prefixed) files left in the download directory.
# The original `rm aeccar2s/.*` also expands to `.` and `..`, which rm refuses
# with an error; these two globs match only real dotfiles, and -f keeps the
# script quiet when there are none.
rm -f aeccar2s/.[!.]* aeccar2s/..?*
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Download all CHGCAR volumetric files from the Materials Project public S3 bucket.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/chgcars chgcars
# Remove hidden (dot-prefixed) files left in the download directory.
# The original `rm chgcars/.*` also expands to `.` and `..`, which rm refuses
# with an error; these two globs match only real dotfiles, and -f keeps the
# script quiet when there are none.
rm -f chgcars/.[!.]* chgcars/..?*
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Download all DOSCAR (density of states) files from the Materials Project public S3 bucket.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/dos dos
# Remove hidden (dot-prefixed) files left in the download directory.
# The original `rm dos/.*` also expands to `.` and `..`, which rm refuses
# with an error; these two globs match only real dotfiles, and -f keeps the
# script quiet when there are none.
rm -f dos/.[!.]* dos/..?*
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Download all ELFCAR volumetric files from the Materials Project public S3 bucket.
aws s3 cp --no-sign-request --recursive s3://materialsproject-parsed/elfcars elfcars
# Remove hidden (dot-prefixed) files left in the download directory.
# The original `rm elfcars/.*` also expands to `.` and `..`, which rm refuses
# with an error; these two globs match only real dotfiles, and -f keeps the
# script quiet when there are none.
rm -f elfcars/.[!.]* elfcars/..?*
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
"""
2+
Downloads task documents corresponding to a list of task IDs
3+
"""
4+
5+
import os
6+
7+
from emmet.core.utils import jsanitize
8+
from monty.serialization import dumpfn
9+
from mp_api.client.routes.materials.tasks import TaskRester
10+
11+
os.makedirs("task_docs", exist_ok=True)
12+
task_ids_to_fetch = [
13+
f.split(".")[0]
14+
for f in os.listdir("elfcars")
15+
+ os.listdir("chgcars")
16+
+ os.listdir("aeccar0s")
17+
+ os.listdir("aeccar2s")
18+
]
19+
fetched_task_ids = [f.split(".")[0] for f in os.listdir("task_docs")]
20+
task_ids = list(set(task_ids_to_fetch) - set(fetched_task_ids))
21+
batch_size = 10000
22+
with TaskRester() as tpr:
23+
for i in range(0, len(task_ids), batch_size):
24+
docs = tpr.search(task_ids[i : i + batch_size])
25+
for doc in docs:
26+
dumpfn(jsanitize(doc), f"task_docs/{doc.task_id}.json.gz")
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""
2+
Writes out a CSV file of all deprecated material IDs and task IDs
3+
"""
4+
5+
import csv
6+
import os
7+
8+
from mp_api.client import MPRester
9+
10+
with MPRester() as mpr:
11+
docs = mpr.materials.summary.search(
12+
deprecated=True, fields=["material_id", "task_ids"]
13+
)
14+
15+
deprecated_task_ids = []
16+
deprecated_material_ids = []
17+
for doc in docs:
18+
deprecated_task_ids.extend([str(task_id) for task_id in doc.task_ids])
19+
deprecated_material_ids.append(doc.material_id)
20+
deprecated_task_ids.sort()
21+
deprecated_material_ids.sort()
22+
23+
os.makedirs("metadata", exist_ok=True)
24+
with open("metadata/deprecated_task_ids.csv", "w") as f:
25+
writer = csv.writer(f)
26+
writer.writerow(deprecated_task_ids)
27+
28+
with open("metadata/deprecated_material_ids.csv", "w") as f:
29+
writer = csv.writer(f)
30+
writer.writerow(deprecated_material_ids)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import os
2+
import sys
3+
from collections import defaultdict
4+
from monty.serialization import loadfn, dumpfn
5+
from multiprocessing import Pool, cpu_count
6+
7+
def get_run_type(task_id):
    """Return ``(task_id, run_type)`` for a task document stored on disk.

    The run type (DFT functional label) is read from the most recent
    calculation, ``calcs_reversed[0]``, of ``task_docs/<task_id>.json.gz``.
    """
    task_doc = loadfn(f"task_docs/{task_id}.json.gz")
    return task_id, task_doc["calcs_reversed"][0]["run_type"]
11+
12+
if __name__ == "__main__":
13+
folder = sys.argv[1]
14+
task_ids = [f.split(".")[0] for f in os.listdir(folder)]
15+
16+
nproc = cpu_count() #min(8, cpu_count())
17+
print(f"Using {nproc} parallel workers...")
18+
19+
functional_to_task = defaultdict(list)
20+
21+
with Pool(nproc) as pool:
22+
for counter, (task_id, run_type) in enumerate(pool.imap_unordered(get_run_type, task_ids), 1):
23+
print(counter)
24+
functional_to_task[run_type].append(task_id)
25+
26+
dumpfn(functional_to_task, f"metadata/{folder}_functional_to_task_ids.json.gz")
27+
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""
2+
Maps material_id to task_id and vice versa
3+
"""
4+
5+
import os
6+
7+
from monty.serialization import dumpfn
8+
from mp_api.client import MPRester
9+
from tqdm import tqdm
10+
11+
task_to_mat_id_map = {}
12+
mat_to_task_id_map = {}
13+
14+
15+
with MPRester() as mpr:
16+
docs = mpr.materials.summary.search(
17+
deprecated=False, fields=["material_id", "task_ids"]
18+
)
19+
20+
for doc in tqdm(docs):
21+
mp_id = str(doc.material_id)
22+
mat_to_task_id_map[mp_id] = []
23+
for task_id in doc.task_ids:
24+
task_id = str(task_id)
25+
task_to_mat_id_map[task_id] = mp_id
26+
mat_to_task_id_map[mp_id].append(task_id)
27+
28+
os.makedirs("metadata", exist_ok=True)
29+
dumpfn(task_to_mat_id_map, "metadata/task_id_to_material_id.json.gz")
30+
dumpfn(mat_to_task_id_map, "metadata/material_id_to_task_ids.json.gz")

0 commit comments

Comments
 (0)