Convert Slater dataset files from msgpack to HDF5 format

enjyashraf18 · enjyashraf18 · commit 192c4b762936 · 2025-08-10T01:31:42.000+03:00
diff --git a/atomdb/datasets/datasets_data.h5 b/atomdb/datasets/datasets_data.h5
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b5900e9c56c2a5c9e99a6f4f7ef9a7041789d81b3b12cbe7d03e75a70d7dc10
-size 46560562
+oid sha256:afcf1e437f143d5861f8c30cd890cc7996c40ddc12bb112d949f4db64537ed74
+size 922707071
diff --git a/atomdb/datasets/slater/h5file_creator.py b/atomdb/datasets/slater/h5file_creator.py
@@ -10,51 +10,51 @@
 # Suppresses NaturalNameWarning warnings from PyTables.
 warnings.filterwarnings("ignore", category=pt.NaturalNameWarning)
 
-max_norba = 30  # needs to be calculated
+max_norba = 56
 
 SLATER_PROPERTY_CONFIGS = [
     {
-        "SpeciesInfo": "elem",  #
+        "SpeciesInfo": "elem",
         "type": "string",
     },
     {
-        "SpeciesInfo": "nexc",  #
+        "SpeciesInfo": "nexc",
         "type": "int",
     },
     {
-        "SpeciesInfo": "charge",  #
+        "SpeciesInfo": "charge",
         "type": "int",
     },
     {
-        "SpeciesInfo": "mult",  #
+        "SpeciesInfo": "mult",
         "type": "int",
     },
     {
-        "SpeciesInfo": "nelec",  #
+        "SpeciesInfo": "nelec",
         "type": "int",
     },
     {
-        "SpeciesInfo": "nspin",  #
+        "SpeciesInfo": "nspin",
         "type": "int",
     },
     {
-        "SpeciesInfo": "energy",  #
+        "SpeciesInfo": "energy",
         "type": "float",
     },
     {
-        "SpeciesInfo": "ip",  #
+        "SpeciesInfo": "ip",
         "type": "float",
     },
     {
-        "SpeciesInfo": "mu",  #
+        "SpeciesInfo": "mu",
         "type": "float",
     },
     {
-        "SpeciesInfo": "eta",  #
+        "SpeciesInfo": "eta",
         "type": "float",
     },
     {
-        "SpeciesInfo": "nbasis",  #
+        "SpeciesInfo": "nbasis",
         "type": "int",
     },
     {
@@ -133,24 +133,24 @@
         "folder": "DensityLaplacian",
         "spins": "no",
     },
-    # {
-    #     'Carray_property': 'mo_ked_a',
-    #     'table_name': 'mo_ked_a',
-    #     'folder': 'KineticEnergyDensity',
-    #     'spins': 'yes'
-    # },
-    # {
-    #     'Carray_property': 'mo_ked_b',
-    #     'table_name': 'mo_ked_b',
-    #     'folder': 'KineticEnergyDensity',
-    #     'spins': 'yes'
-    # },
-    # {
-    #     'Carray_property': 'ked_tot',
-    #     'table_name': 'ked_tot',
-    #     'folder': 'KineticEnergyDensity',
-    #     'spins': 'no'
-    # }
+    {
+        "Carray_property": "mo_ked_a",
+        "table_name": "mo_ked_a",
+        "folder": "KineticEnergyDensity",
+        "spins": "yes",
+    },
+    {
+        "Carray_property": "mo_ked_b",
+        "table_name": "mo_ked_b",
+        "folder": "KineticEnergyDensity",
+        "spins": "yes",
+    },
+    {
+        "Carray_property": "ked_tot",
+        "table_name": "ked_tot",
+        "folder": "KineticEnergyDensity",
+        "spins": "no",
+    },
 ]
 
 
@@ -257,9 +257,12 @@ def create_properties_arrays(hdf5_file, parent_folder, table_name, description,
         description (str): Description of the table.
         data (numpy.ndarray): The array data to store in the table.
     """
+    filters = pt.Filters(complevel=5, complib="blosc2")
 
     # Create the table and populate the data
-    table = hdf5_file.create_table(parent_folder, table_name, ArrayPropertyDescription, description)
+    table = hdf5_file.create_table(
+        parent_folder, table_name, ArrayPropertyDescription, description, filters=filters
+    )
     row = table.row
     padded_data = np.pad(data, (0, max_norba - len(data)), "constant", constant_values=0)
     row["value"] = padded_data
@@ -278,9 +281,12 @@ def create_spins_array(h5file, parent_folder, key, array_data, shape):
         shape (int): The total size of the CArray.
     """
     data_length = len(array_data)
+    filters = pt.Filters(complevel=5, complib="blosc2")
 
     # Create the CArray and populate the data
-    array = h5file.create_carray(parent_folder, key, pt.Float64Atom(), shape=(shape,))
+    array = h5file.create_carray(
+        parent_folder, key, pt.Float64Atom(), shape=(shape,), filters=filters
+    )
     array[:data_length] = array_data
     array[data_length:] = 0
 
@@ -294,24 +300,29 @@ def create_tot_array(h5file, parent_folder, key, array_data):
         key (str): Name of the CArray.
         array_data (numpy.ndarray): The array data to store in the CArray.
     """
+    data_length = len(array_data)
+    filters = pt.Filters(complevel=5, complib="blosc2")
 
     # Create the CArray and populate the data
     tot_gradient_array = h5file.create_carray(
-        parent_folder, key, pt.Float64Atom(), shape=(NPOINTS,)
+        parent_folder, key, pt.Float64Atom(), shape=(NPOINTS,), filters=filters
     )
-    tot_gradient_array[:] = array_data
+    if data_length < NPOINTS:
+        tot_gradient_array[:data_length] = array_data
+        tot_gradient_array[data_length:] = 0
+
+    else:
+        tot_gradient_array[:] = array_data
 
 
-def create_hdf5_file(DATASETS_H5FILE, fields, dataset):
+def create_hdf5_file(DATASETS_H5FILE, fields, dataset, mult):
     """Creates an HDF5 folder with structured data for a specific dataset and element.
 
     Args:
+        DATASETS_H5FILE (tables.File): An open PyTables HDF5 file object to store the data.
         fields (dataclass): A dataclass containing the fields to store in the HDF5 file.
         dataset (str): Name of the dataset.
-        elem (str): Element symbol.
-        charge (int): Charge of the system.
-        mult (int): Multiplicity of the system.
-        nexc (int): Number of excitations.
+        mult (int): Multiplicity.
     """
     fields = asdict(fields)
     dataset = dataset.lower()
@@ -320,7 +331,6 @@ def create_hdf5_file(DATASETS_H5FILE, fields, dataset):
     elem = fields["elem"]
     nexc = fields["nexc"]
     atnum = element_symbol_map[elem][ElementAttr.atnum]
-    mult = get_scalar_data("mult", atnum, fields["nelec"])
     charge = atnum - fields["nelec"]
 
     # charge and mult can be calculated (instead of passing them)?
diff --git a/atomdb/periodic_test.py b/atomdb/periodic_test.py
@@ -37,6 +37,21 @@ class ElementAttr(IntEnum):
 
 
 def get_scalar_data(prop_name, atnum, nelec):
+    """
+    Get a scalar property value for a given element.
+
+    Args:
+        prop_name (str): Property name to retrieve.
+        atnum (int): Atomic number of the element.
+        nelec (int): Number of electrons in the element.
+
+    Returns:
+        int | float | str | dict[str, float] | None:
+            - int, float, or str for single-valued properties.
+            - dict for properties with multiple sources.
+            - None
+    """
+
     charge = atnum - nelec
 
     if charge != 0 and prop_name not in ["atmass", "elem", "atnum", "name"]:
@@ -73,6 +88,13 @@ def get_scalar_data(prop_name, atnum, nelec):
 
 
 def map_element_symbol():
+    """
+    Build a mapping of element symbols to their atomic number and name.
+
+    Returns:
+        dict[str, tuple[int, str]]:
+            Dictionary mapping element symbol → (atomic_number, name).
+    """
     element_symbol_map = {}
     for element_group in ELEMENTS_H5FILE.root.Elements:
         symbol = element_group.symbol[0]["value"].decode("utf-8").strip()
diff --git a/atomdb/species.py b/atomdb/species.py
@@ -733,31 +733,22 @@ def compile_species(
     fields = submodule.run(elem, charge, mult, nexc, dataset, datapath)
 
     # dump the data to the HDF5 file
-    dump(fields, dataset)
+    dump(fields, dataset, mult)
 
 
-def dump(fields, dataset):
+def dump(fields, dataset, mult):
     r"""Dump the compiled species data to an HDF5 file in the AtomDB database.
 
     Parameters
     ----------
-    fields : dict
-        Dictionary containing the compiled data fields for the species.
-    dataset : str
-        Name of the dataset selected.
-    elem : str
-        Element symbol.
-    charge : int
-        Charge.
-    mult : int
-        Multiplicity.
-    nexc : int, optional
-        Excitation level, by default 0.
+    fields (dataclass): A dataclass containing the fields to store in the HDF5 file.
+    dataset (str): Name of the dataset.
+    mult (int): Multiplicity.
     """
 
     # Save data to the HDF5 file
     element_folder_creator = import_module(f"atomdb.datasets.{dataset}.h5file_creator")
-    element_folder_creator.create_hdf5_file(DATASETS_H5FILE, fields, dataset)
+    element_folder_creator.create_hdf5_file(DATASETS_H5FILE, fields, dataset, mult)
 
 
 def load(
@@ -851,7 +842,7 @@ def datafile(
 
     Returns
     -------
-    str
+    list
         paths to the database file of a species in AtomDB.
 
     """
@@ -901,6 +892,8 @@ def get_species_data(folder_path, elem, DATASET_PROPERTY_CONFIGS):
     ----------
     folder_path : str
         Path to the HDF5 folder containing the species data.
+    elem : str
+        Element symbol.
     DATASET_PROPERTY_CONFIGS : list
         list of configuration dictionaries.
 
diff --git a/atomdb/test.py b/atomdb/test.py