Skip to content

Commit 192c4b7

Browse files
committed
converting Slater dataset files from msgpack to HDF5 format
1 parent eb2fb57 commit 192c4b7

File tree

5 files changed

+83
-82
lines changed

5 files changed

+83
-82
lines changed

atomdb/datasets/datasets_data.h5

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:7b5900e9c56c2a5c9e99a6f4f7ef9a7041789d81b3b12cbe7d03e75a70d7dc10
3-
size 46560562
2+
oid sha256:afcf1e437f143d5861f8c30cd890cc7996c40ddc12bb112d949f4db64537ed74
3+
size 922707071

atomdb/datasets/slater/h5file_creator.py

Lines changed: 50 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -10,51 +10,51 @@
1010
# Suppresses NaturalNameWarning warnings from PyTables.
1111
warnings.filterwarnings("ignore", category=pt.NaturalNameWarning)
1212

13-
max_norba = 30 # needs to be calculated
13+
max_norba = 56
1414

1515
SLATER_PROPERTY_CONFIGS = [
1616
{
17-
"SpeciesInfo": "elem", #
17+
"SpeciesInfo": "elem",
1818
"type": "string",
1919
},
2020
{
21-
"SpeciesInfo": "nexc", #
21+
"SpeciesInfo": "nexc",
2222
"type": "int",
2323
},
2424
{
25-
"SpeciesInfo": "charge", #
25+
"SpeciesInfo": "charge",
2626
"type": "int",
2727
},
2828
{
29-
"SpeciesInfo": "mult", #
29+
"SpeciesInfo": "mult",
3030
"type": "int",
3131
},
3232
{
33-
"SpeciesInfo": "nelec", #
33+
"SpeciesInfo": "nelec",
3434
"type": "int",
3535
},
3636
{
37-
"SpeciesInfo": "nspin", #
37+
"SpeciesInfo": "nspin",
3838
"type": "int",
3939
},
4040
{
41-
"SpeciesInfo": "energy", #
41+
"SpeciesInfo": "energy",
4242
"type": "float",
4343
},
4444
{
45-
"SpeciesInfo": "ip", #
45+
"SpeciesInfo": "ip",
4646
"type": "float",
4747
},
4848
{
49-
"SpeciesInfo": "mu", #
49+
"SpeciesInfo": "mu",
5050
"type": "float",
5151
},
5252
{
53-
"SpeciesInfo": "eta", #
53+
"SpeciesInfo": "eta",
5454
"type": "float",
5555
},
5656
{
57-
"SpeciesInfo": "nbasis", #
57+
"SpeciesInfo": "nbasis",
5858
"type": "int",
5959
},
6060
{
@@ -133,24 +133,24 @@
133133
"folder": "DensityLaplacian",
134134
"spins": "no",
135135
},
136-
# {
137-
# 'Carray_property': 'mo_ked_a',
138-
# 'table_name': 'mo_ked_a',
139-
# 'folder': 'KineticEnergyDensity',
140-
# 'spins': 'yes'
141-
# },
142-
# {
143-
# 'Carray_property': 'mo_ked_b',
144-
# 'table_name': 'mo_ked_b',
145-
# 'folder': 'KineticEnergyDensity',
146-
# 'spins': 'yes'
147-
# },
148-
# {
149-
# 'Carray_property': 'ked_tot',
150-
# 'table_name': 'ked_tot',
151-
# 'folder': 'KineticEnergyDensity',
152-
# 'spins': 'no'
153-
# }
136+
{
137+
"Carray_property": "mo_ked_a",
138+
"table_name": "mo_ked_a",
139+
"folder": "KineticEnergyDensity",
140+
"spins": "yes",
141+
},
142+
{
143+
"Carray_property": "mo_ked_b",
144+
"table_name": "mo_ked_b",
145+
"folder": "KineticEnergyDensity",
146+
"spins": "yes",
147+
},
148+
{
149+
"Carray_property": "ked_tot",
150+
"table_name": "ked_tot",
151+
"folder": "KineticEnergyDensity",
152+
"spins": "no",
153+
},
154154
]
155155

156156

@@ -257,9 +257,12 @@ def create_properties_arrays(hdf5_file, parent_folder, table_name, description,
257257
description (str): Description of the table.
258258
data (numpy.ndarray): The array data to store in the table.
259259
"""
260+
filters = pt.Filters(complevel=5, complib="blosc2")
260261

261262
# Create the table and populate the data
262-
table = hdf5_file.create_table(parent_folder, table_name, ArrayPropertyDescription, description)
263+
table = hdf5_file.create_table(
264+
parent_folder, table_name, ArrayPropertyDescription, description, filters=filters
265+
)
263266
row = table.row
264267
padded_data = np.pad(data, (0, max_norba - len(data)), "constant", constant_values=0)
265268
row["value"] = padded_data
@@ -278,9 +281,12 @@ def create_spins_array(h5file, parent_folder, key, array_data, shape):
278281
shape (int): The total size of the CArray.
279282
"""
280283
data_length = len(array_data)
284+
filters = pt.Filters(complevel=5, complib="blosc2")
281285

282286
# Create the CArray and populate the data
283-
array = h5file.create_carray(parent_folder, key, pt.Float64Atom(), shape=(shape,))
287+
array = h5file.create_carray(
288+
parent_folder, key, pt.Float64Atom(), shape=(shape,), filters=filters
289+
)
284290
array[:data_length] = array_data
285291
array[data_length:] = 0
286292

@@ -294,24 +300,29 @@ def create_tot_array(h5file, parent_folder, key, array_data):
294300
key (str): Name of the CArray.
295301
array_data (numpy.ndarray): The array data to store in the CArray.
296302
"""
303+
data_length = len(array_data)
304+
filters = pt.Filters(complevel=5, complib="blosc2")
297305

298306
# Create the CArray and populate the data
299307
tot_gradient_array = h5file.create_carray(
300-
parent_folder, key, pt.Float64Atom(), shape=(NPOINTS,)
308+
parent_folder, key, pt.Float64Atom(), shape=(NPOINTS,), filters=filters
301309
)
302-
tot_gradient_array[:] = array_data
310+
if data_length < NPOINTS:
311+
tot_gradient_array[:data_length] = array_data
312+
tot_gradient_array[data_length:] = 0
313+
314+
else:
315+
tot_gradient_array[:] = array_data
303316

304317

305-
def create_hdf5_file(DATASETS_H5FILE, fields, dataset):
318+
def create_hdf5_file(DATASETS_H5FILE, fields, dataset, mult):
306319
"""Creates an HDF5 folder with structured data for a specific dataset and element.
307320
308321
Args:
322+
DATASETS_H5FILE (tables.File): An open PyTables HDF5 file object to store the data.
309323
fields (dataclass): A dataclass containing the fields to store in the HDF5 file.
310324
dataset (str): Name of the dataset.
311-
elem (str): Element symbol.
312-
charge (int): Charge of the system.
313-
mult (int): Multiplicity of the system.
314-
nexc (int): Number of excitations.
325+
mult (int): Multiplicity.
315326
"""
316327
fields = asdict(fields)
317328
dataset = dataset.lower()
@@ -320,7 +331,6 @@ def create_hdf5_file(DATASETS_H5FILE, fields, dataset):
320331
elem = fields["elem"]
321332
nexc = fields["nexc"]
322333
atnum = element_symbol_map[elem][ElementAttr.atnum]
323-
mult = get_scalar_data("mult", atnum, fields["nelec"])
324334
charge = atnum - fields["nelec"]
325335

326336
# charge and mult can be calculated (instead of passing them)?

atomdb/periodic_test.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,21 @@ class ElementAttr(IntEnum):
3737

3838

3939
def get_scalar_data(prop_name, atnum, nelec):
40+
"""
41+
Get a scalar property value for a given element.
42+
43+
Args:
44+
prop_name (str): Property name to retrieve.
45+
atnum (int): Atomic number of the element.
46+
nelec (int): Number of electrons in the species (the charge is computed as atnum - nelec).
47+
48+
Returns:
49+
int | float | str | dict[str, float] | None:
50+
- int, float, or str for single-valued properties.
51+
- dict for properties with multiple sources.
52+
- None if no value is available for the requested property.
53+
"""
54+
4055
charge = atnum - nelec
4156

4257
if charge != 0 and prop_name not in ["atmass", "elem", "atnum", "name"]:
@@ -73,6 +88,13 @@ def get_scalar_data(prop_name, atnum, nelec):
7388

7489

7590
def map_element_symbol():
91+
"""
92+
Build a mapping of element symbols to their atomic number and name.
93+
94+
Returns:
95+
dict[str, tuple[int, str]]:
96+
Dictionary mapping element symbol → (atomic_number, name).
97+
"""
7698
element_symbol_map = {}
7799
for element_group in ELEMENTS_H5FILE.root.Elements:
78100
symbol = element_group.symbol[0]["value"].decode("utf-8").strip()

atomdb/species.py

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -733,31 +733,22 @@ def compile_species(
733733
fields = submodule.run(elem, charge, mult, nexc, dataset, datapath)
734734

735735
# dump the data to the HDF5 file
736-
dump(fields, dataset)
736+
dump(fields, dataset, mult)
737737

738738

739-
def dump(fields, dataset):
739+
def dump(fields, dataset, mult):
740740
r"""Dump the compiled species data to an HDF5 file in the AtomDB database.
741741
742742
Parameters
743743
----------
744-
fields : dict
745-
Dictionary containing the compiled data fields for the species.
746-
dataset : str
747-
Name of the dataset selected.
748-
elem : str
749-
Element symbol.
750-
charge : int
751-
Charge.
752-
mult : int
753-
Multiplicity.
754-
nexc : int, optional
755-
Excitation level, by default 0.
744+
fields (dataclass): A dataclass containing the fields to store in the HDF5 file.
745+
dataset (str): Name of the dataset.
746+
mult (int): Multiplicity.
756747
"""
757748

758749
# Save data to the HDF5 file
759750
element_folder_creator = import_module(f"atomdb.datasets.{dataset}.h5file_creator")
760-
element_folder_creator.create_hdf5_file(DATASETS_H5FILE, fields, dataset)
751+
element_folder_creator.create_hdf5_file(DATASETS_H5FILE, fields, dataset, mult)
761752

762753

763754
def load(
@@ -851,7 +842,7 @@ def datafile(
851842
852843
Returns
853844
-------
854-
str
845+
list
855846
Paths to the database files of a species in AtomDB.
856847
857848
"""
@@ -901,6 +892,8 @@ def get_species_data(folder_path, elem, DATASET_PROPERTY_CONFIGS):
901892
----------
902893
folder_path : str
903894
Path to the HDF5 folder containing the species data.
895+
elem : str
896+
Element symbol.
904897
DATASET_PROPERTY_CONFIGS : list
905898
list of configuration dictionaries.
906899

atomdb/test.py

Lines changed: 0 additions & 24 deletions
This file was deleted.

0 commit comments

Comments
 (0)