Skip to content

Commit f0eb59b

Browse files
committed
refactor uhf_augccpvdz dataset / add migration script
1 parent f2f2499 commit f0eb59b

File tree

2 files changed

+451
-13
lines changed

2 files changed

+451
-13
lines changed
Lines changed: 384 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,384 @@
1+
import warnings
2+
import numpy as np
3+
from importlib_resources import files
4+
import tables as pt
5+
from dataclasses import asdict
6+
from atomdb.datasets.uhf_augccpvdz.run import NPOINTS
7+
from atomdb.periodic_test import element_symbol_map, ElementAttr
8+
9+
10+
# Silence PyTables' NaturalNameWarning category for everything in this process.
warnings.filterwarnings(action="ignore", category=pt.NaturalNameWarning)

# Fixed width of the per-orbital array columns: it is the column shape of
# ArrayPropertyDescription, the padding target in create_properties_arrays and
# a factor of the spin-resolved CArray size in create_hdf5_file.
# TODO: placeholder value, needs to be calculated.
max_norba = 100
14+
15+
# Declarative list of everything create_hdf5_file writes for one species.
# The key present in an entry selects how it is stored:
#   "SpeciesInfo"     -> column of the "species_info" table ("type" picks the coercion)
#   "property"        -> single-row table under the "Properties" group
#   "array_property"  -> single-row array table under the "Properties" group
#   "Carray_property" -> CArray under the group named by "folder"; "spins" is
#                        "yes" for the spin-resolved writer and "no" for the
#                        total-quantity writer
UHF_AUGCCPVDZ_PROPERTY_CONFIGS = [
    # --- columns of the species_info table ---
    {
        "SpeciesInfo": "elem",
        "type": "string",
    },
    {
        "SpeciesInfo": "nexc",
        "type": "int",
    },
    {
        "SpeciesInfo": "charge",
        "type": "int",
    },
    {
        "SpeciesInfo": "mult",
        "type": "int",
    },
    {
        "SpeciesInfo": "nelec",
        "type": "int",
    },
    {
        "SpeciesInfo": "nspin",
        "type": "int",
    },
    {
        "SpeciesInfo": "energy",
        "type": "float",
    },
    {
        "SpeciesInfo": "ip",
        "type": "float",
    },
    {
        "SpeciesInfo": "mu",
        "type": "float",
    },
    {
        "SpeciesInfo": "eta",
        "type": "float",
    },
    {
        "SpeciesInfo": "nbasis",
        "type": "int",
    },
    # --- scalar property tables ---
    {
        "property": "obasis_name",
        "table_name": "obasis_name",
        "description": "Orbital basis name",
        "type": "string",
    },
    # --- per-orbital array tables ---
    {
        "array_property": "mo_energy_a",
        "table_name": "mo_energy_a",
        "description": "Alpha MO Energies",
    },
    {
        "array_property": "mo_energy_b",
        "table_name": "mo_energy_b",
        "description": "Beta MO Energies",
    },
    {
        "array_property": "mo_occs_a",
        "table_name": "mo_occs_a",
        "description": "Alpha MO Occupations",
    },
    {
        "array_property": "mo_occs_b",
        "table_name": "mo_occs_b",
        # Was "Alpha MO Energies", a copy-paste slip from the alpha entries.
        "description": "Beta MO Occupations",
    },
    # --- CArrays ---
    {"Carray_property": "rs", "table_name": "rs", "folder": "RadialGrid", "spins": "no"},
    {
        "Carray_property": "mo_dens_a",
        "table_name": "mo_dens_a",
        "folder": "Density",
        "spins": "yes",
    },
    {
        "Carray_property": "mo_dens_b",
        "table_name": "mo_dens_b",
        "folder": "Density",
        "spins": "yes",
    },
    {"Carray_property": "dens_tot", "table_name": "dens_tot", "folder": "Density", "spins": "no"},
    {
        "Carray_property": "mo_ked_a",
        "table_name": "mo_ked_a",
        "folder": "KineticEnergyDensity",
        "spins": "yes",
    },
    {
        "Carray_property": "mo_ked_b",
        "table_name": "mo_ked_b",
        "folder": "KineticEnergyDensity",
        "spins": "yes",
    },
    {
        "Carray_property": "ked_tot",
        "table_name": "ked_tot",
        "folder": "KineticEnergyDensity",
        "spins": "no",
    },
]
119+
120+
121+
class IntPropertyDescription(pt.IsDescription):
    """Row schema for a scalar integer property table (one ``value`` column)."""

    # Used by create_properties_tables when the config "type" is "int".
    value = pt.Int32Col()
123+
124+
125+
class StringPropertyDescription(pt.IsDescription):
    """Row schema for a scalar string property table (one ``value`` column)."""

    # Used by create_properties_tables when the config "type" is "string".
    # The column has a fixed item size of 25.
    value = pt.StringCol(itemsize=25)
127+
128+
129+
class FloatPropertyDescription(pt.IsDescription):
    """Row schema for a scalar float property table (one ``value`` column)."""

    # Used by create_properties_tables when the config "type" is "float".
    value = pt.Float64Col()
131+
132+
133+
class ArrayPropertyDescription(pt.IsDescription):
    """Row schema for an array property table (one fixed-width ``value`` column).

    The column width is fixed to ``max_norba`` when this class is defined
    (static definition), so writers must pad shorter arrays to that length.
    """

    value = pt.Float64Col(shape=(max_norba,))
136+
137+
138+
class SpeciesInfo(pt.IsDescription):
    """Row schema of the ``species_info`` table (one row per stored species)."""

    # Text column (fixed item size of 25).
    elem = pt.StringCol(itemsize=25)

    # Integer columns.
    charge = pt.Int32Col()
    mult = pt.Int32Col()
    nexc = pt.Int32Col()
    nelec = pt.Int32Col()
    nspin = pt.Int32Col()
    nbasis = pt.Int32Col()

    # Floating-point columns.
    energy = pt.Float64Col()
    ip = pt.Float64Col()
    mu = pt.Float64Col()
    eta = pt.Float64Col()
152+
153+
154+
def create_species_info_table(species_info_table_row, prop_name, prop_type, value):
    """Store one coerced value in a column of a species_info row.

    Despite the name, no table is created: the value is only assigned to
    ``species_info_table_row[prop_name]``. Appending the row is left to the caller.

    Args:
        species_info_table_row: Row object (any item-assignable mapping) holding the columns.
        prop_name (str): Name of the column to fill.
        prop_type (str): Data type of the property ('int', 'string', or 'float').
            Any other value stores ``value`` unchanged.
        value: The value to store. ``None`` becomes 0, "" or NaN depending on
            ``prop_type``.
    """
    # (type tag, converter, placeholder used when the value is missing)
    coercions = (
        ("int", int, 0),
        ("string", str, ""),
        ("float", float, np.nan),
    )
    for type_tag, convert, placeholder in coercions:
        if prop_type == type_tag:
            value = placeholder if value is None else convert(value)
            break

    species_info_table_row[prop_name] = value
174+
175+
176+
def create_properties_tables(hdf5_file, parent_folder, config, value):
    """Create a single-row table holding one scalar property.

    Args:
        hdf5_file (tables.File): The open HDF5 file where the table will be created.
        parent_folder (tables.Group): The parent folder in the HDF5 file where the table will be stored.
        config (dict): Configuration dictionary containing table metadata, including:
            - 'table_name': Name of the table.
            - 'description': Description of the table.
            - 'type': Data type of the property ('int', 'string', or 'float').
        value: The value to store in the table. ``None`` is stored as 0, ""
            or NaN depending on the type.

    Raises:
        KeyError: If ``config`` lacks 'table_name', 'description' or 'type'.
        ValueError: If ``config['type']`` is not 'int', 'string' or 'float'.
    """
    # Extract table metadata from config.
    table_name = config["table_name"]
    table_description = config["description"]
    prop_type = config["type"]

    if prop_type == "int":
        row_description = IntPropertyDescription
        value = int(value) if value is not None else 0

    elif prop_type == "string":
        row_description = StringPropertyDescription
        value = str(value) if value is not None else ""

    elif prop_type == "float":
        row_description = FloatPropertyDescription
        value = float(value) if value is not None else np.nan

    else:
        # Fail before touching the file; previously an unknown type left
        # row_description unbound and surfaced as an unrelated error.
        raise ValueError(
            f"Unsupported property type {prop_type!r} for table {table_name!r}; "
            "expected 'int', 'string' or 'float'."
        )

    # Create the table and populate the data
    table = hdf5_file.create_table(parent_folder, table_name, row_description, table_description)
    row = table.row
    row["value"] = value
    row.append()
    table.flush()
212+
213+
214+
def create_properties_arrays(hdf5_file, parent_folder, table_name, description, data):
    """Create a single-row table holding one array property, zero-padded to ``max_norba``.

    Args:
        hdf5_file (tables.File): The open HDF5 file where the array will be created.
        parent_folder (tables.Group): The parent folder in the HDF5 file where the table will be stored.
        table_name (str): Name of the table to create.
        description (str): Description of the table.
        data (numpy.ndarray): The array data to store in the table; at most
            ``max_norba`` values.

    Raises:
        ValueError: If ``data`` holds more than ``max_norba`` values.
    """
    n_values = len(data)
    # Validate before creating anything: a too-long array used to fail inside
    # np.pad (negative pad width) after the table node had already been created.
    if n_values > max_norba:
        raise ValueError(
            f"Cannot store {table_name!r}: {n_values} values exceed the fixed "
            f"column width max_norba={max_norba}."
        )

    filters = pt.Filters(complevel=5, complib="blosc2:lz4")

    # Create the table and populate the data
    table = hdf5_file.create_table(
        parent_folder, table_name, ArrayPropertyDescription, description, filters=filters
    )
    row = table.row
    # Trailing zeros fill the row up to the fixed column width.
    row["value"] = np.pad(data, (0, max_norba - n_values), "constant", constant_values=0)
    row.append()
    table.flush()
235+
236+
237+
def create_spins_array(h5file, parent_folder, key, array_data, shape):
    """Write spin-dependent array data into a new compressed 1-D CArray.

    The first ``len(array_data)`` elements receive the data and the rest of
    the CArray is set to zero.

    Args:
        h5file (tables.File): The open HDF5 file where the CArray will be created.
        parent_folder (tables.Group): The parent folder in the HDF5 file where the CArray will be stored.
        key (str): Name of the CArray.
        array_data (numpy.ndarray): The array data to store in the CArray.
        shape (int): The total size of the CArray.
    """
    n_values = len(array_data)
    compression = pt.Filters(complevel=5, complib="blosc2:lz4")

    carray = h5file.create_carray(
        parent_folder,
        key,
        pt.Float64Atom(),
        shape=(shape,),
        filters=compression,
    )

    # Data goes at the front; everything after it is zero padding.
    data_part = slice(None, n_values)
    padding_part = slice(n_values, None)
    carray[data_part] = array_data
    carray[padding_part] = 0
256+
257+
258+
def create_tot_array(h5file, parent_folder, key, array_data):
    """Write total (non-spin-dependent) array data into a new compressed CArray.

    The CArray always has ``NPOINTS`` elements. Shorter data is followed by
    zeros; otherwise the data is assigned to the whole array.

    Args:
        h5file (tables.File): The open HDF5 file where the CArray will be created.
        parent_folder (tables.Group): The parent folder in the HDF5 file where the CArray will be stored.
        key (str): Name of the CArray.
        array_data (numpy.ndarray): The array data to store in the CArray.
    """
    n_values = len(array_data)
    compression = pt.Filters(complevel=5, complib="blosc2:lz4")

    carray = h5file.create_carray(
        parent_folder,
        key,
        pt.Float64Atom(),
        shape=(NPOINTS,),
        filters=compression,
    )

    if n_values >= NPOINTS:
        # No padding needed: assign to the full array in one go.
        carray[:] = array_data
        return

    carray[:n_values] = array_data
    carray[n_values:] = 0
280+
281+
282+
def create_hdf5_file(DATASETS_H5FILE, fields, dataset, mult):
    """Write one species' data into its own group tree of an open HDF5 file.

    The species lands in ``/Datasets/<dataset>/<elem>/<elem>_<charge>_<mult>_<nexc>``
    (numbers formatted with ``03d``), where the charge is the atomic number
    minus ``nelec``. Below that group the function creates six sub-groups, a
    ``species_info`` table and one node per entry of
    ``UHF_AUGCCPVDZ_PROPERTY_CONFIGS``.

    Args:
        DATASETS_H5FILE (tables.File): An open PyTables HDF5 file object to store the data.
        fields (dataclass): A dataclass instance containing the fields to store in the HDF5 file.
        dataset (str): Name of the dataset (lower-cased before use).
        mult (int): Multiplicity.
    """
    record = asdict(fields)
    dataset = dataset.lower()
    spin_array_size = NPOINTS * max_norba

    elem = record["elem"]
    nexc = record["nexc"]
    atnum = element_symbol_map[elem][ElementAttr.atnum]
    charge = atnum - record["nelec"]
    species_label = f"{elem}_{charge:03d}_{mult:03d}_{nexc:03d}"

    # charge and mult can be calculated (instead of passing them)?
    dataset_path = f"/Datasets/{dataset}"
    elem_path = f"{dataset_path}/{elem}"
    species_path = f"{elem_path}/{species_label}"

    # Create the dataset / element / species groups, outermost first, skipping
    # any that already exist. "/Datasets" itself is not created here.
    group_chain = (
        ("/Datasets", dataset, f"{dataset} Data"),
        (dataset_path, elem, f"{elem} Data"),
        (elem_path, species_label, f"{elem} {charge} {mult} {nexc} Data"),
    )
    for parent_path, group_name, group_title in group_chain:
        if f"{parent_path}/{group_name}" not in DATASETS_H5FILE:
            DATASETS_H5FILE.create_group(parent_path, group_name, group_title)

    # The per-species sub-groups are created unconditionally.
    # NOTE(review): presumably this fails if the species was already written; confirm.
    # DensityGradient and DensityLaplacian are not referenced by any "folder"
    # entry of UHF_AUGCCPVDZ_PROPERTY_CONFIGS, so nothing below writes to them.
    subgroup_titles = (
        ("Properties", "Properties Data"),
        ("RadialGrid", "Radial Grid Data"),
        ("Density", "Density Data"),
        ("DensityGradient", "Density Gradient Data"),
        ("DensityLaplacian", "Density Laplacian Data"),
        ("KineticEnergyDensity", "Kinetic Energy Density Data"),
    )
    folders = {
        group_name: DATASETS_H5FILE.create_group(species_path, group_name, group_title)
        for group_name, group_title in subgroup_titles
    }
    properties_group = folders["Properties"]

    # Create basic species table and its row
    species_table = DATASETS_H5FILE.create_table(
        properties_group, "species_info", SpeciesInfo, "Species Information"
    )
    species_row = species_table.row

    for config in UHF_AUGCCPVDZ_PROPERTY_CONFIGS:
        # Column of the species_info row.
        if "SpeciesInfo" in config:
            prop_name = config["SpeciesInfo"]
            create_species_info_table(species_row, prop_name, config["type"], record[prop_name])
            continue

        # Scalar property stored in its own table.
        if "property" in config:
            prop_name = config["property"]
            create_properties_tables(DATASETS_H5FILE, properties_group, config, record[prop_name])
            continue

        # Array property stored in its own table.
        if "array_property" in config:
            prop_name = config["array_property"]
            create_properties_arrays(
                DATASETS_H5FILE,
                properties_group,
                config["table_name"],
                config["description"],
                record[prop_name],
            )
            continue

        if "Carray_property" not in config:
            continue

        # CArray stored in the group named by the config's "folder".
        prop_name = config["Carray_property"]
        target_group = folders[config["folder"]]
        spin_flag = config["spins"]
        if spin_flag == "yes":
            create_spins_array(
                DATASETS_H5FILE,
                target_group,
                config["table_name"],
                record[prop_name],
                spin_array_size,
            )
        elif spin_flag == "no":
            create_tot_array(
                DATASETS_H5FILE, target_group, config["table_name"], record[prop_name]
            )

    # The row is appended only after every SpeciesInfo column has been filled.
    species_row.append()
    species_table.flush()

0 commit comments

Comments
 (0)