Commit 6c29e76

Merge pull request #25 from santi921/fix/timings-json-path-mismatch
Fix/timings json path mismatch
2 parents d7eae84 + 1e9db29 commit 6c29e76

10 files changed

Lines changed: 410 additions & 42 deletions

.gitignore

Lines changed: 16 additions & 0 deletions
@@ -38,5 +38,21 @@
 *wfx
 *xyz
 
+# working folders for dev
+examples/
+
 # W&B directories
 wandb/
+# ai documents
+docs/
+
+# Always track test fixtures regardless of extension rules above
+!tests/test_files/
+!tests/test_files/*/
+!tests/test_files/**/
+!tests/test_files/**/*.inp
+!tests/test_files/**/*.in
+!tests/test_files/**/*.json
+!tests/test_files/**/*.txt
+!tests/test_files/**/*.lmdb
+!tests/test_files/**/*.lmdb-lock

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ full-runner-parsl-nersc = "qtaim_gen.source.scripts.full_runner_parsl_nersc:main"
 generator-single-runner = "qtaim_gen.source.scripts.generator_run:main"
 
 find-empty-compressed = "qtaim_gen.source.scripts.helpers.find_empty_compressed:main"
+count-orca-json = "qtaim_gen.source.scripts.helpers.count_orca_json:main"
 check-res-wfn = "qtaim_gen.source.scripts.helpers.check_res_wfn:main"
 check-res-rxn-json = "qtaim_gen.source.scripts.helpers.check_res_rxn_json:main"
 folder-xyz-molecules-to-pkl = "qtaim_gen.source.scripts.helpers.folder_xyz_molecules_to_pkl:main"
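
The new entry maps the count-orca-json console command to a main() function in count_orca_json.py. That module's body is not shown in this commit view; a minimal sketch of the shape a [project.scripts] target needs (the argument handling and counting logic below are hypothetical, only the module path and main() name come from the diff):

# Hypothetical body for qtaim_gen/source/scripts/helpers/count_orca_json.py;
# only the module path and the main() entry point come from the pyproject diff.
import argparse
import os

def main():
    parser = argparse.ArgumentParser(description="Count orca.json files under a root folder")
    parser.add_argument("root", help="directory tree to scan")
    args = parser.parse_args()
    # one hit per job folder that contains an orca.json
    count = sum(1 for _, _, files in os.walk(args.root) if "orca.json" in files)
    print(f"{count} folders contain orca.json")

if __name__ == "__main__":
    main()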

qtaim_gen/source/core/converter.py

Lines changed: 57 additions & 30 deletions
@@ -131,6 +131,7 @@ def __init__(self, config_dict: Dict[str, Any], config_path: str = None):
         self.config_dict = config_dict
         self.config_path = config_path
         self.restart = config_dict["restart"]
+        self._processed_source_keys: set = set()
 
 
         # Setup logging
@@ -169,7 +170,7 @@ def __init__(self, config_dict: Dict[str, Any], config_path: str = None):
         else:
             self.save_scaler = False
 
-        self.skip_keys = config_dict.get("filter_list", ["length", "scaled"])
+        self.skip_keys = list(config_dict.get("filter_list", ["length", "scaled"])) + ["processed_source_keys"]
 
         # Parallelization settings
         self.n_workers = config_dict.get("n_workers", 8)
@@ -249,14 +250,18 @@
         self.logger.info(f"Connected to output LMDB: {self.file}")
 
         if self.restart and os.path.exists(self.file):
-            # get all existing keys from the existing LMDB file and store in self.existing_keys to reference against
             with self.db.begin(write=False) as txn:
-                self.existing_keys = set()
-
-                cursor = txn.cursor()
-                for key, _ in cursor:
-                    if key.decode("ascii") not in self.skip_keys:
-                        self.existing_keys.add(key.decode("ascii"))
+                # prefer source-key metadata written by new-format converters
+                psk_raw = txn.get(b"processed_source_keys")
+                if psk_raw is not None:
+                    self.existing_keys = pickle.loads(psk_raw)
+                else:
+                    # backward compat: old-format LMDBs stored molecule IDs as keys
+                    self.existing_keys = set()
+                    cursor = txn.cursor()
+                    for key, _ in cursor:
+                        if key.decode("ascii") not in self.skip_keys:
+                            self.existing_keys.add(key.decode("ascii"))
 
 
         # handle scaled info
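
Why the metadata entry is needed: graph entries are now written under sequential integer keys (see the process() hunks below), so scanning LMDB keys no longer reveals which source molecules were already converted. A minimal round-trip sketch, assuming only lmdb and pickle as used above; the path and molecule IDs are illustrative:

# Minimal round-trip sketch of the processed_source_keys metadata entry.
# "graphs.lmdb" and the molecule IDs are illustrative.
import lmdb
import pickle

env = lmdb.open("graphs.lmdb", subdir=False, map_size=2**30)

# writer side: record which source molecule IDs were converted
with env.begin(write=True) as txn:
    txn.put(b"processed_source_keys", pickle.dumps({"mol-001", "mol-002"}, protocol=-1))

# restart side: prefer the metadata entry; fall back to a key scan for old LMDBs
with env.begin(write=False) as txn:
    psk_raw = txn.get(b"processed_source_keys")
    existing_keys = pickle.loads(psk_raw) if psk_raw is not None else set()

print(existing_keys)  # {'mol-001', 'mol-002'} (a set, order arbitrary)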
@@ -485,7 +490,11 @@ def scale_graphs_single(
                 if key_str not in self.config_dict["filter_list"]:
                     # process graph
                     try:
-                        graph = load_graph_from_serialized(pickle.loads(value))
+                        raw = pickle.loads(value)
+                        if isinstance(raw, dict):
+                            graph = load_graph_from_serialized(raw["molecule_graph"])
+                        else:
+                            graph = load_graph_from_serialized(raw)
                     except Exception as e:
                         self.logger.exception(f"Failed to load graph for key {key_str}: {e}")
                         continue
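
This dict-or-raw branch recurs below in merge_shards. A hypothetical helper (not part of this diff, which inlines the logic at each call site) capturing the backward-compatible read:

# Hypothetical helper capturing the repeated dict-or-raw branch; the actual
# diff inlines this logic at each call site.
import pickle

def load_graph_value(value, load_graph_from_serialized):
    raw = pickle.loads(value)
    if isinstance(raw, dict):
        # new format: {"molecule_graph": <serialized graph>}
        return load_graph_from_serialized(raw["molecule_graph"])
    # old format: the serialized graph was pickled directly
    return load_graph_from_serialized(raw)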
@@ -497,7 +506,7 @@
                     txn.put(
                         f"{key_str}".encode("ascii"),
                         pickle.dumps(
-                            serialize_graph(graph[0], ret=True), protocol=-1
+                            {"molecule_graph": serialize_graph(graph[0], ret=True)}, protocol=-1
                         ),
                     )
                 txn.commit()
@@ -690,9 +699,11 @@ def finalize(self, return_info=False, keys_to_iterate=None, processed_count=0):
             f"{lmdb_path}/label_scaler_iterative{shard_suffix}.pt"
         )
 
-        # last info on whether the graphs were scaled or not
+        # write metadata required by qtaim_embed's LMDBBaseDataset
         txn = self.db.begin(write=True)
+        txn.put("length".encode("ascii"), pickle.dumps(processed_count, protocol=-1))
         txn.put("scaled".encode("ascii"), pickle.dumps(False, protocol=-1))
+        txn.put("processed_source_keys".encode("ascii"), pickle.dumps(self._processed_source_keys, protocol=-1))
         txn.commit()
         self.db.close()
 
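finalize() now writes the three metadata entries a downstream consumer such as qtaim_embed's LMDBBaseDataset expects. A minimal sketch of reading them back; the key names come from the diff, the path is illustrative:

# Minimal sketch: reading back the metadata finalize() now writes.
# Key names come from the diff; "graphs.lmdb" is an illustrative path.
import lmdb
import pickle

env = lmdb.open("graphs.lmdb", subdir=False, readonly=True, lock=False)
with env.begin() as txn:
    length = pickle.loads(txn.get(b"length"))        # number of graph entries
    scaled = pickle.loads(txn.get(b"scaled"))        # False until scalers are applied
    source_keys = pickle.loads(txn.get(b"processed_source_keys"))  # set of molecule IDs
print(length, scaled, len(source_keys))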
@@ -852,18 +863,25 @@ def merge_shards(
         map_async=True
     )
 
-    # Copy all entries from shards
+    # Copy all entries from shards, re-numbering graph keys to avoid collisions
+    _merge_skip = {b"length", b"scaled", b"scaler_finalized", b"processed_source_keys"}
     total_copied = 0
+    global_idx = 0
     with merged_env.begin(write=True) as dst_txn:
         for i, lmdb_path in enumerate(shard_lmdbs):
             logger.info(f"Copying shard {i+1}/{len(shard_lmdbs)}")
             src_env = lmdb.open(lmdb_path, subdir=False, readonly=True, lock=False)
             with src_env.begin() as src_txn:
                 cursor = src_txn.cursor()
                 for key, value in cursor:
-                    dst_txn.put(key, value)
+                    if key in _merge_skip:
+                        continue
+                    dst_txn.put(f"{global_idx}".encode("ascii"), value)
+                    global_idx += 1
                     total_copied += 1
             src_env.close()
+        dst_txn.put(b"length", pickle.dumps(global_idx, protocol=-1))
+        dst_txn.put(b"scaled", pickle.dumps(False, protocol=-1))
 
     merged_env.close()
     logger.info(f"Merged {total_copied} entries")
@@ -919,7 +937,7 @@ def merge_shards(
     logger.info("Applying merged scalers to LMDB...")
     env = lmdb.open(output_path, subdir=False, map_size=map_size)
     count = 0
-    metadata_keys = {b'scaled', b'scaler_finalized', b'length'}
+    metadata_keys = {b'scaled', b'scaler_finalized', b'length', b'processed_source_keys'}
     with env.begin(write=True) as txn:
         cursor = txn.cursor()
         for key, value in cursor:
@@ -928,17 +946,20 @@
                 continue
 
             try:
-                # Deserialize: pickle.loads returns bytes, then deserialize to PyG HeteroData
-                serialized_bytes = pickle.loads(value)
-                graph = load_graph_from_serialized(serialized_bytes)
+                # Deserialize: pickle.loads may return dict or raw bytes depending on format
+                raw = pickle.loads(value)
+                if isinstance(raw, dict):
+                    graph = load_graph_from_serialized(raw["molecule_graph"])
+                else:
+                    graph = load_graph_from_serialized(raw)
 
                 # Apply scalers - feature scaler expects a list
                 graph = merged_feature_scaler([graph])
                 graph = merged_label_scaler(graph)
 
                 # Serialize and write back
                 serialized_bytes = serialize_graph(graph[0], ret=True)
-                txn.put(key, pickle.dumps(serialized_bytes, protocol=-1))
+                txn.put(key, pickle.dumps({"molecule_graph": serialized_bytes}, protocol=-1))
                 count += 1
             except Exception as e:
                 logger.warning(f"Failed to scale graph {key}: {e}")
@@ -1123,9 +1144,10 @@ def process(
                     self.feature_scaler_iterative.update([first_graph])
                     self.label_scaler_iterative.update([first_graph])
                     write_buffer.append((
-                        f"{key_str}".encode("ascii"),
-                        pickle.dumps(serialize_graph(first_graph, ret=True), protocol=-1),
+                        f"{processed_count}".encode("ascii"),
+                        pickle.dumps({"molecule_graph": serialize_graph(first_graph, ret=True)}, protocol=-1),
                     ))
+                    self._processed_source_keys.add(key_str)
                     processed_count += 1
                     first_key_idx = idx + 1
                     break
@@ -1162,9 +1184,10 @@ def process_key(key):
                 self.label_scaler_iterative.update([graph])
 
                 write_buffer.append((
-                    f"{key_str}".encode("ascii"),
-                    pickle.dumps(serialize_graph(graph, ret=True), protocol=-1),
+                    f"{processed_count}".encode("ascii"),
+                    pickle.dumps({"molecule_graph": serialize_graph(graph, ret=True)}, protocol=-1),
                 ))
+                self._processed_source_keys.add(key_str)
                 processed_count += 1
 
                 if len(write_buffer) >= self.batch_size:
@@ -1379,9 +1402,10 @@ def process(
                     self.feature_scaler_iterative.update([first_graph])
                     self.label_scaler_iterative.update([first_graph])
                     write_buffer.append((
-                        f"{key_str}".encode("ascii"),
-                        pickle.dumps(serialize_graph(first_graph, ret=True), protocol=-1),
+                        f"{processed_count}".encode("ascii"),
+                        pickle.dumps({"molecule_graph": serialize_graph(first_graph, ret=True)}, protocol=-1),
                     ))
+                    self._processed_source_keys.add(key_str)
                     processed_count += 1
                     first_key_idx = idx + 1
                     break
@@ -1418,9 +1442,10 @@ def process_key(key):
                 self.label_scaler_iterative.update([graph])
 
                 write_buffer.append((
-                    f"{key_str}".encode("ascii"),
-                    pickle.dumps(serialize_graph(graph, ret=True), protocol=-1),
+                    f"{processed_count}".encode("ascii"),
+                    pickle.dumps({"molecule_graph": serialize_graph(graph, ret=True)}, protocol=-1),
                 ))
+                self._processed_source_keys.add(key_str)
                 processed_count += 1
 
                 if len(write_buffer) >= self.batch_size:
@@ -2040,9 +2065,10 @@ def process(
                     self.feature_scaler_iterative.update([first_graph])
                     self.label_scaler_iterative.update([first_graph])
                     write_buffer.append((
-                        f"{key_str}".encode("ascii"),
-                        pickle.dumps(serialize_graph(first_graph, ret=True), protocol=-1),
+                        f"{processed_count}".encode("ascii"),
+                        pickle.dumps({"molecule_graph": serialize_graph(first_graph, ret=True)}, protocol=-1),
                     ))
+                    self._processed_source_keys.add(key_str)
                     processed_count += 1
                     first_key_idx = idx + 1
                     self.logger.info(f"Successfully initialized grapher with key {key_str}")
@@ -2087,9 +2113,10 @@ def process_key(key):
                 self.label_scaler_iterative.update([graph])
 
                 write_buffer.append((
-                    f"{key_str}".encode("ascii"),
-                    pickle.dumps(serialize_graph(graph, ret=True), protocol=-1),
+                    f"{processed_count}".encode("ascii"),
+                    pickle.dumps({"molecule_graph": serialize_graph(graph, ret=True)}, protocol=-1),
                ))
+                self._processed_source_keys.add(key_str)
                 processed_count += 1
 
                 # Batch commit
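
All four process() variants share the same buffered-write pattern: accumulate (key, value) pairs and commit once write_buffer reaches batch_size. A minimal self-contained sketch; the path, batch size, and stand-in graph payloads are illustrative:

# Minimal self-contained sketch of the buffered-write pattern; the path,
# batch size, and stand-in graph payloads are illustrative.
import lmdb
import pickle

db = lmdb.open("buffered.lmdb", subdir=False, map_size=2**30)
write_buffer, batch_size = [], 2

def flush():
    with db.begin(write=True) as txn:
        for key, value in write_buffer:
            txn.put(key, value)
    write_buffer.clear()

for processed_count, graph_bytes in enumerate([b"g0", b"g1", b"g2"]):  # stand-in graphs
    write_buffer.append((
        f"{processed_count}".encode("ascii"),  # sequential integer keys, as above
        pickle.dumps({"molecule_graph": graph_bytes}, protocol=-1),
    ))
    if len(write_buffer) >= batch_size:  # batch commit
        flush()
flush()  # final partial batch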

qtaim_gen/source/scripts/helpers/generator_to_embed.py

Lines changed: 4 additions & 2 deletions
@@ -221,14 +221,16 @@ def main():
         )
 
         # Scale using train-only fitting
-        if not args.skip_scaling:
+        skip_scaling = args.skip_scaling or config_dict.get("skip_scaling", False)
+        if not skip_scaling:
             scale_split_lmdbs(converter, split_paths)
         else:
             print("Skipping scaling step")
 
     else:
         # Original behavior: scale the single output LMDB
-        if not args.skip_scaling:
+        skip_scaling = args.skip_scaling or config_dict.get("skip_scaling", False)
+        if not skip_scaling:
             converter.scale_graph_lmdb()
         else:
             print("Skipping scaling step")

qtaim_gen/source/scripts/helpers/tracking_db.py

Lines changed: 23 additions & 3 deletions
@@ -117,8 +117,10 @@ def find_leaf_folders(path):
         "t_becke_fuzzy_density",
         "t_becke_fuzzy_spin",
         "t_bond",
-        "t_other_alie",
-        "t_other_geometry"
+        "t_other_alie",
+        "t_other_geometry",
+        "has_orca_json",
+        "val_orca",
     ]
     columns = list(set(columns))  # ensure uniqueness
 
@@ -417,7 +419,25 @@ def get_tabs(subset):
         f" {subset}: {get_tabs(subset)} {count_day} / {count_hr} / {dict_one_day_full_val.get(subset, 0)} / {dict_one_hour_full_val.get(subset, 0)}"
     )
 
-    # 7. print all counts from overall counts db
+    # 7. orca.json presence and validation per category
+    print("---" * 30)
+    print("orca.json presence and validation per category (subset):")
+    c.execute(
+        "SELECT subset, COUNT(DISTINCT job_id) FROM validation WHERE has_orca_json='True' GROUP BY subset"
+    )
+    orca_present = dict(c.fetchall())
+    c.execute(
+        "SELECT subset, COUNT(DISTINCT job_id) FROM validation WHERE val_orca='True' GROUP BY subset"
+    )
+    orca_valid = dict(c.fetchall())
+    for subset in sorted(set(list(orca_present.keys()) + list(orca_valid.keys()))):
+        present = orca_present.get(subset, 0)
+        valid = orca_valid.get(subset, 0)
+        total = counts_overall.get(subset, "N/A") if path_to_overall_counts_db else ""
+        total_str = f" / {total}" if total != "" else ""
+        print(f" {subset}: {get_tabs(subset)} has={present}{total_str} valid={valid}")
+
+    # 8. print all counts from overall counts db
     if path_to_overall_counts_db:
         print("---" * 30)
         print("Overall job counts per category (subset):")

qtaim_gen/source/utils/validation.py

Lines changed: 10 additions & 3 deletions
@@ -58,6 +58,8 @@ def get_val_breakdown_from_folder(
         "val_bond": None,
         "val_fuzzy": None,
         "val_other": None,
+        "has_orca_json": False,
+        "val_orca": None,
     }
 
     # check timings
@@ -117,9 +119,12 @@ def get_val_breakdown_from_folder(
 
     # check orca (optional)
     orca_file = os.path.join(folder, "orca.json")
-    if os.path.exists(orca_file) and os.path.getsize(orca_file) > 0:
-        tf_orca = validate_orca_dict(orca_file, n_atoms=n_atoms, logger=None)
-        info["val_orca"] = tf_orca
+    if os.path.exists(orca_file):
+        info["has_orca_json"] = True
+        if os.path.getsize(orca_file) > 0:
+            info["val_orca"] = validate_orca_dict(orca_file, n_atoms=n_atoms, logger=None)
+        else:
+            info["val_orca"] = False
 
     return info
 
@@ -743,6 +748,8 @@ def get_information_from_job_folder(folder: str, full_set: int) -> dict:
         "val_bond": None,
         "val_fuzzy": None,
         "val_other": None,
+        "has_orca_json": False,
+        "val_orca": None,
         "n_atoms": None,
         "spin": None,
         "charge": None,
