@@ -20,6 +20,14 @@
 
 DCAT = Namespace("http://www.w3.org/ns/dcat#")
 
+# Rebuild models to resolve forward references now that KnowledgeGraph is imported
+DeleteAtom.model_rebuild()
+SubstituteAtom.model_rebuild()
+AddAtom.model_rebuild()
+Rotate.model_rebuild()
+Translate.model_rebuild()
+Shear.model_rebuild()
+
 # Mapping of operation method names to their classes
 OPERATION_MAP = {
     "DeleteAtom": DeleteAtom,
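The model_rebuild() calls added above follow pydantic v2's forward-reference protocol: a model whose annotation names a class that is only defined or imported later stays "not fully defined" until it is rebuilt. A minimal sketch of the pattern, assuming pydantic v2 and using an illustrative field name (the real models' fields are not shown in this diff):

from pydantic import BaseModel


class DeleteAtom(BaseModel):
    # "KnowledgeGraph" is a forward reference: the class below does not
    # exist yet when this model is first built. (Hypothetical field name.)
    kg: "KnowledgeGraph | None" = None


class KnowledgeGraph(BaseModel):
    triples: list = []


# Instantiating DeleteAtom before this call raises PydanticUserError;
# model_rebuild() re-evaluates the annotations now that KnowledgeGraph exists.
DeleteAtom.model_rebuild()

op = DeleteAtom(kg=KnowledgeGraph())

Rebuilding once at module import time, as the diff does, keeps this cost out of any per-sample hot path.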
@@ -309,14 +317,9 @@ def parse_samples(self, sample_data_list: List[Dict[str, Any]]) -> Dict[str, str]:
         import time
 
         for sample_data in sample_data_list:
-            sample_start = time.time()
             original_id = sample_data.get("id", "unknown")
-            if self.debug:
-                print(f"\n{'=' * 60}")
-                print(f"Processing sample: {original_id}")
 
             # Normalise simulation_cell.vector if present
-            prep_start = time.time()
             simcell = sample_data.get("simulation_cell")
             if isinstance(simcell, dict):
                 if "vector" in simcell:
@@ -363,17 +366,9 @@ def parse_samples(self, sample_data_list: List[Dict[str, Any]]) -> Dict[str, str]:
                 sample_data[field]
             )
 
-            prep_time = time.time() - prep_start
-            if self.debug:
-                print(f" Data preparation: {prep_time:.3f}s")
-
             # Create sample object
-            model_start = time.time()
             sample = AtomicScaleSample(**sample_data)
             original_id = sample.id
-            model_time = time.time() - model_start
-            if self.debug:
-                print(f" Model creation: {model_time:.3f}s")
 
             # Check if we should skip hashing for large systems
             n_atoms = (
@@ -383,51 +378,28 @@ def parse_samples(self, sample_data_list: List[Dict[str, Any]]) -> Dict[str, str]:
 
             if skip_hash:
                 # Skip hashing for large systems - treat as unique
-                if self.debug:
-                    print(
-                        f" Skipping hash for large system ({n_atoms} atoms > {self.hash_threshold} threshold)"
-                    )
-                graph_start = time.time()
                 sample.id = None  # Let to_graph generate a new UUID
                 sample.to_graph(self.kg)
-                graph_time = time.time() - graph_start
-                if self.debug:
-                    print(f" Graph addition: {graph_time:.3f}s")
                 self.sample_map[original_id] = sample.id
+                if self.debug:
+                    print(f"Sample added (no hash check): {sample.id}")
             else:
                 # Use hash-based deduplication for smaller systems
-                hash_start = time.time()
                 sample.id = None
                 sample_hash = sample._compute_hash(precision=self.precision)
-                hash_time = time.time() - hash_start
-                if self.debug:
-                    print(f" Hash computation: {hash_time:.3f}s ({n_atoms} atoms)")
-                    print(f" Hash: {sample_hash}")
 
                 # Check if this hash already exists in the KG
-                lookup_start = time.time()
                 existing_uri = self._find_sample_by_hash(sample_hash)
-                lookup_time = time.time() - lookup_start
-                if self.debug:
-                    print(f" Hash lookup: {lookup_time:.3f}s")
-                    print(f" Existing uri: {existing_uri}")
 
                 if existing_uri:
                     self.sample_map[original_id] = existing_uri
                     if self.debug:
-                        print(f" Using existing sample (duplicate found)")
+                        print(f"Sample exists: {existing_uri}")
                 else:
-                    graph_start = time.time()
                     sample.to_graph(self.kg)
-                    graph_time = time.time() - graph_start
-                    if self.debug:
-                        print(f" Graph addition: {graph_time:.3f}s")
                     self.sample_map[original_id] = sample.id
-
-            total_time = time.time() - sample_start
-            if self.debug:
-                print(f" TOTAL sample time: {total_time:.3f}s")
-                print(f"{'=' * 60}")
+                    if self.debug:
+                        print(f"Sample added: {sample.id}")
 
         return self.sample_map
 
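parse_samples keeps two paths: systems above hash_threshold atoms are written straight to the graph, while smaller ones are hashed at self.precision decimals and looked up first, so duplicates map to the existing URI. A self-contained sketch of that deduplication idea (compute_sample_hash and the seen dict are illustrative stand-ins; the real _compute_hash and _find_sample_by_hash live elsewhere in the module):

import hashlib
import json


def compute_sample_hash(positions, species, precision=6):
    # Round coordinates so samples identical up to `precision` decimals
    # collide on the same digest (stand-in for _compute_hash).
    canonical = {
        "species": list(species),
        "positions": [[round(x, precision) for x in p] for p in positions],
    }
    payload = json.dumps(canonical, sort_keys=True).encode("utf-8")
    return hashlib.sha256(payload).hexdigest()


seen = {}  # digest -> sample URI (stand-in for the KG hash lookup)


def add_or_reuse(sample_id, positions, species, precision=6):
    digest = compute_sample_hash(positions, species, precision)
    if digest in seen:
        return seen[digest]  # duplicate: reuse the existing sample URI
    seen[digest] = sample_id
    return sample_id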
@@ -491,10 +463,7 @@ def parse_workflows(self, workflow_data_list: List[Dict[str, Any]]) -> List[str]:
             workflow_uris.append(sim_uri)
 
             if self.debug:
-                print(
-                    f"Added workflow {i + 1}: connecting samples "
-                    f"{workflow_data.get('input_sample', [])} to {workflow_data.get('output_sample', [])}"
-                )
+                print(f"Workflow added: {sim_uri}")
 
         return workflow_uris
 
@@ -577,11 +546,7 @@ def parse_operations(self, operation_data_list: List[Dict[str, Any]]) -> List[str]:
             operation_uris.append(operation.id)
 
             if self.debug:
-                print(
-                    f"Added operation {i + 1} ({method}): "
-                    f"{operation_data.get('input_sample')} -> "
-                    f"{operation_data.get('output_sample')}"
-                )
+                print(f"Operation added ({method}): {operation.id}")
 
         return operation_uris
 
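The method string printed here is presumably resolved through the module-level OPERATION_MAP shown at the top of the diff; the body of parse_operations is not included in this excerpt. A minimal sketch of that name-to-class dispatch, with toy classes standing in for the real operation models:

class DeleteAtom:
    def __init__(self, **kwargs):
        self.params = kwargs


class Rotate:
    def __init__(self, **kwargs):
        self.params = kwargs


# Same shape as the OPERATION_MAP defined at module level
OPERATION_MAP = {"DeleteAtom": DeleteAtom, "Rotate": Rotate}


def build_operation(operation_data):
    method = operation_data["method"]
    try:
        op_cls = OPERATION_MAP[method]
    except KeyError:
        raise ValueError(f"Unknown operation method: {method}") from None
    # Forward the remaining keys (e.g. input_sample/output_sample) as kwargs
    return op_cls(**{k: v for k, v in operation_data.items() if k != "method"})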
@@ -617,21 +582,13 @@ def parse(self, data: Union[str, Path, Dict[str, Any]]) -> Dict[str, Any]:
         if isinstance(data, (str, Path)):
             filepath = Path(data)
 
-            if self.debug:
-                print(f"\nReading file: {filepath}")
-
-            read_start = time.time()
             with open(filepath, "r") as f:
                 if filepath.suffix in [".yaml", ".yml"]:
                     data = yaml.safe_load(f)
                 elif filepath.suffix == ".json":
                     data = json.load(f)
                 else:
                     raise ValueError(f"Unsupported file format: {filepath.suffix}")
-            read_time = time.time() - read_start
-
-            if self.debug:
-                print(f"File reading time: {read_time:.3f}s")
         elif not isinstance(data, dict):
             raise TypeError(
                 f"Unsupported data type: {type(data)}. Expected str, Path, or dict."
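The surviving file-handling block in parse is self-contained enough to lift out. A standalone equivalent, assuming PyYAML is available (load_input is a hypothetical name for illustration):

import json
from pathlib import Path

import yaml  # PyYAML


def load_input(data):
    """Accept a YAML/JSON path or an already-parsed dict, return a dict."""
    if isinstance(data, (str, Path)):
        filepath = Path(data)
        with open(filepath, "r") as f:
            if filepath.suffix in [".yaml", ".yml"]:
                return yaml.safe_load(f)
            if filepath.suffix == ".json":
                return json.load(f)
            raise ValueError(f"Unsupported file format: {filepath.suffix}")
    if isinstance(data, dict):
        return data
    raise TypeError(f"Unsupported data type: {type(data)}. Expected str, Path, or dict.")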