Commit d2335f9

further updates to parser

1 parent 970d1fd commit d2335f9

File tree

3 files changed: +44 -77 lines changed


atomrdf/datamodels/structure.py

Lines changed: 5 additions & 7 deletions

@@ -689,9 +689,7 @@ def from_graph_calculated_properties(cls, graph, sample_id):
 
     def to_graph(self, graph, force=False):
         # if force - creates a new ID and saves the structure again
-        print(self.id)
         if not force and self.id is not None:
-            print("Sample already in graph, skipping...")
             return self.id
 
         # the rest of the function is only if id isnt there or force is true
@@ -729,14 +727,14 @@ def to_graph(self, graph, force=False):
         ]
 
         for defect in defect_fields:
-            obj = getattr(self, defect)
-            if isinstance(obj, BaseModel) and obj.model_fields_set:
-                if hasattr(obj, "to_graph"):
-                    obj.to_graph(graph, sample)
+            obj = getattr(self, defect, None)
+            if obj is not None:
+                if isinstance(obj, BaseModel) and obj.model_fields_set:
+                    if hasattr(obj, "to_graph"):
+                        obj.to_graph(graph, sample)
 
         # Add content hash to the graph for deduplication (skip validation for external vocab)
         content_hash = self._compute_hash()
-        print("Content hash:", content_hash)
         graph.add(
             (sample, DCAT.checksum, Literal(content_hash, datatype=XSD.string)),
             validate=False,
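
Note: the DCAT.checksum triple above is what drives the parser's deduplication. A minimal sketch of the content-hashing idea; the real _compute_hash is not shown in this commit, so the payload fields and rounding scheme below are assumptions for illustration:

import hashlib
import json

def compute_content_hash(positions, species, precision=6):
    # Round coordinates so numerically identical samples produce the same digest.
    payload = json.dumps(
        {
            "positions": [[round(x, precision) for x in p] for p in positions],
            "species": list(species),
        },
        sort_keys=True,
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()

# Two samples with the same rounded content collide, so the stored URI can be reused.
print(compute_content_hash([[0.0, 0.0, 0.0], [0.5, 0.5, 0.5]], ["Fe", "Fe"]))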

atomrdf/datamodels/workflow/operations.py

Lines changed: 24 additions & 12 deletions

@@ -24,8 +24,10 @@ def to_graph(self, graph):
         activity_id = f"deleteatom:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.DeleteAtom)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
     @classmethod
     def from_graph(cls, graph, activity_id):
@@ -44,8 +46,10 @@ def to_graph(self, graph):
         activity_id = f"substituteatom:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.SubstituteAtom)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
     @classmethod
     def from_graph(cls, graph, activity_id):
@@ -64,8 +68,10 @@ def to_graph(self, graph):
         activity_id = f"addatom:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.AddAtom)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
     @classmethod
     def from_graph(cls, graph, activity_id):
@@ -86,8 +92,10 @@ def to_graph(self, graph):
         activity_id = f"rotate:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.Rotation)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
         rot_vector_01 = graph.create_node(
             f"{activity_id}_RotationVector_1", CMSO.Vector
@@ -196,8 +204,10 @@ def to_graph(self, graph):
         activity_id = f"translate:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.Translation)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
         translation_vector = graph.create_node(
             f"{activity_id}_TranslationVector", CMSO.Vector
@@ -256,8 +266,10 @@ def to_graph(self, graph):
         activity_id = f"shear:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.Shear)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
         shear_vector = graph.create_node(f"{activity_id}_ShearVector", CMSO.Vector)
         graph.add((activity, CMSO.hasVector, shear_vector))
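
Note: the URIRef wrapping is the substance of this file's change. The sample IDs are stored on the models as plain strings, and rdflib only accepts rdflib terms (URIRef, BNode, Literal) as triple components. A standalone sketch with plain rdflib, assuming atomrdf's KnowledgeGraph.add ultimately forwards to rdflib's Graph.add:

from rdflib import Graph, Namespace, URIRef

PROV = Namespace("http://www.w3.org/ns/prov#")

g = Graph()
input_sample = "sample:input-1"    # held on the model as a plain string
output_sample = "sample:output-1"

# URIRef turns each string into a proper IRI node; rdflib's Graph.add rejects
# bare Python strings as triple components.
g.add((URIRef(output_sample), PROV.wasDerivedFrom, URIRef(input_sample)))
print(g.serialize(format="turtle"))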

atomrdf/io/workflow_parser.py

Lines changed: 15 additions & 58 deletions

@@ -20,6 +20,14 @@
 
 DCAT = Namespace("http://www.w3.org/ns/dcat#")
 
+# Rebuild models to resolve forward references now that KnowledgeGraph is imported
+DeleteAtom.model_rebuild()
+SubstituteAtom.model_rebuild()
+AddAtom.model_rebuild()
+Rotate.model_rebuild()
+Translate.model_rebuild()
+Shear.model_rebuild()
+
 # Mapping of operation method names to their classes
 OPERATION_MAP = {
     "DeleteAtom": DeleteAtom,
@@ -309,14 +317,9 @@ def parse_samples(self, sample_data_list: List[Dict[str, Any]]) -> Dict[str, str]:
         import time
 
         for sample_data in sample_data_list:
-            sample_start = time.time()
             original_id = sample_data.get("id", "unknown")
-            if self.debug:
-                print(f"\n{'='*60}")
-                print(f"Processing sample: {original_id}")
 
             # Normalise simulation_cell.vector if present
-            prep_start = time.time()
             simcell = sample_data.get("simulation_cell")
             if isinstance(simcell, dict):
                 if "vector" in simcell:
@@ -363,17 +366,9 @@ def parse_samples(self, sample_data_list: List[Dict[str, Any]]) -> Dict[str, str]:
                     sample_data[field]
                 )
 
-            prep_time = time.time() - prep_start
-            if self.debug:
-                print(f" Data preparation: {prep_time:.3f}s")
-
             # Create sample object
-            model_start = time.time()
             sample = AtomicScaleSample(**sample_data)
             original_id = sample.id
-            model_time = time.time() - model_start
-            if self.debug:
-                print(f" Model creation: {model_time:.3f}s")
 
             # Check if we should skip hashing for large systems
             n_atoms = (
@@ -383,51 +378,28 @@ def parse_samples(self, sample_data_list: List[Dict[str, Any]]) -> Dict[str, str]:
 
             if skip_hash:
                 # Skip hashing for large systems - treat as unique
-                if self.debug:
-                    print(
-                        f" Skipping hash for large system ({n_atoms} atoms > {self.hash_threshold} threshold)"
-                    )
-                graph_start = time.time()
                 sample.id = None  # Let to_graph generate a new UUID
                 sample.to_graph(self.kg)
-                graph_time = time.time() - graph_start
-                if self.debug:
-                    print(f" Graph addition: {graph_time:.3f}s")
                 self.sample_map[original_id] = sample.id
+                if self.debug:
+                    print(f"Sample added (no hash check): {sample.id}")
             else:
                 # Use hash-based deduplication for smaller systems
-                hash_start = time.time()
                 sample.id = None
                 sample_hash = sample._compute_hash(precision=self.precision)
-                hash_time = time.time() - hash_start
-                if self.debug:
-                    print(f" Hash computation: {hash_time:.3f}s ({n_atoms} atoms)")
-                    print(f" Hash: {sample_hash}")
 
                 # Check if this hash already exists in the KG
-                lookup_start = time.time()
                 existing_uri = self._find_sample_by_hash(sample_hash)
-                lookup_time = time.time() - lookup_start
-                if self.debug:
-                    print(f" Hash lookup: {lookup_time:.3f}s")
-                    print(f" Existing uri: {existing_uri}")
 
                 if existing_uri:
                     self.sample_map[original_id] = existing_uri
                     if self.debug:
-                        print(f" Using existing sample (duplicate found)")
+                        print(f"Sample exists: {existing_uri}")
                 else:
-                    graph_start = time.time()
                     sample.to_graph(self.kg)
-                    graph_time = time.time() - graph_start
-                    if self.debug:
-                        print(f" Graph addition: {graph_time:.3f}s")
                     self.sample_map[original_id] = sample.id
-
-            total_time = time.time() - sample_start
-            if self.debug:
-                print(f" TOTAL sample time: {total_time:.3f}s")
-                print(f"{'='*60}")
+                    if self.debug:
+                        print(f"Sample added: {sample.id}")
 
         return self.sample_map
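
Note: a condensed restatement of the branch above, to make the control flow easy to scan (register_sample is a hypothetical helper; attribute names follow the diff, and the skip condition is inferred from the deleted debug message):

def register_sample(parser, sample, original_id, n_atoms):
    sample.id = None                      # force to_graph to mint a fresh UUID
    if n_atoms > parser.hash_threshold:   # large system: skip hashing entirely
        sample.to_graph(parser.kg)
    else:
        sample_hash = sample._compute_hash(precision=parser.precision)
        existing_uri = parser._find_sample_by_hash(sample_hash)
        if existing_uri:                  # duplicate: reuse the stored URI
            parser.sample_map[original_id] = existing_uri
            return
        sample.to_graph(parser.kg)
    parser.sample_map[original_id] = sample.id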

@@ -491,10 +463,7 @@ def parse_workflows(self, workflow_data_list: List[Dict[str, Any]]) -> List[str]:
             workflow_uris.append(sim_uri)
 
             if self.debug:
-                print(
-                    f"Added workflow {i+1}: connecting samples "
-                    f"{workflow_data.get('input_sample', [])} to {workflow_data.get('output_sample', [])}"
-                )
+                print(f"Workflow added: {sim_uri}")
 
         return workflow_uris
 
@@ -577,11 +546,7 @@ def parse_operations(self, operation_data_list: List[Dict[str, Any]]) -> List[str]:
             operation_uris.append(operation.id)
 
             if self.debug:
-                print(
-                    f"Added operation {i+1} ({method}): "
-                    f"{operation_data.get('input_sample')} -> "
-                    f"{operation_data.get('output_sample')}"
-                )
+                print(f"Operation added ({method}): {operation.id}")
 
         return operation_uris
 
@@ -617,21 +582,13 @@ def parse(self, data: Union[str, Path, Dict[str, Any]]) -> Dict[str, Any]:
         if isinstance(data, (str, Path)):
             filepath = Path(data)
 
-            if self.debug:
-                print(f"\nReading file: {filepath}")
-
-            read_start = time.time()
             with open(filepath, "r") as f:
                 if filepath.suffix in [".yaml", ".yml"]:
                     data = yaml.safe_load(f)
                 elif filepath.suffix == ".json":
                     data = json.load(f)
                 else:
                     raise ValueError(f"Unsupported file format: {filepath.suffix}")
-            read_time = time.time() - read_start
-
-            if self.debug:
-                print(f"File reading time: {read_time:.3f}s")
         elif not isinstance(data, dict):
             raise TypeError(
                 f"Unsupported data type: {type(data)}. Expected str, Path, or dict."
