Commit d2335f9

further updates to parser

1 parent 970d1fd commit d2335f9

File tree

3 files changed: +44 -77 lines changed


atomrdf/datamodels/structure.py

Lines changed: 5 additions & 7 deletions

@@ -689,9 +689,7 @@ def from_graph_calculated_properties(cls, graph, sample_id):
 
     def to_graph(self, graph, force=False):
         # if force - creates a new ID and saves the structure again
-        print(self.id)
         if not force and self.id is not None:
-            print("Sample already in graph, skipping...")
             return self.id
 
         # the rest of the function is only if id isnt there or force is true
@@ -729,14 +727,14 @@ def to_graph(self, graph, force=False):
         ]
 
         for defect in defect_fields:
-            obj = getattr(self, defect)
-            if isinstance(obj, BaseModel) and obj.model_fields_set:
-                if hasattr(obj, "to_graph"):
-                    obj.to_graph(graph, sample)
+            obj = getattr(self, defect, None)
+            if obj is not None:
+                if isinstance(obj, BaseModel) and obj.model_fields_set:
+                    if hasattr(obj, "to_graph"):
+                        obj.to_graph(graph, sample)
 
         # Add content hash to the graph for deduplication (skip validation for external vocab)
         content_hash = self._compute_hash()
-        print("Content hash:", content_hash)
         graph.add(
             (sample, DCAT.checksum, Literal(content_hash, datatype=XSD.string)),
             validate=False,
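
Note: the DCAT.checksum triple above is what drives the parser's deduplication. A minimal sketch of the content-hashing idea; the real _compute_hash is not shown in this commit, so the payload fields and rounding scheme below are assumptions for illustration:

import hashlib
import json

def compute_content_hash(positions, species, precision=6):
    # Round coordinates so numerically identical samples produce the same digest.
    payload = json.dumps(
        {
            "positions": [[round(x, precision) for x in p] for p in positions],
            "species": list(species),
        },
        sort_keys=True,
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()

# Two samples with the same rounded content collide, so the stored URI can be reused.
print(compute_content_hash([[0.0, 0.0, 0.0], [0.5, 0.5, 0.5]], ["Fe", "Fe"]))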

atomrdf/datamodels/workflow/operations.py

Lines changed: 24 additions & 12 deletions

@@ -24,8 +24,10 @@ def to_graph(self, graph):
         activity_id = f"deleteatom:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.DeleteAtom)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
     @classmethod
     def from_graph(cls, graph, activity_id):
@@ -44,8 +46,10 @@ def to_graph(self, graph):
         activity_id = f"substituteatom:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.SubstituteAtom)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
     @classmethod
     def from_graph(cls, graph, activity_id):
@@ -64,8 +68,10 @@ def to_graph(self, graph):
         activity_id = f"addatom:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.AddAtom)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
     @classmethod
     def from_graph(cls, graph, activity_id):
@@ -86,8 +92,10 @@ def to_graph(self, graph):
         activity_id = f"rotate:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.Rotation)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
         rot_vector_01 = graph.create_node(
             f"{activity_id}_RotationVector_1", CMSO.Vector
@@ -196,8 +204,10 @@ def to_graph(self, graph):
         activity_id = f"translate:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.Translation)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
         translation_vector = graph.create_node(
             f"{activity_id}_TranslationVector", CMSO.Vector
@@ -256,8 +266,10 @@ def to_graph(self, graph):
         activity_id = f"shear:{str(uuid.uuid4())}"
         self.id = activity_id
         activity = graph.create_node(activity_id, ASMO.Shear)
-        graph.add((self.output_sample, PROV.wasDerivedFrom, self.input_sample))
-        graph.add((self.output_sample, PROV.wasGeneratedBy, activity))
+        graph.add(
+            (URIRef(self.output_sample), PROV.wasDerivedFrom, URIRef(self.input_sample))
+        )
+        graph.add((URIRef(self.output_sample), PROV.wasGeneratedBy, activity))
 
         shear_vector = graph.create_node(f"{activity_id}_ShearVector", CMSO.Vector)
         graph.add((activity, CMSO.hasVector, shear_vector))
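
Note: the URIRef wrapping is the substance of this file's change. The sample IDs are stored on the models as plain strings, and rdflib only accepts rdflib terms (URIRef, BNode, Literal) as triple components. A standalone sketch with plain rdflib, assuming atomrdf's KnowledgeGraph.add ultimately forwards to rdflib's Graph.add:

from rdflib import Graph, Namespace, URIRef

PROV = Namespace("http://www.w3.org/ns/prov#")

g = Graph()
input_sample = "sample:input-1"    # held on the model as a plain string
output_sample = "sample:output-1"

# URIRef turns each string into a proper IRI node; rdflib's Graph.add rejects
# bare Python strings as triple components.
g.add((URIRef(output_sample), PROV.wasDerivedFrom, URIRef(input_sample)))
print(g.serialize(format="turtle"))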

atomrdf/io/workflow_parser.py

Lines changed: 15 additions & 58 deletions

@@ -20,6 +20,14 @@
 
 DCAT = Namespace("http://www.w3.org/ns/dcat#")
 
+# Rebuild models to resolve forward references now that KnowledgeGraph is imported
+DeleteAtom.model_rebuild()
+SubstituteAtom.model_rebuild()
+AddAtom.model_rebuild()
+Rotate.model_rebuild()
+Translate.model_rebuild()
+Shear.model_rebuild()
+
 # Mapping of operation method names to their classes
 OPERATION_MAP = {
     "DeleteAtom": DeleteAtom,
@@ -309,14 +317,9 @@ def parse_samples(self, sample_data_list: List[Dict[str, Any]]) -> Dict[str, str]:
         import time
 
         for sample_data in sample_data_list:
-            sample_start = time.time()
             original_id = sample_data.get("id", "unknown")
-            if self.debug:
-                print(f"\n{'='*60}")
-                print(f"Processing sample: {original_id}")
 
             # Normalise simulation_cell.vector if present
-            prep_start = time.time()
             simcell = sample_data.get("simulation_cell")
             if isinstance(simcell, dict):
                 if "vector" in simcell:
@@ -363,17 +366,9 @@ def parse_samples(self, sample_data_list: List[Dict[str, Any]]) -> Dict[str, str]:
                     sample_data[field]
                 )
 
-            prep_time = time.time() - prep_start
-            if self.debug:
-                print(f" Data preparation: {prep_time:.3f}s")
-
             # Create sample object
-            model_start = time.time()
             sample = AtomicScaleSample(**sample_data)
             original_id = sample.id
-            model_time = time.time() - model_start
-            if self.debug:
-                print(f" Model creation: {model_time:.3f}s")
 
             # Check if we should skip hashing for large systems
             n_atoms = (
@@ -383,51 +378,28 @@ def parse_samples(self, sample_data_list: List[Dict[str, Any]]) -> Dict[str, str]:
 
             if skip_hash:
                 # Skip hashing for large systems - treat as unique
-                if self.debug:
-                    print(
-                        f" Skipping hash for large system ({n_atoms} atoms > {self.hash_threshold} threshold)"
-                    )
-                graph_start = time.time()
                 sample.id = None  # Let to_graph generate a new UUID
                 sample.to_graph(self.kg)
-                graph_time = time.time() - graph_start
-                if self.debug:
-                    print(f" Graph addition: {graph_time:.3f}s")
                 self.sample_map[original_id] = sample.id
+                if self.debug:
+                    print(f"Sample added (no hash check): {sample.id}")
             else:
                 # Use hash-based deduplication for smaller systems
-                hash_start = time.time()
                 sample.id = None
                 sample_hash = sample._compute_hash(precision=self.precision)
-                hash_time = time.time() - hash_start
-                if self.debug:
-                    print(f" Hash computation: {hash_time:.3f}s ({n_atoms} atoms)")
-                    print(f" Hash: {sample_hash}")
 
                 # Check if this hash already exists in the KG
-                lookup_start = time.time()
                 existing_uri = self._find_sample_by_hash(sample_hash)
-                lookup_time = time.time() - lookup_start
-                if self.debug:
-                    print(f" Hash lookup: {lookup_time:.3f}s")
-                    print(f" Existing uri: {existing_uri}")
 
                 if existing_uri:
                     self.sample_map[original_id] = existing_uri
                     if self.debug:
-                        print(f" Using existing sample (duplicate found)")
+                        print(f"Sample exists: {existing_uri}")
                 else:
-                    graph_start = time.time()
                     sample.to_graph(self.kg)
-                    graph_time = time.time() - graph_start
-                    if self.debug:
-                        print(f" Graph addition: {graph_time:.3f}s")
                     self.sample_map[original_id] = sample.id
-
-            total_time = time.time() - sample_start
-            if self.debug:
-                print(f" TOTAL sample time: {total_time:.3f}s")
-                print(f"{'='*60}")
+                    if self.debug:
+                        print(f"Sample added: {sample.id}")
 
         return self.sample_map
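
Note: a condensed restatement of the branch above, to make the control flow easy to scan (register_sample is a hypothetical helper; attribute names follow the diff, and the skip condition is inferred from the deleted debug message):

def register_sample(parser, sample, original_id, n_atoms):
    sample.id = None                      # force to_graph to mint a fresh UUID
    if n_atoms > parser.hash_threshold:   # large system: skip hashing entirely
        sample.to_graph(parser.kg)
    else:
        sample_hash = sample._compute_hash(precision=parser.precision)
        existing_uri = parser._find_sample_by_hash(sample_hash)
        if existing_uri:                  # duplicate: reuse the stored URI
            parser.sample_map[original_id] = existing_uri
            return
        sample.to_graph(parser.kg)
    parser.sample_map[original_id] = sample.id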

@@ -491,10 +463,7 @@ def parse_workflows(self, workflow_data_list: List[Dict[str, Any]]) -> List[str]:
             workflow_uris.append(sim_uri)
 
             if self.debug:
-                print(
-                    f"Added workflow {i+1}: connecting samples "
-                    f"{workflow_data.get('input_sample', [])} to {workflow_data.get('output_sample', [])}"
-                )
+                print(f"Workflow added: {sim_uri}")
 
         return workflow_uris
 
@@ -577,11 +546,7 @@ def parse_operations(self, operation_data_list: List[Dict[str, Any]]) -> List[str]:
             operation_uris.append(operation.id)
 
             if self.debug:
-                print(
-                    f"Added operation {i+1} ({method}): "
-                    f"{operation_data.get('input_sample')} -> "
-                    f"{operation_data.get('output_sample')}"
-                )
+                print(f"Operation added ({method}): {operation.id}")
 
         return operation_uris
 
@@ -617,21 +582,13 @@ def parse(self, data: Union[str, Path, Dict[str, Any]]) -> Dict[str, Any]:
         if isinstance(data, (str, Path)):
             filepath = Path(data)
 
-            if self.debug:
-                print(f"\nReading file: {filepath}")
-
-            read_start = time.time()
             with open(filepath, "r") as f:
                 if filepath.suffix in [".yaml", ".yml"]:
                     data = yaml.safe_load(f)
                 elif filepath.suffix == ".json":
                     data = json.load(f)
                 else:
                     raise ValueError(f"Unsupported file format: {filepath.suffix}")
-            read_time = time.time() - read_start
-
-            if self.debug:
-                print(f"File reading time: {read_time:.3f}s")
         elif not isinstance(data, dict):
             raise TypeError(
                 f"Unsupported data type: {type(data)}. Expected str, Path, or dict."
