Skip to content

Commit 8aaa8f9

Browse files
authored
Merge pull request #976 from hubmapconsortium/karlburke/LargeCollectionTimeouts
Karlburke/large collection timeouts
2 parents 1cff0da + 8d491be commit 8aaa8f9

File tree

3 files changed

+102
-46
lines changed

3 files changed

+102
-46
lines changed

src/schema/schema_constants.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,13 @@ class TriggerTypeEnum(Enum):
5454
BEFORE_UPDATE = 'before_update_trigger'
5555
AFTER_CREATE = 'after_create_trigger'
5656
AFTER_UPDATE = 'after_update_trigger'
57+
58+
# Define an enumeration of accepted Neo4j relationship types.
59+
class Neo4jRelationshipEnum(Enum):
60+
ACTIVITY_INPUT = 'ACTIVITY_INPUT'
61+
ACTIVITY_OUTPUT = 'ACTIVITY_INPUT'
62+
IN_COLLECTION = 'IN_COLLECTION'
63+
IN_UPLOAD = 'IN_UPLOAD'
64+
REVISION_OF = 'REVISION_OF'
65+
USES_DATA = 'USES_DATA'
66+

src/schema/schema_neo4j_queries.py

Lines changed: 76 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1+
import neo4j
12
from neo4j.exceptions import TransactionError
2-
from schema.schema_constants import SchemaConstants
3+
from neo4j import Session as Neo4jSession
4+
from schema.schema_constants import SchemaConstants, Neo4jRelationshipEnum
35
import logging
46

57
logger = logging.getLogger(__name__)
68

79
# The filed name of the single result record
810
record_field_name = 'result'
911

10-
1112
####################################################################################################
1213
## Functions can be called by app.py, schema_manager.py, and schema_triggers.py
1314
####################################################################################################
@@ -109,7 +110,38 @@ def get_entity(neo4j_driver, uuid):
109110

110111
return result
111112

113+
"""
114+
Given a list of UUIDs, return a dict mapping uuid -> entity_node
115+
Only UUIDs present in Neo4j will be returned.
116+
117+
Parameters
118+
----------
119+
neo4j_driver : neo4j.Driver object
120+
The neo4j database connection pool
121+
uuid_list : list of str
122+
The uuids of target entities to retrieve from Neo4j
123+
124+
Returns
125+
-------
126+
dict
127+
A dictionary of entity details returned from the Cypher query, keyed by
128+
the uuid provided in uuid_list.
129+
"""
130+
def identify_existing_dataset_entities(neo4j_driver, dataset_uuid_list:list):
131+
132+
if not dataset_uuid_list:
133+
return {}
134+
135+
query = """
136+
MATCH (e:Entity)
137+
WHERE e.uuid IN $param_uuids
138+
AND e.entity_type='Dataset'
139+
RETURN e.uuid AS uuid
140+
"""
112141

142+
with neo4j_driver.session() as session:
143+
results = session.run(query, param_uuids=dataset_uuid_list)
144+
return [record["uuid"] for record in results]
113145

114146
"""
115147
Get the uuids for each entity in a list that doesn't belong to a certain entity type. Uuids are ordered by type
@@ -884,18 +916,15 @@ def link_collection_to_datasets(neo4j_driver, collection_uuid, dataset_uuid_list
884916
try:
885917
with neo4j_driver.session() as session:
886918
tx = session.begin_transaction()
887-
888919
# First delete all the old linkages between this Collection and its member Datasets
889920
_delete_collection_linkages_tx(tx=tx
890921
, uuid=collection_uuid)
891922

892-
# Create relationship from each member Dataset node to this Collection node
893-
for dataset_uuid in dataset_uuid_list:
894-
create_relationship_tx(tx=tx
895-
, source_node_uuid=dataset_uuid
896-
, direction='->'
897-
, target_node_uuid=collection_uuid
898-
, relationship='IN_COLLECTION')
923+
_create_relationships_unwind_tx(tx=tx
924+
, source_uuid_list=dataset_uuid_list
925+
, target_uuid=collection_uuid
926+
, relationship=Neo4jRelationshipEnum.IN_COLLECTION
927+
, direction='->')
899928

900929
tx.commit()
901930
except TransactionError as te:
@@ -1980,6 +2009,43 @@ def create_relationship_tx(tx, source_node_uuid, target_node_uuid, relationship,
19802009

19812010
result = tx.run(query)
19822011

2012+
"""
2013+
Create multiple relationships between a target node and each node in
2014+
a list of source nodes in neo4j
2015+
2016+
Parameters
2017+
----------
2018+
tx : neo4j.Session object
2019+
The neo4j.Session object instance
2020+
source_uuid_list : list[str]
2021+
A list of UUIDs for nodes which will have a relationship to the node with target_uuid
2022+
target_uuid : str
2023+
The UUID of target node
2024+
relationship : Neo4jRelationshipEnum
2025+
The string for the Neo4j relationship type between each source node and the target node.
2026+
direction: str
2027+
The relationship direction of each source node to the target node: outgoing `->` or incoming `<-`
2028+
Neo4j CQL CREATE command supports only directional relationships
2029+
"""
2030+
def _create_relationships_unwind_tx(tx:Neo4jSession, source_uuid_list:list, target_uuid:str
2031+
, relationship:Neo4jRelationshipEnum, direction:str)->None:
2032+
logger.info("====== enter _create_relationships_unwind_tx() ======")
2033+
incoming = direction if direction == "<-" else "-"
2034+
outgoing = direction if direction == "->" else "-"
2035+
2036+
query = (
2037+
f"MATCH (t:Collection {{uuid: $target_uuid}}) "
2038+
f"UNWIND $source_uuid_list AS src_uuid "
2039+
f"MATCH (s:Dataset {{uuid: src_uuid}}) "
2040+
f"CREATE (s){incoming}[r:{relationship.value}]{outgoing}(t) "
2041+
f"RETURN src_uuid AS linked_uuid"
2042+
)
2043+
2044+
result = tx.run( query=query
2045+
, target_uuid=target_uuid
2046+
, source_uuid_list=source_uuid_list)
2047+
logger.info("====== returning from _create_relationships_unwind_tx() ======")
2048+
19832049
"""
19842050
Execute one query to create all outgoing relationships from each node whose
19852051
identifier is in the source node list to the target Activity node in neo4j

src/schema/schema_validators.py

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,8 @@ def verify_DOI_pair(property_key, normalized_entity_type, request, existing_data
298298
f" the prefix {SchemaConstants.DOI_BASE_URL}.")
299299

300300
"""
301-
Validate every entity in a list is of entity_type accepted
301+
Validate every entity in a list is of entity_type that can be in a
302+
Collection and already exists in Neo4j.
302303
303304
Parameters
304305
----------
@@ -318,41 +319,20 @@ def collection_entities_are_existing_datasets(property_key, normalized_entity_ty
318319
# Verify each UUID specified exists in the uuid-api, exists in Neo4j, and is for a Dataset before
319320
# proceeding with creation of Collection.
320321
bad_dataset_uuids = []
321-
for dataset_uuid in new_data_dict['dataset_uuids']:
322-
try:
323-
## The following code duplicates some functionality existing in app.py, in
324-
## query_target_entity(), which also deals with caching. In the future, the
325-
## validation logic shared by this file and app.py should become a utility
326-
## module, shared by validators as well as app.py. But for now, the code
327-
## is repeated for the following.
328-
329-
# Get cached ids if exist otherwise retrieve from UUID-API. Expect an
330-
# Exception to be raised if not found.
331-
dataset_uuid_entity = schema_manager.get_hubmap_ids(id=dataset_uuid)
332-
333-
# If the uuid exists per the uuid-api, make sure it also exists as a Neo4j entity.
334-
uuid = dataset_uuid_entity['uuid']
335-
entity_dict = schema_neo4j_queries.get_entity(schema_manager.get_neo4j_driver_instance(), dataset_uuid)
336-
337-
# If dataset_uuid is not found in Neo4j or is not for a Dataset, fail the validation.
338-
if not entity_dict:
339-
logger.info(f"Request for {dataset_uuid} inclusion in Collection,"
340-
f" but not found in Neo4j.")
341-
bad_dataset_uuids.append(dataset_uuid)
342-
elif entity_dict['entity_type'] != 'Dataset':
343-
logger.info(f"Request for {dataset_uuid} inclusion in Collection,"
344-
f" but entity_type={entity_dict['entity_type']}, not Dataset.")
345-
bad_dataset_uuids.append(dataset_uuid)
346-
except Exception as nfe:
347-
# If the dataset_uuid is not found, fail the validation.
348-
logger.info(f"Request for {dataset_uuid} inclusion in Collection"
349-
f" failed uuid-api retrieval.")
350-
bad_dataset_uuids.append(dataset_uuid)
351-
# If any uuids in the request dataset_uuids are not for an existing Dataset entity which
352-
# exists in uuid-api and Neo4j, raise an Exception so the validation fails and the
353-
# operation can be rejected.
354-
if bad_dataset_uuids:
355-
raise ValueError(f"Unable to find Datasets for {bad_dataset_uuids}.")
322+
dataset_uuid_list = new_data_dict['dataset_uuids']
323+
if not dataset_uuid_list:
324+
return
325+
326+
existing_datasets_list = schema_neo4j_queries.identify_existing_dataset_entities( neo4j_driver=schema_manager.get_neo4j_driver_instance()
327+
, dataset_uuid_list=dataset_uuid_list)
328+
329+
# If any UUIDs which were passed in do not exist in Neo4j or are not Datasets, identify them
330+
missing_uuid_set = set(dataset_uuid_list) - set(existing_datasets_list)
331+
if missing_uuid_set:
332+
logger.info(f"Only existing Datasets may be included in a Collection:"
333+
f" {sorted(missing_uuid_set)}")
334+
raise ValueError( f"Only existing Datasets may be included in a Collection, not these: "
335+
f" {sorted(missing_uuid_set)}")
356336

357337
"""
358338
Validate the provided value of Dataset.status on update via PUT

0 commit comments

Comments
 (0)