 from collections import OrderedDict
 import json
 import logging
+import networkx as nx
 from openpyxl.styles import Font, Alignment, PatternFill
 from openpyxl import load_workbook
 from openpyxl.utils.dataframe import dataframe_to_rows
 from typing import Dict, List, Optional, Tuple, Union, BinaryIO, Literal
 from flask import send_from_directory
 
-from schematic.schemas.generator import SchemaGenerator
+from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
+from schematic.schemas.data_model_parser import DataModelParser
+from schematic.schemas.data_model_json_schema import DataModelJSONSchema
+
 from schematic.utils.google_api_utils import (
     execute_google_api_requests,
     build_service_account_creds,
 class ManifestGenerator(object):
     def __init__(
         self,
-        path_to_json_ld: str,  # JSON-LD file to be used for generating the manifest
+        path_to_data_model: str,  # JSON-LD file to be used for generating the manifest
+        graph: nx.MultiDiGraph,  # At this point, the graph is fully formed.
         alphabetize_valid_values: str = 'ascending',
         title: str = None,  # manifest sheet title
         root: str = None,
@@ -54,6 +59,12 @@ def __init__(
         # google service credentials object
         self.creds = services_creds["creds"]
 
+        # Path to jsonld
+        self.model_path = path_to_data_model
+
+        # Graph
+        self.graph = graph
+
         # schema root
         if root:
             self.root = root
@@ -79,14 +90,14 @@ def __init__(
                 "when there is no manifest file for the dataset in question."
             )
 
-        # SchemaGenerator() object
-        self.sg = SchemaGenerator(path_to_json_ld)
+        # Instantiate Data Model Explorer object
+        self.dmge = DataModelGraphExplorer(self.graph)
 
         # additional metadata to add to manifest
         self.additional_metadata = additional_metadata
 
         # Check if the class is in the schema
-        root_in_schema = self.sg.se.is_class_in_schema(self.root)
+        root_in_schema = self.dmge.is_class_in_schema(self.root)
 
         # If the class could not be found, give a notification
         if not root_in_schema:
@@ -95,8 +106,7 @@ def __init__(
             raise LookupError(exception_message)
 
         # Determine whether current data type is file-based
-        self.is_file_based = "Filename" in self.sg.get_node_dependencies(self.root)
-
+        self.is_file_based = "Filename" in self.dmge.get_node_dependencies(self.root)
 
     def _attribute_to_letter(self, attribute, manifest_fields):
         """Map attribute to column letter in a google sheet"""
@@ -364,13 +374,12 @@ def _get_json_schema(self, json_schema_filepath: str) -> Dict:
             json_schema_filepath(str): path to json schema file
         Returns:
             Dictionary, containing portions of the json schema
+        TODO: Do we even allow people to provide a json_schema_filepath anymore?
         """
         if not json_schema_filepath:
-            # if no json schema is provided; there must be
-            # schema explorer defined for schema.org schema
-            # o.w. this will throw an error
-            # TODO: catch error
-            json_schema = self.sg.get_json_schema_requirements(self.root, self.title)
+            # TODO: Catch error if no JSONLD or JSON path provided.
+            data_model_js = DataModelJSONSchema(jsonld_path=self.model_path, graph=self.graph)
+            json_schema = data_model_js.get_json_validation_schema(source_node=self.root, schema_name=self.title)
         else:
             with open(json_schema_filepath) as jsonfile:
                 json_schema = json.load(jsonfile)
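The same validation schema can presumably also be derived outside the generator once a graph exists; a minimal, self-contained sketch using a placeholder model path and component name, built on the calls shown in this hunk and in `create_manifests` below:

    # Sketch: derive the JSON validation schema for one component directly from the graph.
    from schematic.schemas.data_model_parser import DataModelParser
    from schematic.schemas.data_model_graph import DataModelGraph
    from schematic.schemas.data_model_json_schema import DataModelJSONSchema

    path_to_data_model = "example.model.jsonld"  # placeholder path
    graph = DataModelGraph(
        DataModelParser(path_to_data_model=path_to_data_model).parse_model()
    ).generate_data_model_graph()

    json_schema = DataModelJSONSchema(
        jsonld_path=path_to_data_model, graph=graph
    ).get_json_validation_schema(
        source_node="Patient",  # placeholder component name
        schema_name="Example.Patient.validation",  # placeholder schema name
    )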
@@ -813,9 +822,9 @@ def _request_row_format(self, i, req):
             notes_body["requests"] (dict): with information on note
             to add to the column header. This notes body will be added to a request.
         """
-        if self.sg.se:
+        if self.dmge:
             # get node definition
-            note = self.sg.get_node_definition(req)
+            note = self.dmge.get_node_comment(node_display_name=req)
 
             notes_body = {
                 "requests": [
@@ -1014,8 +1023,7 @@ def _dependency_formatting(
         dependency_formatting_body = {"requests": []}
         for j, val_dep in enumerate(val_dependencies):
             is_required = False
-
-            if self.sg.is_node_required(val_dep):
+            if self.dmge.get_node_required(node_display_name=val_dep):
                 is_required = True
             else:
                 is_required = False
@@ -1058,13 +1066,13 @@ def _request_dependency_formatting(
         for req_val in req_vals:
             # get this required/valid value's node label in schema, based on display name (i.e. shown to the user in a dropdown to fill in)
             req_val = req_val["userEnteredValue"]
-            req_val_node_label = self.sg.get_node_label(req_val)
+            req_val_node_label = self.dmge.get_node_label(req_val)
             if not req_val_node_label:
                 # if this node is not in the graph
                 # continue - there are no dependencies for it
                 continue
             # check if this required/valid value has additional dependency attributes
-            val_dependencies = self.sg.get_node_dependencies(
+            val_dependencies = self.dmge.get_node_dependencies(
                 req_val_node_label, schema_ordered=False
             )
 
@@ -1117,7 +1125,7 @@ def _create_requests_body(
         requests_body["requests"] = []
         for i, req in enumerate(ordered_metadata_fields[0]):
             # Gather validation rules and valid values for attribute.
-            validation_rules = self.sg.get_node_validation_rules(req)
+            validation_rules = self.dmge.get_node_validation_rules(node_display_name=req)
 
             # Add regex match validation rule to Google Sheets.
             if validation_rules and sheet_url:
@@ -1364,7 +1372,7 @@ def map_annotation_names_to_display_names(
             pd.DataFrame: Annotations table with updated column headers.
         """
         # Get list of attribute nodes from data model
-        model_nodes = self.sg.se.get_nx_schema().nodes
+        model_nodes = self.graph.nodes
 
         # Subset annotations to those appearing as a label in the model
         labels = filter(lambda x: x in model_nodes, annotations.columns)
@@ -1492,7 +1500,7 @@ def _handle_output_format_logic(self, output_format: str = None, output_path: st
         return dataframe
 
     @staticmethod
-    def create_single_manifest(jsonld: str, data_type: str, access_token: Optional[str] = None, dataset_id: Optional[str] = None, strict: Optional[bool] = True, title: Optional[str] = None, output_format: Literal["google_sheet", "excel", "dataframe"] = "google_sheet", use_annotations: Optional[bool] = False) -> Union[str, pd.DataFrame, BinaryIO]:
+    def create_single_manifest(path_to_data_model: str, graph_data_model: nx.MultiDiGraph, data_type: str, access_token: Optional[str] = None, dataset_id: Optional[str] = None, strict: Optional[bool] = True, title: Optional[str] = None, output_format: Literal["google_sheet", "excel", "dataframe"] = "google_sheet", use_annotations: Optional[bool] = False) -> Union[str, pd.DataFrame, BinaryIO]:
         """Create a single manifest
 
         Args:
@@ -1510,7 +1518,8 @@ def create_single_manifest(jsonld: str, data_type: str, access_token: Optional[st
         """
         # create object of type ManifestGenerator
         manifest_generator = ManifestGenerator(
-            path_to_json_ld=jsonld,
+            path_to_data_model=path_to_data_model,
+            graph=graph_data_model,
             title=title,
             root=data_type,
             use_annotations=use_annotations,
@@ -1536,11 +1545,11 @@ def create_single_manifest(jsonld: str, data_type: str, access_token: Optional[st
         return result
 
     @staticmethod
-    def create_manifests(jsonld: str, data_types: list, access_token: Optional[str] = None, dataset_ids: Optional[list] = None, output_format: Literal["google_sheet", "excel", "dataframe"] = "google_sheet", title: Optional[str] = None, strict: Optional[bool] = True, use_annotations: Optional[bool] = False) -> Union[List[str], List[pd.DataFrame], BinaryIO]:
+    def create_manifests(path_to_data_model: str, data_types: list, access_token: Optional[str] = None, dataset_ids: Optional[list] = None, output_format: Literal["google_sheet", "excel", "dataframe"] = "google_sheet", title: Optional[str] = None, strict: Optional[bool] = True, use_annotations: Optional[bool] = False) -> Union[List[str], List[pd.DataFrame], BinaryIO]:
         """Create multiple manifests
 
         Args:
-            jsonld (str): jsonld schema
+            path_to_data_model (str): path to the data model
             data_type (list): a list of data types
             access_token (str, optional): synapse access token. Required when getting an existing manifest. Defaults to None.
             dataset_id (list, optional): a list of dataset ids when generating an existing manifest. Defaults to None.
@@ -1552,18 +1561,30 @@ def create_manifests(jsonld: str, data_types: list, access_token: Optional[str]=Non
         Returns:
             Union[List[str], List[pd.DataFrame], BinaryIO]: a list of Googlesheet URLs, a list of pandas dataframes or an Excel file.
         """
+        data_model_parser = DataModelParser(path_to_data_model=path_to_data_model)
+
+        # Parse model
+        parsed_data_model = data_model_parser.parse_model()
+
+        # Instantiate DataModelGraph
+        data_model_grapher = DataModelGraph(parsed_data_model)
+
+        # Generate graph
+        graph_data_model = data_model_grapher.generate_data_model_graph()
+
+        # Gather all returned result urls
         all_results = []
         if data_types[0] == 'all manifests':
-            sg = SchemaGenerator(path_to_json_ld=jsonld)
-            component_digraph = sg.se.get_digraph_by_edge_type('requiresComponent')
+            dmge = DataModelGraphExplorer(graph_data_model)
+            component_digraph = dmge.get_digraph_by_edge_type('requiresComponent')
             components = component_digraph.nodes()
             for component in components:
                 if title:
                     t = f'{title}.{component}.manifest'
                 else:
                     t = f'Example.{component}.manifest'
                 if output_format != "excel":
-                    result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=component, output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations)
+                    result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=component, graph_data_model=graph_data_model, output_format=output_format, title=t, access_token=access_token)
                     all_results.append(result)
                 else:
                     logger.error('Currently we do not support returning multiple files as Excel format at once. Please choose a different output format.')
@@ -1578,9 +1599,9 @@ def create_manifests(jsonld: str, data_types: list, access_token: Optional[str]=Non
                 t = title
             if dataset_ids:
                 # if a dataset_id is provided add this to the function call.
-                result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=dt, dataset_id=dataset_ids[i], output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations)
+                result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=dt, graph_data_model=graph_data_model, dataset_id=dataset_ids[i], output_format=output_format, title=t, access_token=access_token, use_annotations=use_annotations)
             else:
-                result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=dt, output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations)
+                result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=dt, graph_data_model=graph_data_model, output_format=output_format, title=t, access_token=access_token, use_annotations=use_annotations)
 
             # if output is pandas dataframe or google sheet url
             if isinstance(result, str) or isinstance(result, pd.DataFrame):
@@ -1589,6 +1610,7 @@ def create_manifests(jsonld: str, data_types: list, access_token: Optional[str]=Non
                 if len(data_types) > 1:
                     logger.warning(f'Currently we do not support returning multiple files as Excel format at once. Only {t} would get returned.')
                 return result
+
         return all_results
 
 
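At the call site, `create_manifests` now takes the model path and builds the graph internally, while `create_single_manifest` expects the caller to supply a pre-built graph. A hedged sketch of both, with placeholder paths and component names, and assuming `ManifestGenerator` is imported from this module (`schematic.manifest.generator`):

    # Sketch: generating manifests through the new static-method signatures.
    from schematic.manifest.generator import ManifestGenerator
    from schematic.schemas.data_model_parser import DataModelParser
    from schematic.schemas.data_model_graph import DataModelGraph

    path_to_data_model = "example.model.jsonld"  # placeholder path

    # create_manifests parses the model and builds the graph itself.
    urls = ManifestGenerator.create_manifests(
        path_to_data_model=path_to_data_model,
        data_types=["Patient", "Biospecimen"],  # placeholder components
        output_format="google_sheet",
    )

    # create_single_manifest requires the caller to pass the graph explicitly.
    graph_data_model = DataModelGraph(
        DataModelParser(path_to_data_model=path_to_data_model).parse_model()
    ).generate_data_model_graph()
    df = ManifestGenerator.create_single_manifest(
        path_to_data_model=path_to_data_model,
        graph_data_model=graph_data_model,
        data_type="Patient",  # placeholder component
        output_format="dataframe",
    )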
@@ -1632,7 +1654,7 @@ def get_manifest(
 
         # Get manifest file associated with given dataset (if applicable)
         # populate manifest with set of new files (if applicable)
-        manifest_record = store.updateDatasetManifestFiles(self.sg, datasetId=dataset_id, store=False)
+        manifest_record = store.updateDatasetManifestFiles(self.dmge, datasetId=dataset_id, store=False)
 
         # get URL of an empty manifest file created based on schema component
         empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=True)
@@ -1869,9 +1891,9 @@ def sort_manifest_fields(self, manifest_fields, order="schema"):
 
         # order manifest fields based on data-model schema
         if order == "schema":
-            if self.sg and self.root:
+            if self.dmge and self.root:
                 # get display names of dependencies
-                dependencies_display_names = self.sg.get_node_dependencies(self.root)
+                dependencies_display_names = self.dmge.get_node_dependencies(self.root)
 
                 # reorder manifest fields so that root dependencies are first and follow schema order
                 manifest_fields = sorted(