Skip to content

Commit 27c6c41

Browse files
authored
Merge pull request #1354 from Sage-Bionetworks/develop
Schematic release v24.1.1
2 parents c23ed32 + 2ade117 commit 27c6c41

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+6448
-6070
lines changed

poetry.lock

Lines changed: 246 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ pandarallel = "^1.6.4"
7373
schematic-db = {version = "0.0.dev33", extras = ["synapse"]}
7474
pyopenssl = "^23.0.0"
7575
typing-extensions = "<4.6.0"
76+
dataclasses-json = "^0.6.1"
7677

7778
[tool.poetry.group.dev.dependencies]
7879
pytest = "^7.0.0"

schematic/help.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,9 +166,6 @@
166166
"short_help": (
167167
"Convert specification from CSV data model to JSON-LD data model."
168168
),
169-
"base_schema": (
170-
"Path to base data model. BioThings data model is loaded by default."
171-
),
172169
"output_jsonld": (
173170
"Path to where the generated JSON-LD file needs to be outputted."
174171
),

schematic/manifest/commands.py

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,14 @@
66
import click
77
import click_log
88

9+
from schematic.schemas.data_model_parser import DataModelParser
10+
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
911
from schematic.manifest.generator import ManifestGenerator
12+
1013
from schematic.utils.cli_utils import log_value_from_config, query_dict, parse_synIDs
11-
from schematic.help import manifest_commands
12-
from schematic.schemas.generator import SchemaGenerator
1314
from schematic.utils.google_api_utils import export_manifest_csv
15+
from schematic.help import manifest_commands
16+
1417
from schematic.store.synapse import SynapseStorage
1518
from schematic.configuration.configuration import CONFIG
1619

@@ -59,7 +62,7 @@ def manifest(ctx, config): # use as `schematic manifest ...`
5962
help=query_dict(manifest_commands, ("manifest", "get", "data_type")),
6063
)
6164
@click.option(
62-
"-p", "--jsonld", help=query_dict(manifest_commands, ("manifest", "get", "jsonld"))
65+
"-p", "--path_to_data_model", help=query_dict(manifest_commands, ("manifest", "get", "path_to_data_model"))
6366
)
6467
@click.option(
6568
"-d",
@@ -104,7 +107,7 @@ def get_manifest(
104107
ctx,
105108
title,
106109
data_type,
107-
jsonld,
110+
path_to_data_model,
108111
dataset_id,
109112
sheet_url,
110113
output_csv,
@@ -121,17 +124,31 @@ def get_manifest(
121124
if data_type is None:
122125
data_type = CONFIG.manifest_data_type
123126
log_value_from_config("data_type", data_type)
124-
if jsonld is None:
125-
jsonld = CONFIG.model_location
126-
log_value_from_config("jsonld", jsonld)
127+
if path_to_data_model is None:
128+
path_to_data_model = CONFIG.model_location
129+
log_value_from_config("path_to_data_model", path_to_data_model)
127130
if title is None:
128131
title = CONFIG.manifest_title
129132
log_value_from_config("title", title)
130133

134+
data_model_parser = DataModelParser(path_to_data_model = path_to_data_model)
135+
136+
#Parse Model
137+
logger.info("Parsing data model.")
138+
parsed_data_model = data_model_parser.parse_model()
139+
140+
# Instantiate DataModelGraph
141+
data_model_grapher = DataModelGraph(parsed_data_model)
142+
143+
# Generate graph
144+
logger.info("Generating data model graph.")
145+
graph_data_model = data_model_grapher.generate_data_model_graph()
146+
131147
def create_single_manifest(data_type, output_csv=None, output_xlsx=None):
132148
# create object of type ManifestGenerator
133149
manifest_generator = ManifestGenerator(
134-
path_to_json_ld=jsonld,
150+
path_to_data_model=path_to_data_model,
151+
graph = graph_data_model,
135152
title=t,
136153
root=data_type,
137154
use_annotations=use_annotations,
@@ -174,7 +191,7 @@ def create_single_manifest(data_type, output_csv=None, output_xlsx=None):
174191
logger.info("Find the manifest template using this Google Sheet URL:")
175192
click.echo(result)
176193
if output_csv is None and output_xlsx is None:
177-
prefix, _ = os.path.splitext(jsonld)
194+
prefix, _ = os.path.splitext(path_to_data_model)
178195
prefix_root, prefix_ext = os.path.splitext(prefix)
179196
if prefix_ext == ".model":
180197
prefix = prefix_root
@@ -194,9 +211,10 @@ def create_single_manifest(data_type, output_csv=None, output_xlsx=None):
194211
if type(data_type) is str:
195212
data_type = [data_type]
196213

197-
if data_type[0] == 'all manifests':
198-
sg = SchemaGenerator(path_to_json_ld=jsonld)
199-
component_digraph = sg.se.get_digraph_by_edge_type('requiresComponent')
214+
if data_type[0] == 'all manifests':
215+
# Feed graph into the data model graph explorer
216+
dmge = DataModelGraphExplorer(graph_data_model)
217+
component_digraph = dmge.get_digraph_by_edge_type('requiresComponent')
200218
components = component_digraph.nodes()
201219
for component in components:
202220
t = f'{title}.{component}.manifest'

schematic/manifest/generator.py

Lines changed: 54 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from collections import OrderedDict
22
import json
33
import logging
4+
import networkx as nx
45
from openpyxl.styles import Font, Alignment, PatternFill
56
from openpyxl import load_workbook
67
from openpyxl.utils.dataframe import dataframe_to_rows
@@ -12,7 +13,10 @@
1213
from typing import Dict, List, Optional, Tuple, Union, BinaryIO, Literal
1314
from flask import send_from_directory
1415

15-
from schematic.schemas.generator import SchemaGenerator
16+
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
17+
from schematic.schemas.data_model_parser import DataModelParser
18+
from schematic.schemas.data_model_json_schema import DataModelJSONSchema
19+
1620
from schematic.utils.google_api_utils import (
1721
execute_google_api_requests,
1822
build_service_account_creds,
@@ -35,7 +39,8 @@
3539
class ManifestGenerator(object):
3640
def __init__(
3741
self,
38-
path_to_json_ld: str, # JSON-LD file to be used for generating the manifest
42+
path_to_data_model: str, # JSON-LD file to be used for generating the manifest
43+
graph: nx.MultiDiGraph, # At this point, the graph is fully formed.
3944
alphabetize_valid_values: str = 'ascending',
4045
title: str = None, # manifest sheet title
4146
root: str = None,
@@ -54,6 +59,12 @@ def __init__(
5459
# google service credentials object
5560
self.creds = services_creds["creds"]
5661

62+
# Path to jsonld
63+
self.model_path = path_to_data_model
64+
65+
# Graph
66+
self.graph = graph
67+
5768
# schema root
5869
if root:
5970
self.root = root
@@ -79,14 +90,14 @@ def __init__(
7990
"when there is no manifest file for the dataset in question."
8091
)
8192

82-
# SchemaGenerator() object
83-
self.sg = SchemaGenerator(path_to_json_ld)
93+
# Instantiate Data Model Explorer object
94+
self.dmge = DataModelGraphExplorer(self.graph)
8495

8596
# additional metadata to add to manifest
8697
self.additional_metadata = additional_metadata
8798

8899
# Check if the class is in the schema
89-
root_in_schema = self.sg.se.is_class_in_schema(self.root)
100+
root_in_schema = self.dmge.is_class_in_schema(self.root)
90101

91102
# If the class could not be found, give a notification
92103
if not root_in_schema:
@@ -95,8 +106,7 @@ def __init__(
95106
raise LookupError(exception_message)
96107

97108
# Determine whether current data type is file-based
98-
self.is_file_based = "Filename" in self.sg.get_node_dependencies(self.root)
99-
109+
self.is_file_based = "Filename" in self.dmge.get_node_dependencies(self.root)
100110

101111
def _attribute_to_letter(self, attribute, manifest_fields):
102112
"""Map attribute to column letter in a google sheet"""
@@ -364,13 +374,12 @@ def _get_json_schema(self, json_schema_filepath: str) -> Dict:
364374
json_schema_filepath(str): path to json schema file
365375
Returns:
366376
Dictionary, containing portions of the json schema
377+
TODO: Do we even allow people to provide a json_schema_filepath anymore?
367378
"""
368379
if not json_schema_filepath:
369-
# if no json schema is provided; there must be
370-
# schema explorer defined for schema.org schema
371-
# o.w. this will throw an error
372-
# TODO: catch error
373-
json_schema = self.sg.get_json_schema_requirements(self.root, self.title)
380+
# TODO Catch error if no JSONLD or JSON path provided.
381+
data_model_js = DataModelJSONSchema(jsonld_path=self.model_path, graph=self.graph)
382+
json_schema = data_model_js.get_json_validation_schema(source_node=self.root, schema_name=self.title)
374383
else:
375384
with open(json_schema_filepath) as jsonfile:
376385
json_schema = json.load(jsonfile)
@@ -813,9 +822,9 @@ def _request_row_format(self, i, req):
813822
notes_body["requests"] (dict): with information on note
814823
to add to the column header. This notes body will be added to a request.
815824
"""
816-
if self.sg.se:
825+
if self.dmge:
817826
# get node definition
818-
note = self.sg.get_node_definition(req)
827+
note = self.dmge.get_node_comment(node_display_name = req)
819828

820829
notes_body = {
821830
"requests": [
@@ -1014,8 +1023,7 @@ def _dependency_formatting(
10141023
dependency_formatting_body = {"requests": []}
10151024
for j, val_dep in enumerate(val_dependencies):
10161025
is_required = False
1017-
1018-
if self.sg.is_node_required(val_dep):
1026+
if self.dmge.get_node_required(node_display_name=val_dep):
10191027
is_required = True
10201028
else:
10211029
is_required = False
@@ -1058,13 +1066,13 @@ def _request_dependency_formatting(
10581066
for req_val in req_vals:
10591067
# get this required/valid value's node label in schema, based on display name (i.e. shown to the user in a dropdown to fill in)
10601068
req_val = req_val["userEnteredValue"]
1061-
req_val_node_label = self.sg.get_node_label(req_val)
1069+
req_val_node_label = self.dmge.get_node_label(req_val)
10621070
if not req_val_node_label:
10631071
# if this node is not in the graph
10641072
# continue - there are no dependencies for it
10651073
continue
10661074
# check if this required/valid value has additional dependency attributes
1067-
val_dependencies = self.sg.get_node_dependencies(
1075+
val_dependencies = self.dmge.get_node_dependencies(
10681076
req_val_node_label, schema_ordered=False
10691077
)
10701078

@@ -1117,7 +1125,7 @@ def _create_requests_body(
11171125
requests_body["requests"] = []
11181126
for i, req in enumerate(ordered_metadata_fields[0]):
11191127
# Gather validation rules and valid values for attribute.
1120-
validation_rules = self.sg.get_node_validation_rules(req)
1128+
validation_rules = self.dmge.get_node_validation_rules(node_display_name=req)
11211129

11221130
# Add regex match validation rule to Google Sheets.
11231131
if validation_rules and sheet_url:
@@ -1364,7 +1372,7 @@ def map_annotation_names_to_display_names(
13641372
pd.DataFrame: Annotations table with updated column headers.
13651373
"""
13661374
# Get list of attribute nodes from data model
1367-
model_nodes = self.sg.se.get_nx_schema().nodes
1375+
model_nodes = self.graph.nodes
13681376

13691377
# Subset annotations to those appearing as a label in the model
13701378
labels = filter(lambda x: x in model_nodes, annotations.columns)
@@ -1492,7 +1500,7 @@ def _handle_output_format_logic(self, output_format: str = None, output_path: st
14921500
return dataframe
14931501

14941502
@staticmethod
1495-
def create_single_manifest(jsonld: str, data_type: str, access_token:Optional[str]=None, dataset_id:Optional[str]=None, strict:Optional[bool]=True, title:Optional[str]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", use_annotations:Optional[bool]=False) -> Union[str, pd.DataFrame, BinaryIO]:
1503+
def create_single_manifest(path_to_data_model: str, graph_data_model: nx.MultiDiGraph, data_type: str, access_token:Optional[str]=None, dataset_id:Optional[str]=None, strict:Optional[bool]=True, title:Optional[str]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", use_annotations:Optional[bool]=False) -> Union[str, pd.DataFrame, BinaryIO]:
14961504
"""Create a single manifest
14971505
14981506
Args:
@@ -1510,7 +1518,8 @@ def create_single_manifest(jsonld: str, data_type: str, access_token:Optional[st
15101518
"""
15111519
# create object of type ManifestGenerator
15121520
manifest_generator = ManifestGenerator(
1513-
path_to_json_ld=jsonld,
1521+
path_to_data_model=path_to_data_model,
1522+
graph=graph_data_model,
15141523
title=title,
15151524
root=data_type,
15161525
use_annotations=use_annotations,
@@ -1536,11 +1545,11 @@ def create_single_manifest(jsonld: str, data_type: str, access_token:Optional[st
15361545
return result
15371546

15381547
@staticmethod
1539-
def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=None, dataset_ids:Optional[list]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", title:Optional[str]=None, strict:Optional[bool]=True, use_annotations:Optional[bool]=False) -> Union[List[str], List[pd.DataFrame], BinaryIO]:
1548+
def create_manifests(path_to_data_model:str, data_types:list, access_token:Optional[str]=None, dataset_ids:Optional[list]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", title:Optional[str]=None, strict:Optional[bool]=True, use_annotations:Optional[bool]=False) -> Union[List[str], List[pd.DataFrame], BinaryIO]:
15401549
"""Create multiple manifests
15411550
15421551
Args:
1543-
jsonld (str): jsonld schema
1552+
path_to_data_model (str): str path to data model
15441553
data_types (list): a list of data types
15451554
access_token (str, optional): synapse access token. Required when getting an existing manifest. Defaults to None.
15461555
dataset_ids (list, optional): a list of dataset ids when generating an existing manifest. Defaults to None.
@@ -1552,18 +1561,30 @@ def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=Non
15521561
Returns:
15531562
Union[List[str], List[pd.DataFrame], BinaryIO]: a list of Googlesheet URLs, a list of pandas dataframes or an Excel file.
15541563
"""
1564+
data_model_parser = DataModelParser(path_to_data_model = path_to_data_model)
1565+
1566+
#Parse Model
1567+
parsed_data_model = data_model_parser.parse_model()
1568+
1569+
# Instantiate DataModelGraph
1570+
data_model_grapher = DataModelGraph(parsed_data_model)
1571+
1572+
# Generate graph
1573+
graph_data_model = data_model_grapher.generate_data_model_graph()
1574+
1575+
# Gather all returned result urls
15551576
all_results = []
15561577
if data_types[0] == 'all manifests':
1557-
sg = SchemaGenerator(path_to_json_ld=jsonld)
1558-
component_digraph = sg.se.get_digraph_by_edge_type('requiresComponent')
1578+
dmge = DataModelGraphExplorer(graph_data_model)
1579+
component_digraph = dmge.get_digraph_by_edge_type('requiresComponent')
15591580
components = component_digraph.nodes()
15601581
for component in components:
15611582
if title:
15621583
t = f'{title}.{component}.manifest'
15631584
else:
15641585
t = f'Example.{component}.manifest'
15651586
if output_format != "excel":
1566-
result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=component, output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations)
1587+
result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=component, graph_data_model=graph_data_model, output_format=output_format, title=t, access_token=access_token)
15671588
all_results.append(result)
15681589
else:
15691590
logger.error('Currently we do not support returning multiple files as Excel format at once. Please choose a different output format. ')
@@ -1578,9 +1599,9 @@ def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=Non
15781599
t = title
15791600
if dataset_ids:
15801601
# if a dataset_id is provided add this to the function call.
1581-
result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=dt, dataset_id=dataset_ids[i], output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations)
1602+
result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=dt, graph_data_model=graph_data_model, dataset_id=dataset_ids[i], output_format=output_format, title=t, access_token=access_token, use_annotations=use_annotations)
15821603
else:
1583-
result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=dt, output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations)
1604+
result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=dt, graph_data_model=graph_data_model, output_format=output_format, title=t, access_token=access_token, use_annotations=use_annotations)
15841605

15851606
# if output is pandas dataframe or google sheet url
15861607
if isinstance(result, str) or isinstance(result, pd.DataFrame):
@@ -1589,6 +1610,7 @@ def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=Non
15891610
if len(data_types) > 1:
15901611
logger.warning(f'Currently we do not support returning multiple files as Excel format at once. Only {t} would get returned. ')
15911612
return result
1613+
15921614
return all_results
15931615

15941616

@@ -1632,7 +1654,7 @@ def get_manifest(
16321654

16331655
# Get manifest file associated with given dataset (if applicable)
16341656
# populate manifest with set of new files (if applicable)
1635-
manifest_record = store.updateDatasetManifestFiles(self.sg, datasetId = dataset_id, store = False)
1657+
manifest_record = store.updateDatasetManifestFiles(self.dmge, datasetId = dataset_id, store = False)
16361658

16371659
# get URL of an empty manifest file created based on schema component
16381660
empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=True)
@@ -1869,9 +1891,9 @@ def sort_manifest_fields(self, manifest_fields, order="schema"):
18691891

18701892
# order manifest fields based on data-model schema
18711893
if order == "schema":
1872-
if self.sg and self.root:
1894+
if self.dmge and self.root:
18731895
# get display names of dependencies
1874-
dependencies_display_names = self.sg.get_node_dependencies(self.root)
1896+
dependencies_display_names = self.dmge.get_node_dependencies(self.root)
18751897

18761898
# reorder manifest fields so that root dependencies are first and follow schema order
18771899
manifest_fields = sorted(

0 commit comments

Comments
 (0)