Skip to content

Commit d05ff1d

Browse files
Lingling Peng
authored and committed
merge with develop
2 parents f2ef8a7 + 4c72283 commit d05ff1d

File tree

2 files changed

+533
-18
lines changed

2 files changed

+533
-18
lines changed

synapseclient/extensions/curator/schema_generation.py

Lines changed: 72 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4021,9 +4021,11 @@ def export_schema(schema: dict, file_path: str, logger: Logger) -> None:
40214021
filepath, str: path to store the schema
40224022
"""
40234023
file_path = os.path.expanduser(file_path)
4024-
json_schema_dirname = os.path.dirname(file_path)
4025-
if json_schema_dirname != "":
4026-
os.makedirs(json_schema_dirname, exist_ok=True)
4024+
# Don't create directories if the path looks like a URL
4025+
if not (file_path.startswith("http://") or file_path.startswith("https://")):
4026+
json_schema_dirname = os.path.dirname(file_path)
4027+
if json_schema_dirname != "":
4028+
os.makedirs(json_schema_dirname, exist_ok=True)
40274029
with open(file_path, "w", encoding="utf-8") as json_file:
40284030
json.dump(schema, json_file, sort_keys=True, indent=4, ensure_ascii=False)
40294031

@@ -5558,6 +5560,14 @@ def get_json_schema_log_file_path(data_model_path: str, source_node: str) -> str
55585560
Returns:
55595561
json_schema_log_file_path: str, file name for the log file
55605562
"""
5563+
# If it's a URL, extract just the filename
5564+
if data_model_path.startswith("http://") or data_model_path.startswith("https://"):
5565+
from urllib.parse import urlparse
5566+
5567+
parsed_url = urlparse(data_model_path)
5568+
# Get the last part of the path (filename)
5569+
data_model_path = os.path.basename(parsed_url.path)
5570+
55615571
data_model_path_root, _ = os.path.splitext(data_model_path)
55625572
prefix = data_model_path_root
55635573
prefix_root, prefix_ext = os.path.splitext(prefix)
@@ -5682,7 +5692,11 @@ def _write_data_model(
56825692
data_model_path=jsonld_path, source_node=name
56835693
)
56845694
json_schema_dirname = os.path.dirname(json_schema_path)
5685-
if json_schema_dirname != "":
5695+
# Don't create directories if the path looks like a URL
5696+
if json_schema_dirname != "" and not (
5697+
json_schema_path.startswith("http://")
5698+
or json_schema_path.startswith("https://")
5699+
):
56865700
os.makedirs(json_schema_dirname, exist_ok=True)
56875701

56885702
logger.info(
@@ -5761,7 +5775,20 @@ def _build_output_path(self, output_directory: Path) -> Path:
57615775
"""
57625776

57635777
stripped_component = self.component.replace(" ", "")
5764-
data_model_basename = Path(self.data_model_source).stem
5778+
5779+
# Handle URL by extracting just the filename
5780+
if self.data_model_source.startswith(
5781+
"http://"
5782+
) or self.data_model_source.startswith("https://"):
5783+
from urllib.parse import urlparse
5784+
5785+
parsed_url = urlparse(self.data_model_source)
5786+
# Get the last part of the path (filename)
5787+
filename = os.path.basename(parsed_url.path)
5788+
data_model_basename = Path(filename).stem
5789+
else:
5790+
data_model_basename = Path(self.data_model_source).stem
5791+
57655792
return Path(
57665793
output_directory,
57675794
data_model_basename,
@@ -5815,6 +5842,7 @@ def get_component_json_schema(
58155842
schema_name=self.component + "_validation",
58165843
jsonld_path=metadata_model.inputMModelLocation,
58175844
use_property_display_names=use_display_names,
5845+
write_schema=False, # Don't write intermediate files; write_json_schema_to_file() will handle final output
58185846
)
58195847
self.component_json_schema = json_schema
58205848

@@ -5876,12 +5904,12 @@ def generate_jsonschema(
58765904
in your validation rules. This allows different validation behavior per manifest type.
58775905
58785906
Arguments:
5879-
data_model_source: Path to the data model file (CSV or JSONLD) or URL to the raw
5880-
JSONLD. Can accept:
5907+
data_model_source: Path or URL to the data model file (CSV or JSONLD). Can accept:
58815908
5882-
- A CSV file with your data model specification (will be parsed automatically)
5883-
- A JSONLD file generated from `generate_jsonld()` or equivalent
5884-
- A URL pointing to a raw JSONLD data model
5909+
- A local CSV file with your data model specification (will be parsed automatically)
5910+
- A local JSONLD file generated from `generate_jsonld()` or equivalent
5911+
- A URL pointing to a raw CSV data model (e.g., from GitHub)
5912+
- A URL pointing to a raw JSONLD data model (e.g., from GitHub)
58855913
output_directory: Directory path where JSON Schema files will be saved. Each
58865914
component will generate a separate `<Component>_validation_schema.json` file.
58875915
data_type: List of specific component names (data types) to generate schemas for.
@@ -5931,7 +5959,7 @@ def generate_jsonschema(
59315959
)
59325960
```
59335961
5934-
Generate schema for specific components:
5962+
Generate schema for specific components from URL:
59355963
59365964
```python
59375965
schemas, file_paths = generate_jsonschema(
@@ -5942,6 +5970,18 @@ def generate_jsonschema(
59425970
synapse_client=syn
59435971
)
59445972
```
5973+
5974+
Generate schema from CSV URL:
5975+
5976+
```python
5977+
schemas, file_paths = generate_jsonschema(
5978+
data_model_source="https://raw.githubusercontent.com/org/repo/main/model.csv",
5979+
output_directory="./schemas",
5980+
data_type=None,
5981+
data_model_labels="class_label",
5982+
synapse_client=syn
5983+
)
5984+
```
59455985
"""
59465986

59475987
synapse_client = Synapse.get_client(synapse_client=synapse_client)
@@ -5996,8 +6036,9 @@ def generate_jsonld(
59966036
- Verifies the graph structure is a valid directed acyclic graph (DAG)
59976037
59986038
Arguments:
5999-
schema: Path to your data model CSV file. This file should contain your complete
6000-
data model specification with all attributes, validation rules, and relationships.
6039+
schema: Path or URL to your data model CSV file. Can be a local file path or a URL
6040+
(e.g., from GitHub). This file should contain your complete data model
6041+
specification with all attributes, validation rules, and relationships.
60016042
data_model_labels: Label format for the JSON-LD output:
60026043
60036044
- `"class_label"` (default, recommended): Uses standard attribute names as labels
@@ -6058,6 +6099,16 @@ class labels. Use cautiously as this can affect downstream compatibility.
60586099
synapse_client=syn
60596100
)
60606101
```
6102+
6103+
Load from URL:
6104+
```python
6105+
jsonld_model = generate_jsonld(
6106+
schema="https://raw.githubusercontent.com/org/repo/main/model.csv",
6107+
data_model_labels="class_label",
6108+
output_jsonld="downloaded_model.jsonld",
6109+
synapse_client=syn
6110+
)
6111+
```
60616112
"""
60626113
syn = Synapse.get_client(synapse_client=synapse_client)
60636114

@@ -6109,7 +6160,14 @@ class labels. Use cautiously as this can affect downstream compatibility.
61096160
# output JSON-LD file alongside CSV file by default, get path.
61106161
if output_jsonld is None:
61116162
if ".jsonld" not in schema:
6112-
csv_no_ext = re.sub("[.]csv$", "", schema)
6163+
# If schema is a URL, extract just the filename for local output
6164+
schema_path = schema
6165+
if schema.startswith("http://") or schema.startswith("https://"):
6166+
from urllib.parse import urlparse
6167+
6168+
parsed_url = urlparse(schema)
6169+
schema_path = os.path.basename(parsed_url.path)
6170+
csv_no_ext = re.sub("[.]csv$", "", schema_path)
61136171
output_jsonld = csv_no_ext + ".jsonld"
61146172
else:
61156173
output_jsonld = schema

0 commit comments

Comments
 (0)