Merge branch 'develop' into synpy-1717-allow-external-url

BryanFauble · BryanFauble · commit 74d1158245f7 · 2025-12-23T17:21:52.000Z
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -51,7 +51,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-22.04, macos-13, windows-2022]
+        os: [ubuntu-22.04, macos-15-intel, windows-2022]
 
         # if changing the below change the run-integration-tests versions and the check-deploy versions
         # Make sure that we are running the integration tests on the first and last versions of the matrix
@@ -486,7 +486,7 @@ jobs:
 
     strategy:
       matrix:
-        os: [ubuntu-24.04, macos-13, windows-2022]
+        os: [ubuntu-24.04, macos-15-intel, windows-2022]
 
         # python versions should be consistent with the strategy matrix and the runs-integration-tests versions
         python: ["3.10", "3.11", "3.12", "3.13", "3.14"]
diff --git a/synapseclient/extensions/curator/schema_generation.py b/synapseclient/extensions/curator/schema_generation.py
@@ -4021,9 +4021,11 @@ def export_schema(schema: dict, file_path: str, logger: Logger) -> None:
         filepath, str: path to store the schema
     """
     file_path = os.path.expanduser(file_path)
-    json_schema_dirname = os.path.dirname(file_path)
-    if json_schema_dirname != "":
-        os.makedirs(json_schema_dirname, exist_ok=True)
+    # Don't create directories if the path looks like a URL
+    if not (file_path.startswith("http://") or file_path.startswith("https://")):
+        json_schema_dirname = os.path.dirname(file_path)
+        if json_schema_dirname != "":
+            os.makedirs(json_schema_dirname, exist_ok=True)
     with open(file_path, "w", encoding="utf-8") as json_file:
         json.dump(schema, json_file, sort_keys=True, indent=4, ensure_ascii=False)
 
@@ -5558,6 +5560,14 @@ def get_json_schema_log_file_path(data_model_path: str, source_node: str) -> str
     Returns:
         json_schema_log_file_path: str, file name for the log file
     """
+    # If it's a URL, extract just the filename
+    if data_model_path.startswith("http://") or data_model_path.startswith("https://"):
+        from urllib.parse import urlparse
+
+        parsed_url = urlparse(data_model_path)
+        # Get the last part of the path (filename)
+        data_model_path = os.path.basename(parsed_url.path)
+
     data_model_path_root, _ = os.path.splitext(data_model_path)
     prefix = data_model_path_root
     prefix_root, prefix_ext = os.path.splitext(prefix)
@@ -5682,7 +5692,11 @@ def _write_data_model(
             data_model_path=jsonld_path, source_node=name
         )
         json_schema_dirname = os.path.dirname(json_schema_path)
-        if json_schema_dirname != "":
+        # Don't create directories if the path looks like a URL
+        if json_schema_dirname != "" and not (
+            json_schema_path.startswith("http://")
+            or json_schema_path.startswith("https://")
+        ):
             os.makedirs(json_schema_dirname, exist_ok=True)
 
         logger.info(
@@ -5761,7 +5775,20 @@ def _build_output_path(self, output_directory: Path) -> Path:
         """
 
         stripped_component = self.component.replace(" ", "")
-        data_model_basename = Path(self.data_model_source).stem
+
+        # Handle URL by extracting just the filename
+        if self.data_model_source.startswith(
+            "http://"
+        ) or self.data_model_source.startswith("https://"):
+            from urllib.parse import urlparse
+
+            parsed_url = urlparse(self.data_model_source)
+            # Get the last part of the path (filename)
+            filename = os.path.basename(parsed_url.path)
+            data_model_basename = Path(filename).stem
+        else:
+            data_model_basename = Path(self.data_model_source).stem
+
         return Path(
             output_directory,
             data_model_basename,
@@ -5815,6 +5842,7 @@ def get_component_json_schema(
             schema_name=self.component + "_validation",
             jsonld_path=metadata_model.inputMModelLocation,
             use_property_display_names=use_display_names,
+            write_schema=False,  # Don't write intermediate files; write_json_schema_to_file() will handle final output
         )
         self.component_json_schema = json_schema
 
@@ -5876,12 +5904,12 @@ def generate_jsonschema(
     in your validation rules. This allows different validation behavior per manifest type.
 
     Arguments:
-        data_model_source: Path to the data model file (CSV or JSONLD) or URL to the raw
-            JSONLD. Can accept:
+        data_model_source: Path or URL to the data model file (CSV or JSONLD). Can accept:
 
-            - A CSV file with your data model specification (will be parsed automatically)
-            - A JSONLD file generated from `generate_jsonld()` or equivalent
-            - A URL pointing to a raw JSONLD data model
+            - A local CSV file with your data model specification (will be parsed automatically)
+            - A local JSONLD file generated from `generate_jsonld()` or equivalent
+            - A URL pointing to a raw CSV data model (e.g., from GitHub)
+            - A URL pointing to a raw JSONLD data model (e.g., from GitHub)
         output_directory: Directory path where JSON Schema files will be saved. Each
             component will generate a separate `<Component>_validation_schema.json` file.
         data_type: List of specific component names (data types) to generate schemas for.
@@ -5931,7 +5959,7 @@ def generate_jsonschema(
         )
         ```
 
-        Generate schema for specific components:
+        Generate schema for specific components from URL:
 
         ```python
         schemas, file_paths = generate_jsonschema(
@@ -5942,6 +5970,18 @@ def generate_jsonschema(
             synapse_client=syn
         )
         ```
+
+        Generate schema from CSV URL:
+
+        ```python
+        schemas, file_paths = generate_jsonschema(
+            data_model_source="https://raw.githubusercontent.com/org/repo/main/model.csv",
+            output_directory="./schemas",
+            data_type=None,
+            data_model_labels="class_label",
+            synapse_client=syn
+        )
+        ```
     """
 
     synapse_client = Synapse.get_client(synapse_client=synapse_client)
@@ -5996,8 +6036,9 @@ def generate_jsonld(
     - Verifies the graph structure is a valid directed acyclic graph (DAG)
 
     Arguments:
-        schema: Path to your data model CSV file. This file should contain your complete
-            data model specification with all attributes, validation rules, and relationships.
+        schema: Path or URL to your data model CSV file. Can be a local file path or a URL
+            (e.g., from GitHub). This file should contain your complete data model
+            specification with all attributes, validation rules, and relationships.
         data_model_labels: Label format for the JSON-LD output:
 
             - `"class_label"` (default, recommended): Uses standard attribute names as labels
@@ -6058,6 +6099,16 @@ class labels. Use cautiously as this can affect downstream compatibility.
             synapse_client=syn
         )
         ```
+
+        Load from URL:
+        ```python
+        jsonld_model = generate_jsonld(
+            schema="https://raw.githubusercontent.com/org/repo/main/model.csv",
+            data_model_labels="class_label",
+            output_jsonld="downloaded_model.jsonld",
+            synapse_client=syn
+        )
+        ```
     """
     syn = Synapse.get_client(synapse_client=synapse_client)
 
@@ -6109,7 +6160,14 @@ class labels. Use cautiously as this can affect downstream compatibility.
     # output JSON-LD file alongside CSV file by default, get path.
     if output_jsonld is None:
         if ".jsonld" not in schema:
-            csv_no_ext = re.sub("[.]csv$", "", schema)
+            # If schema is a URL, extract just the filename for local output
+            schema_path = schema
+            if schema.startswith("http://") or schema.startswith("https://"):
+                from urllib.parse import urlparse
+
+                parsed_url = urlparse(schema)
+                schema_path = os.path.basename(parsed_url.path)
+            csv_no_ext = re.sub("[.]csv$", "", schema_path)
             output_jsonld = csv_no_ext + ".jsonld"
         else:
             output_jsonld = schema
diff --git a/tests/unit/synapseclient/extensions/unit_test_curator.py b/tests/unit/synapseclient/extensions/unit_test_curator.py