Skip to content

Commit c312a02

Browse files
author
Lingling Peng
committed
Merge branch 'develop' into update-black
2 parents 27a2995 + fa8f6fa commit c312a02

File tree

2 files changed

+203
-75
lines changed

2 files changed

+203
-75
lines changed

synapseclient/extensions/curator/file_based_metadata_task.py

Lines changed: 45 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from synapseclient.operations import FileOptions, get
2525

2626
TYPE_DICT = {
27-
"string": ColumnType.STRING,
27+
"string": ColumnType.MEDIUMTEXT,
2828
"number": ColumnType.DOUBLE,
2929
"integer": ColumnType.INTEGER,
3030
"boolean": ColumnType.BOOLEAN,
@@ -199,48 +199,67 @@ def _create_columns_from_json_schema(json_schema: dict[str, Any]) -> list[Column
199199
raise ValueError(
200200
"The 'properties' field in the JSON Schema must be a dictionary."
201201
)
202-
columns = []
203-
for name, prop_schema in properties.items():
204-
column_type = _get_column_type_from_js_property(prop_schema)
205-
maximum_size = None
206-
if column_type == "STRING":
207-
maximum_size = 100
208-
if column_type in LIST_TYPE_DICT.values():
209-
maximum_size = 5
210-
211-
column = Column(
212-
name=name,
213-
column_type=column_type,
214-
maximum_size=maximum_size,
215-
default_value=None,
216-
)
217-
columns.append(column)
202+
columns = [
203+
_create_synapse_column_from_js_property(prop_schema, name)
204+
for name, prop_schema in properties.items()
205+
]
218206
return columns
219207

220208

209+
def _create_synapse_column_from_js_property(
210+
js_property: dict[str, Any], name: str
211+
) -> Column:
212+
"""
213+
Creates a Synapse Column based on a JSON Schema property.
214+
215+
Args:
216+
js_property: A JSON Schema property in dict form.
217+
name: The name of the column.
218+
219+
Returns:
220+
A Synapse Column based on the JSON Schema property.
221+
"""
222+
column_type = _get_column_type_from_js_property(js_property)
223+
return Column(name=name, column_type=column_type)
224+
225+
221226
def _get_column_type_from_js_property(js_property: dict[str, Any]) -> ColumnType:
222227
"""
223228
Gets the Synapse column type from a JSON Schema property.
224229
The JSON Schema should be valid but that should not be assumed.
225-
If the type can not be determined ColumnType.STRING will be returned.
230+
If the type can not be determined ColumnType.MEDIUMTEXT will be returned.
226231
227232
Args:
228233
js_property: A JSON Schema property in dict form.
229234
230235
Returns:
231236
A Synapse ColumnType based on the JSON Schema type
232237
"""
233-
# Enums are always strings in Synapse tables
238+
# Enums are set as MediumText columns
234239
if "enum" in js_property:
235-
return ColumnType.STRING
240+
return ColumnType.MEDIUMTEXT
236241
if "type" in js_property:
237-
if js_property["type"] == "array":
242+
js_type = js_property["type"]
243+
# Synapse columns cannot be more than one type
244+
# If the JSONSchema type is a list of types, check if it's a nullable single type
245+
if isinstance(js_type, list):
246+
types = [t for t in js_type if t != "null"]
247+
if len(types) == 1:
248+
js_type = types[0]
249+
# If there are multiple non-null types, we cannot determine a single column type, so default to MediumText
250+
else:
251+
return ColumnType.MEDIUMTEXT
252+
if js_type == "array":
238253
return _get_list_column_type_from_js_property(js_property)
239-
return TYPE_DICT.get(js_property["type"], ColumnType.STRING)
254+
# If there is only one JSONSChema type, return the corresponding Synapse column type,
255+
# defaulting to MediumText if there is no match
256+
return TYPE_DICT.get(js_type, ColumnType.MEDIUMTEXT)
240257
# A oneOf list usually indicates that the type could be one or more different things
258+
# Curator extension does not create the types of JSON Schemas where this is the case
259+
# but if it is present we will attempt to determine the type based on the items in the oneOf list.
241260
if "oneOf" in js_property and isinstance(js_property["oneOf"], list):
242261
return _get_column_type_from_js_one_of_list(js_property["oneOf"])
243-
return ColumnType.STRING
262+
return ColumnType.MEDIUMTEXT
244263

245264

246265
def _get_column_type_from_js_one_of_list(js_one_of_list: list[Any]) -> ColumnType:
@@ -258,15 +277,15 @@ def _get_column_type_from_js_one_of_list(js_one_of_list: list[Any]) -> ColumnTyp
258277
items = [item for item in js_one_of_list if isinstance(item, dict)]
259278
# Enums are always strings in Synapse tables
260279
if [item for item in items if "enum" in item]:
261-
return ColumnType.STRING
280+
return ColumnType.MEDIUMTEXT
262281
# For Synapse ColumnType we can ignore null types in JSON Schemas
263282
type_items = [item for item in items if "type" in item if item["type"] != "null"]
264283
if len(type_items) == 1:
265284
type_item = type_items[0]
266285
if type_item["type"] == "array":
267286
return _get_list_column_type_from_js_property(type_item)
268-
return TYPE_DICT.get(type_item["type"], ColumnType.STRING)
269-
return ColumnType.STRING
287+
return TYPE_DICT.get(type_item["type"], ColumnType.MEDIUMTEXT)
288+
return ColumnType.MEDIUMTEXT
270289

271290

272291
def _get_list_column_type_from_js_property(js_property: dict[str, Any]) -> ColumnType:

tests/unit/synapseclient/extensions/unit_test_curator.py

Lines changed: 158 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import shutil
1313
import tempfile
1414
import unittest
15+
from typing import Any
1516
from unittest.mock import Mock, mock_open, patch
1617

1718
import pandas as pd
@@ -26,6 +27,7 @@
2627
)
2728
from synapseclient.extensions.curator.file_based_metadata_task import (
2829
_create_columns_from_json_schema,
30+
_create_synapse_column_from_js_property,
2931
_get_column_type_from_js_one_of_list,
3032
_get_column_type_from_js_property,
3133
_get_list_column_type_from_js_property,
@@ -50,7 +52,7 @@
5052
SchemaRegistryColumnConfig,
5153
get_latest_schema_uri,
5254
)
53-
from synapseclient.models import ColumnType
55+
from synapseclient.models import Column, ColumnType
5456
from synapseclient.models.curation import (
5557
FileBasedMetadataTaskProperties,
5658
RecordBasedMetadataTaskProperties,
@@ -1670,50 +1672,6 @@ def test_create_columns_from_json_schema_success(self):
16701672
assert all(hasattr(col, "name") for col in columns)
16711673
assert all(hasattr(col, "column_type") for col in columns)
16721674

1673-
def test_get_column_type_from_js_property_enum(self):
1674-
"""Test getting column type for enum property."""
1675-
# GIVEN a JSON schema property with an enum
1676-
js_property = {"enum": ["option1", "option2", "option3"]}
1677-
1678-
# WHEN I get the column type
1679-
result = _get_column_type_from_js_property(js_property)
1680-
1681-
# THEN it should return STRING type
1682-
assert result == ColumnType.STRING
1683-
1684-
def test_get_column_type_from_js_property_array(self):
1685-
"""Test getting column type for array property."""
1686-
# GIVEN a JSON schema property with array type
1687-
js_property = {"type": "array", "items": {"type": "string"}}
1688-
1689-
# WHEN I get the column type
1690-
result = _get_column_type_from_js_property(js_property)
1691-
1692-
# THEN it should return a list type
1693-
assert result == ColumnType.STRING_LIST
1694-
1695-
def test_get_column_type_from_js_property_one_of(self):
1696-
"""Test getting column type for oneOf property."""
1697-
# GIVEN a JSON schema property with oneOf
1698-
js_property = {"oneOf": [{"type": "string"}, {"type": "null"}]}
1699-
1700-
# WHEN I get the column type
1701-
result = _get_column_type_from_js_property(js_property)
1702-
1703-
# THEN it should return STRING type
1704-
assert result == ColumnType.STRING
1705-
1706-
def test_get_column_type_from_js_property_fallback(self):
1707-
"""Test getting column type fallback to STRING."""
1708-
# GIVEN a JSON schema property without recognizable type
1709-
js_property = {"description": "some property"}
1710-
1711-
# WHEN I get the column type
1712-
result = _get_column_type_from_js_property(js_property)
1713-
1714-
# THEN it should return STRING type as fallback
1715-
assert result == ColumnType.STRING
1716-
17171675
def test_get_column_type_from_js_one_of_list_with_enum(self):
17181676
"""Test getting column type from oneOf list containing enum."""
17191677
# GIVEN a oneOf list with an enum
@@ -1722,8 +1680,8 @@ def test_get_column_type_from_js_one_of_list_with_enum(self):
17221680
# WHEN I get the column type
17231681
result = _get_column_type_from_js_one_of_list(js_one_of_list)
17241682

1725-
# THEN it should return STRING type
1726-
assert result == ColumnType.STRING
1683+
# THEN it should return MEDIUMTEXT type
1684+
assert result == ColumnType.MEDIUMTEXT
17271685

17281686
def test_get_column_type_from_js_one_of_list_single_type(self):
17291687
"""Test getting column type from oneOf list with single non-null type."""
@@ -1758,8 +1716,8 @@ def test_get_column_type_from_js_one_of_list_fallback(self):
17581716
# WHEN I get the column type
17591717
result = _get_column_type_from_js_one_of_list(js_one_of_list)
17601718

1761-
# THEN it should return STRING type as fallback
1762-
assert result == ColumnType.STRING
1719+
# THEN it should return MEDIUMTEXT type as fallback
1720+
assert result == ColumnType.MEDIUMTEXT
17631721

17641722
def test_get_list_column_type_from_js_property_with_enum(self):
17651723
"""Test getting list column type for property with enum items."""
@@ -1795,6 +1753,157 @@ def test_get_list_column_type_from_js_property_fallback(self):
17951753
assert result == ColumnType.STRING_LIST
17961754

17971755

1756+
@pytest.mark.parametrize(
1757+
"json_schema, expected_columns",
1758+
[
1759+
(
1760+
{
1761+
"properties": {
1762+
"string_col": {"type": "string"},
1763+
}
1764+
},
1765+
[
1766+
Column(name="string_col", column_type=ColumnType.MEDIUMTEXT),
1767+
],
1768+
),
1769+
(
1770+
{
1771+
"properties": {
1772+
"string_col": {"type": "string"},
1773+
"int_col": {"type": "integer"},
1774+
"bool_col": {"type": "boolean"},
1775+
"number_col": {"type": "number"},
1776+
}
1777+
},
1778+
[
1779+
Column(name="string_col", column_type=ColumnType.MEDIUMTEXT),
1780+
Column(name="int_col", column_type=ColumnType.INTEGER),
1781+
Column(name="bool_col", column_type=ColumnType.BOOLEAN),
1782+
Column(name="number_col", column_type=ColumnType.DOUBLE),
1783+
],
1784+
),
1785+
],
1786+
ids=["one column", "three columns"],
1787+
)
1788+
def test_create_columns_from_json_schema(
1789+
json_schema: dict[str, Any], expected_columns: list[Column]
1790+
):
1791+
"""Test successful column creation from JSON schema."""
1792+
assert _create_columns_from_json_schema(json_schema) == expected_columns
1793+
1794+
1795+
@pytest.mark.parametrize(
1796+
"json_schema",
1797+
[{}, {"properties": []}],
1798+
ids=["empty schema", "properties is not a dict"],
1799+
)
1800+
def test_create_columns_from_json_schema_exceptions(json_schema: dict[str, Any]):
1801+
"""Test exceptions when creating columns from invalid JSON schema."""
1802+
with pytest.raises(ValueError):
1803+
_create_columns_from_json_schema(json_schema)
1804+
1805+
1806+
@pytest.mark.parametrize(
1807+
"json_schema_property, property_name, expected_column_type",
1808+
[
1809+
(
1810+
{"type": "array", "items": {"type": "string"}},
1811+
"string_list_col",
1812+
ColumnType.STRING_LIST,
1813+
),
1814+
(
1815+
{"type": "array", "items": {"type": "integer"}},
1816+
"int_list_col",
1817+
ColumnType.INTEGER_LIST,
1818+
),
1819+
(
1820+
{"type": "array", "items": {"type": "boolean"}},
1821+
"bool_list_col",
1822+
ColumnType.BOOLEAN_LIST,
1823+
),
1824+
(
1825+
{"type": "number"},
1826+
"number_col",
1827+
ColumnType.DOUBLE,
1828+
),
1829+
(
1830+
{"type": "integer"},
1831+
"integer_col",
1832+
ColumnType.INTEGER,
1833+
),
1834+
(
1835+
{"type": "boolean"},
1836+
"boolean_col",
1837+
ColumnType.BOOLEAN,
1838+
),
1839+
(
1840+
{"type": "string"},
1841+
"string_col",
1842+
ColumnType.MEDIUMTEXT,
1843+
),
1844+
],
1845+
ids=[
1846+
"string_list",
1847+
"integer_list",
1848+
"boolean_list",
1849+
"number",
1850+
"integer",
1851+
"boolean",
1852+
"string",
1853+
],
1854+
)
1855+
def test_create_synapse_column_from_js_property(
1856+
json_schema_property: dict[str, Any],
1857+
property_name: str,
1858+
expected_column_type: ColumnType,
1859+
):
1860+
"""Test successful column creation from JSON schema property."""
1861+
result = _create_synapse_column_from_js_property(
1862+
json_schema_property, property_name
1863+
)
1864+
assert isinstance(result, Column)
1865+
assert result.name == property_name
1866+
assert result.column_type == expected_column_type
1867+
1868+
1869+
@pytest.mark.parametrize(
1870+
"json_schema_property, expected_column_type",
1871+
[
1872+
({"enum": ["a", "b", "c"]}, ColumnType.MEDIUMTEXT),
1873+
({"type": "string"}, ColumnType.MEDIUMTEXT),
1874+
({"type": "integer"}, ColumnType.INTEGER),
1875+
({"type": "number"}, ColumnType.DOUBLE),
1876+
({"type": "boolean"}, ColumnType.BOOLEAN),
1877+
({"type": ["integer", "null"]}, ColumnType.INTEGER),
1878+
({"type": ["integer", "string"]}, ColumnType.MEDIUMTEXT),
1879+
({"type": "array", "items": {"type": "integer"}}, ColumnType.INTEGER_LIST),
1880+
({"oneOf": [{"type": "integer"}, {"type": "null"}]}, ColumnType.INTEGER),
1881+
({"type": "unknown"}, ColumnType.MEDIUMTEXT),
1882+
({}, ColumnType.MEDIUMTEXT),
1883+
],
1884+
ids=[
1885+
"enum_property",
1886+
"type_string",
1887+
"type_integer",
1888+
"type_number",
1889+
"type_boolean",
1890+
"type_list_nullable",
1891+
"type_list_multiple_types",
1892+
"type_array",
1893+
"one_of_list",
1894+
"unknown_type",
1895+
"empty_property",
1896+
],
1897+
)
1898+
def test_get_column_type_from_js_property(
1899+
json_schema_property: dict[str, Any], expected_column_type: ColumnType
1900+
):
1901+
"""Test getting column type from JSON schema property."""
1902+
assert (
1903+
_get_column_type_from_js_property(json_schema_property) == expected_column_type
1904+
)
1905+
1906+
17981907
class TestGetLatestSchemaUri(unittest.TestCase):
17991908
"""Test cases for get_latest_schema_uri function."""
18001909

0 commit comments

Comments
 (0)