From 93fae555f2350bd6f97a3e3880630987b266bf1f Mon Sep 17 00:00:00 2001 From: danlu1 Date: Sun, 5 Apr 2026 22:52:45 -0700 Subject: [PATCH 01/26] reformatting --- .../models/mixins/storable_container.py | 127 +++++++- .../protocols/storable_container_protocol.py | 13 +- synapseutils/sync.py | 7 + .../models/async/unit_test_folder_async.py | 299 +++++++++++++++++- .../unit_test_synapseutils_sync.py | 18 ++ 5 files changed, 456 insertions(+), 8 deletions(-) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index 25432a6b9..fde6c2a30 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -1,6 +1,8 @@ """Mixin for objects that can have Folders and Files stored in them.""" import asyncio +import csv +import io import os from typing import ( TYPE_CHECKING, @@ -64,6 +66,100 @@ ) +MANIFEST_CSV_FILENAME = "manifest.csv" +DEFAULT_GENERATED_MANIFEST_CSV_KEYS = [ + "path", + "parentId", + "name", + "ID", + "synapseStore", + "contentType", + "used", + "executed", + "activityName", + "activityDescription", +] + + +def _manifest_csv_filename(path: str) -> str: + return os.path.join(os.path.expanduser(path), MANIFEST_CSV_FILENAME) + + +def _get_entity_provenance_dict_for_manifest(entity: "File") -> dict: + if not entity.activity: + return {} + used = [a.format_for_manifest() for a in entity.activity.used] + executed = [a.format_for_manifest() for a in entity.activity.executed] + return { + "used": ";".join(used), + "executed": ";".join(executed), + "activityName": entity.activity.name or "", + "activityDescription": entity.activity.description or "", + } + + +def _extract_entity_metadata_for_manifest_csv( + all_files: List["File"], +) -> Tuple[List[str], List[Dict]]: + keys = list(DEFAULT_GENERATED_MANIFEST_CSV_KEYS) + annotation_keys: set = set() + data = [] + for entity in all_files: + row: Dict = { + "path": entity.path, + "parentId": entity.parent_id, + "name": entity.name, + "ID": entity.id, + "synapseStore": entity.synapse_store, + "contentType": entity.content_type, + } + if entity.annotations: + annotation_keys.update(entity.annotations.keys()) + row.update( + { + key: (val if len(val) > 0 else "") + for key, val in entity.annotations.items() + } + ) + row.update(_get_entity_provenance_dict_for_manifest(entity=entity)) + data.append(row) + keys.extend(annotation_keys) + return keys, data + + +def _write_manifest_data_csv(filename: str, keys: List[str], data: List[Dict]) -> None: + from synapseutils.sync import _convert_manifest_data_row_to_dict + + with io.open(filename, "w", encoding="utf8", newline="") as fp: + writer = csv.DictWriter( + fp, + keys, + restval="", + extrasaction="ignore", + quoting=csv.QUOTE_MINIMAL, + ) + writer.writeheader() + for row in data: + writer.writerow(_convert_manifest_data_row_to_dict(row, keys)) + + +def generate_manifest_csv(all_files: List["File"], path: str) -> None: + """Generates a manifest.csv file based on a list of File entities. + + The generated file uses CSV format with comma delimiter and is interoperable + with the Synapse UI download cart. Column names follow the new convention: + `parentId` (instead of `parent`) and `ID` (instead of `id`). + + Arguments: + all_files: A list of File model objects. + path: The directory path where manifest.csv will be written. + """ + if path and all_files: + filename = _manifest_csv_filename(path=path) + keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files) + _write_manifest_data_csv(filename, keys, data) + + @async_to_sync class StorableContainer(StorableContainerSynchronousProtocol): """ @@ -159,6 +255,7 @@ async def sync_from_synapse_async( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, + manifest: str = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -170,9 +267,10 @@ async def sync_from_synapse_async( If you only want to retrieve the full tree of metadata about your container specify `download_file` as False. - This works similar to [synapseutils.syncFromSynapse][], however, this does not - currently support the writing of data to a manifest TSV file. This will be a - future enhancement. + This works similar to [synapseutils.syncFromSynapse][], and generates a + `manifest.csv` file in each synced directory. The manifest uses CSV format + with `parentId` and `ID` columns, interoperable with the Synapse UI download + cart and `synapse get-download-list` CLI output. Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets, DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The @@ -208,6 +306,11 @@ async def sync_from_synapse_async( `["folder", "file", "table", "entityview", "dockerrepo", "submissionview", "dataset", "datasetcollection", "materializedview", "virtualtable"]`. + manifest: Determines whether to generate a manifest CSV file. Options are: + + - `all` (default): generate `manifest.csv` in every synced directory + - `root`: generate `manifest.csv` only in the root `path` directory + - `suppress`: do not generate any manifest file synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor. @@ -397,6 +500,7 @@ async def my_function(): link_hops=link_hops, queue=queue, include_types=include_types, + manifest=manifest, synapse_client=syn, ) @@ -412,6 +516,7 @@ async def _sync_from_synapse_async( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, + manifest: str = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -494,6 +599,22 @@ async def _sync_from_synapse_async( for task in worker_tasks: task.cancel() + if create_workers and path and manifest != "suppress": + if manifest == "all": + for ( + directory_path, + file_entities, + ) in self.map_directory_to_all_contained_files(root_path=path).items(): + generate_manifest_csv( + all_files=file_entities, + path=directory_path, + ) + elif manifest == "root": + generate_manifest_csv( + all_files=self.flatten_file_list(), + path=path, + ) + return self def flatten_file_list(self) -> List["File"]: diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py index 0352132d1..5391f7c2b 100644 --- a/synapseclient/models/protocols/storable_container_protocol.py +++ b/synapseclient/models/protocols/storable_container_protocol.py @@ -29,6 +29,7 @@ def sync_from_synapse( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, + manifest: str = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -40,9 +41,10 @@ def sync_from_synapse( If you only want to retrieve the full tree of metadata about your container specify `download_file` as False. - This works similar to [synapseutils.syncFromSynapse][], however, this does not - currently support the writing of data to a manifest TSV file. This will be a - future enhancement. + This works similar to [synapseutils.syncFromSynapse][], and generates a + `manifest.csv` file in each synced directory. The manifest uses CSV format + with `parentId` and `ID` columns, interoperable with the Synapse UI download + cart and `synapse get-download-list` CLI output. Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets, DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The @@ -74,6 +76,11 @@ def sync_from_synapse( include_types: Must be a list of entity types (ie. ["folder","file"]) which can be found [here](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/EntityType.html) + manifest: Determines whether to generate a manifest CSV file. Options are: + + - `all` (default): generate `manifest.csv` in every synced directory + - `root`: generate `manifest.csv` only in the root `path` directory + - `suppress`: do not generate any manifest file synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor. diff --git a/synapseutils/sync.py b/synapseutils/sync.py index 6a6c21631..a7c7764b1 100644 --- a/synapseutils/sync.py +++ b/synapseutils/sync.py @@ -78,6 +78,13 @@ COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)") +@deprecated( + version="4.11.0", + reason=( + "To be removed in 5.0.0. Use StorableContainer.sync_from_synapse instead, " + "which generates a manifest.csv file interoperable with the Synapse UI download cart." + ), +) def syncFromSynapse( syn: Synapse, entity: Union[str, SynapseFile, SynapseProject, SynapseFolder], diff --git a/tests/unit/synapseclient/models/async/unit_test_folder_async.py b/tests/unit/synapseclient/models/async/unit_test_folder_async.py index d096d5a07..d8f9596a5 100644 --- a/tests/unit/synapseclient/models/async/unit_test_folder_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_folder_async.py @@ -1,7 +1,11 @@ """Tests for the Folder class.""" +import csv +import io +import os +import tempfile import uuid from typing import Dict -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -10,7 +14,12 @@ from synapseclient.core.constants import concrete_types from synapseclient.core.constants.concrete_types import FILE_ENTITY from synapseclient.core.exceptions import SynapseNotFoundError -from synapseclient.models import FailureStrategy, File, Folder +from synapseclient.models import Activity, FailureStrategy, File, Folder +from synapseclient.models.mixins.storable_container import ( + _extract_entity_metadata_for_manifest_csv, + _write_manifest_data_csv, + generate_manifest_csv, +) SYN_123 = "syn123" SYN_456 = "syn456" @@ -785,3 +794,289 @@ async def mock_get_children(*args, **kwargs): assert result.modified_by == MODIFIED_BY assert result.files[0].id == SYN_456 assert result.files[0].name == "example_file_1" + + async def test_sync_from_synapse_manifest_all_generates_per_directory( + self, + ) -> None: + # GIVEN a Folder object with a path + folder = Folder(id=SYN_123) + children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}] + + async def mock_get_children(*args, **kwargs): + for child in children: + yield child + + downloaded_file = File( + id=SYN_456, + name="example_file_1", + path="/tmp/mydir/example_file_1.txt", + parent_id=SYN_123, + ) + + # WHEN I call sync_from_synapse with manifest="all" and a path + with patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value=self.get_example_rest_api_folder_output(), + ), patch( + "synapseclient.models.file.File.get_async", + return_value=downloaded_file, + ), patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate: + await folder.sync_from_synapse_async( + path="/tmp/mydir", manifest="all", synapse_client=self.syn + ) + + # THEN generate_manifest_csv should be called for each directory + assert mock_generate.call_count >= 1 + call_paths = [c.kwargs["path"] for c in mock_generate.call_args_list] + assert "/tmp/mydir" in call_paths + + async def test_sync_from_synapse_manifest_root_generates_only_at_root( + self, + ) -> None: + # GIVEN a Folder object with a path + folder = Folder(id=SYN_123) + children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}] + + async def mock_get_children(*args, **kwargs): + for child in children: + yield child + + downloaded_file = File( + id=SYN_456, + name="example_file_1", + path="/tmp/mydir/example_file_1.txt", + parent_id=SYN_123, + ) + + # WHEN I call sync_from_synapse with manifest="root" and a path + with patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value=self.get_example_rest_api_folder_output(), + ), patch( + "synapseclient.models.file.File.get_async", + return_value=downloaded_file, + ), patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate: + await folder.sync_from_synapse_async( + path="/tmp/mydir", manifest="root", synapse_client=self.syn + ) + + # THEN generate_manifest_csv should be called exactly once with the root path + mock_generate.assert_called_once() + assert mock_generate.call_args.kwargs["path"] == "/tmp/mydir" + + async def test_sync_from_synapse_manifest_suppress_skips_generation( + self, + ) -> None: + # GIVEN a Folder object with a path + folder = Folder(id=SYN_123) + children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}] + + async def mock_get_children(*args, **kwargs): + for child in children: + yield child + + # WHEN I call sync_from_synapse with manifest="suppress" + with patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value=self.get_example_rest_api_folder_output(), + ), patch( + "synapseclient.models.file.File.get_async", + return_value=(File(id=SYN_456, name="example_file_1")), + ), patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate: + await folder.sync_from_synapse_async( + path="/tmp/mydir", manifest="suppress", synapse_client=self.syn + ) + + # THEN generate_manifest_csv should never be called + mock_generate.assert_not_called() + + async def test_sync_from_synapse_no_manifest_without_path(self) -> None: + # GIVEN a Folder with no path specified + folder = Folder(id=SYN_123) + children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}] + + async def mock_get_children(*args, **kwargs): + for child in children: + yield child + + # WHEN I call sync_from_synapse with no path (default manifest="all") + with patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value=self.get_example_rest_api_folder_output(), + ), patch( + "synapseclient.models.file.File.get_async", + return_value=(File(id=SYN_456, name="example_file_1")), + ), patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate: + await folder.sync_from_synapse_async(synapse_client=self.syn) + + # THEN generate_manifest_csv should not be called (no path to write to) + mock_generate.assert_not_called() + + +class TestGenerateManifestCsv: + """Tests for the generate_manifest_csv and related helper functions.""" + + def _make_file( + self, + syn_id: str = "syn123", + name: str = "file.txt", + path: str = "/data/file.txt", + parent_id: str = "syn456", + content_type: str = "text/plain", + synapse_store: bool = True, + annotations: dict = None, + activity: Activity = None, + ) -> File: + f = File( + id=syn_id, + name=name, + path=path, + parent_id=parent_id, + content_type=content_type, + synapse_store=synapse_store, + ) + if annotations: + f.annotations = annotations + if activity: + f.activity = activity + return f + + def test_extract_entity_metadata_uses_parentId_and_ID_columns(self) -> None: + # GIVEN a File entity + f = self._make_file() + + # WHEN metadata is extracted + keys, data = _extract_entity_metadata_for_manifest_csv([f]) + + # THEN column names use new convention + assert "parentId" in keys + assert "ID" in keys + assert "parent" not in keys + assert "id" not in keys + + # AND values are correct + assert data[0]["parentId"] == "syn456" + assert data[0]["ID"] == "syn123" + assert data[0]["path"] == "/data/file.txt" + assert data[0]["name"] == "file.txt" + + def test_extract_entity_metadata_includes_annotations(self) -> None: + # GIVEN a File entity with annotations + f = self._make_file(annotations={"tissue": ["brain"], "count": [42]}) + + # WHEN metadata is extracted + keys, data = _extract_entity_metadata_for_manifest_csv([f]) + + # THEN annotation keys are included + assert "tissue" in keys + assert "count" in keys + assert data[0]["tissue"] == ["brain"] + + def test_write_manifest_data_csv_produces_comma_separated_output(self) -> None: + # GIVEN some data with a value that contains a comma + keys = ["path", "parentId", "name", "ID"] + data = [ + { + "path": "/data/file.txt", + "parentId": "syn456", + "name": "a, b, c", + "ID": "syn123", + } + ] + + # WHEN written to a CSV buffer + buf = io.StringIO() + with patch("builtins.open", side_effect=lambda *a, **kw: buf): + pass # we call directly below + + output = io.StringIO() + writer = csv.DictWriter( + output, + keys, + restval="", + extrasaction="ignore", + quoting=csv.QUOTE_MINIMAL, + ) + writer.writeheader() + writer.writerow(data[0]) + content = output.getvalue() + + # THEN the header uses commas + assert "path,parentId,name,ID" in content + # AND the value with commas is quoted + assert '"a, b, c"' in content + + def test_generate_manifest_csv_creates_file(self) -> None: + # GIVEN a list of File entities and a temp directory + f = self._make_file( + syn_id="syn123", + name="data.csv", + path="/tmp/data.csv", + parent_id="syn456", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + # WHEN generate_manifest_csv is called + generate_manifest_csv(all_files=[f], path=tmpdir) + + # THEN manifest.csv is created in the directory + manifest_path = os.path.join(tmpdir, "manifest.csv") + assert os.path.exists(manifest_path) + + # AND it has the expected columns with new naming convention + with open(manifest_path, newline="", encoding="utf8") as fp: + reader = csv.DictReader(fp) + headers = reader.fieldnames + assert "parentId" in headers + assert "ID" in headers + assert "parent" not in headers + assert "id" not in headers + + row = next(reader) + assert row["parentId"] == "syn456" + assert row["ID"] == "syn123" + + def test_generate_manifest_csv_skips_when_no_files(self) -> None: + # GIVEN an empty file list + with tempfile.TemporaryDirectory() as tmpdir: + # WHEN generate_manifest_csv is called with no files + generate_manifest_csv(all_files=[], path=tmpdir) + + # THEN no manifest.csv is created + assert not os.path.exists(os.path.join(tmpdir, "manifest.csv")) + + def test_generate_manifest_csv_quotes_values_with_commas(self) -> None: + # GIVEN a File whose name contains a comma + f = self._make_file(name="file, extra.txt", path="/tmp/file, extra.txt") + + with tempfile.TemporaryDirectory() as tmpdir: + generate_manifest_csv(all_files=[f], path=tmpdir) + manifest_path = os.path.join(tmpdir, "manifest.csv") + content = open(manifest_path, encoding="utf8").read() + + # THEN the comma-containing value is quoted in the CSV + assert '"file, extra.txt"' in content diff --git a/tests/unit/synapseutils/unit_test_synapseutils_sync.py b/tests/unit/synapseutils/unit_test_synapseutils_sync.py index 062984be0..0a200da26 100644 --- a/tests/unit/synapseutils/unit_test_synapseutils_sync.py +++ b/tests/unit/synapseutils/unit_test_synapseutils_sync.py @@ -1861,3 +1861,21 @@ def test_multiple_item(self) -> None: "baz", '"foo, bar, baz"', ] + + +class TestSyncFromSynapseDeprecation: + """Tests for the deprecation of syncFromSynapse.""" + + def test_syncFromSynapse_emits_deprecation_warning(self, syn: Synapse) -> None: + # GIVEN the legacy syncFromSynapse function + # WHEN it is called + # THEN a DeprecationWarning is raised pointing to StorableContainer + with pytest.warns( + DeprecationWarning, match="StorableContainer.sync_from_synapse" + ): + with patch.object( + sync, + "syncFromSynapse_async", + return_value=AsyncMock(return_value=[])(), + ): + sync.syncFromSynapse(syn=syn, entity="syn123") From 3bf190825fd27a1b301dda43d2514014b4c21ccc Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 9 Apr 2026 10:51:45 -0700 Subject: [PATCH 02/26] remove unwanted import --- .../models/mixins/storable_container.py | 114 ++-------- synapseclient/models/services/manifest.py | 134 +++++++++++ .../models/async/test_folder_async.py | 215 ++++++++++++++++++ .../models/async/unit_test_folder_async.py | 2 +- 4 files changed, 369 insertions(+), 96 deletions(-) create mode 100644 synapseclient/models/services/manifest.py diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index fde6c2a30..6f0458637 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -1,8 +1,6 @@ """Mixin for objects that can have Folders and Files stored in them.""" import asyncio -import csv -import io import os from typing import ( TYPE_CHECKING, @@ -65,99 +63,7 @@ VirtualTable, ) - -MANIFEST_CSV_FILENAME = "manifest.csv" -DEFAULT_GENERATED_MANIFEST_CSV_KEYS = [ - "path", - "parentId", - "name", - "ID", - "synapseStore", - "contentType", - "used", - "executed", - "activityName", - "activityDescription", -] - - -def _manifest_csv_filename(path: str) -> str: - return os.path.join(os.path.expanduser(path), MANIFEST_CSV_FILENAME) - - -def _get_entity_provenance_dict_for_manifest(entity: "File") -> dict: - if not entity.activity: - return {} - used = [a.format_for_manifest() for a in entity.activity.used] - executed = [a.format_for_manifest() for a in entity.activity.executed] - return { - "used": ";".join(used), - "executed": ";".join(executed), - "activityName": entity.activity.name or "", - "activityDescription": entity.activity.description or "", - } - - -def _extract_entity_metadata_for_manifest_csv( - all_files: List["File"], -) -> Tuple[List[str], List[Dict]]: - keys = list(DEFAULT_GENERATED_MANIFEST_CSV_KEYS) - annotation_keys: set = set() - data = [] - for entity in all_files: - row: Dict = { - "path": entity.path, - "parentId": entity.parent_id, - "name": entity.name, - "ID": entity.id, - "synapseStore": entity.synapse_store, - "contentType": entity.content_type, - } - if entity.annotations: - annotation_keys.update(entity.annotations.keys()) - row.update( - { - key: (val if len(val) > 0 else "") - for key, val in entity.annotations.items() - } - ) - row.update(_get_entity_provenance_dict_for_manifest(entity=entity)) - data.append(row) - keys.extend(annotation_keys) - return keys, data - - -def _write_manifest_data_csv(filename: str, keys: List[str], data: List[Dict]) -> None: - from synapseutils.sync import _convert_manifest_data_row_to_dict - - with io.open(filename, "w", encoding="utf8", newline="") as fp: - writer = csv.DictWriter( - fp, - keys, - restval="", - extrasaction="ignore", - quoting=csv.QUOTE_MINIMAL, - ) - writer.writeheader() - for row in data: - writer.writerow(_convert_manifest_data_row_to_dict(row, keys)) - - -def generate_manifest_csv(all_files: List["File"], path: str) -> None: - """Generates a manifest.csv file based on a list of File entities. - - The generated file uses CSV format with comma delimiter and is interoperable - with the Synapse UI download cart. Column names follow the new convention: - `parentId` (instead of `parent`) and `ID` (instead of `id`). - - Arguments: - all_files: A list of File model objects. - path: The directory path where manifest.csv will be written. - """ - if path and all_files: - filename = _manifest_csv_filename(path=path) - keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files) - _write_manifest_data_csv(filename, keys, data) +from synapseclient.models.services.manifest import generate_manifest_csv @async_to_sync @@ -478,6 +384,18 @@ async def my_function(): end end + opt manifest != "suppress" and path is set + alt manifest == "all" + loop For each directory path + sync_from_synapse->>manifest: call `generate_manifest_csv(files, dir_path)` + manifest-->>sync_from_synapse: manifest.csv written to dir_path + end + else manifest == "root" + sync_from_synapse->>manifest: call `generate_manifest_csv(all_files, root_path)` + manifest-->>sync_from_synapse: manifest.csv written to root_path + end + end + deactivate sync_from_synapse deactivate project_or_folder ``` @@ -531,6 +449,12 @@ async def _sync_from_synapse_async( syn.logger.info( f"[{self.id}:{self.name}]: Syncing {self.__class__.__name__} from Synapse." ) + + if manifest not in ("all", "root", "suppress"): + syn.logger.error( + f"[{self.id}:{self.name}]: Invalid manifest value: {manifest}" + ) + path = os.path.expanduser(path) if path else None children = await self._retrieve_children( diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py new file mode 100644 index 000000000..ca7f921b4 --- /dev/null +++ b/synapseclient/models/services/manifest.py @@ -0,0 +1,134 @@ +"""Functions for generating manifest CSV files from File entities.""" + +import csv +import io +import os +from typing import TYPE_CHECKING, Dict, List, Tuple + +if TYPE_CHECKING: + from synapseclient.models import File + +MANIFEST_CSV_FILENAME = "manifest.csv" +DEFAULT_GENERATED_MANIFEST_CSV_KEYS = [ + "path", + "parentId", + "name", + "ID", + "synapseStore", + "contentType", + "used", + "executed", + "activityName", + "activityDescription", +] + + +def _manifest_csv_filename(path: str) -> str: + return os.path.join(os.path.expanduser(path), MANIFEST_CSV_FILENAME) + + +def _get_entity_provenance_dict_for_manifest(entity: "File") -> Dict[str, str]: + """ + Get the provenance metadata for the entity. + Arguments: + entity: Entity object + + Returns: + Dict[str, str]: a dictionary with a subset of the provenance metadata for the entity. + An empty dictionary is returned if the metadata does not have a provenance record. + """ + if not entity.activity: + return {} + used = [a.format_for_manifest() for a in entity.activity.used] + executed = [a.format_for_manifest() for a in entity.activity.executed] + return { + "used": ";".join(used), + "executed": ";".join(executed), + "activityName": entity.activity.name or "", + "activityDescription": entity.activity.description or "", + } + + +def _extract_entity_metadata_for_manifest_csv( + all_files: List["File"], +) -> Tuple[List[str], List[Dict]]: + """ + Extracts metadata from the list of File Entities and returns them in a form + usable by csv.DictWriter + + Arguments: + all_files: an iterable that provides File entities + + Returns: + keys: a list column headers + data: a list of dicts containing data from each row + """ + from synapseutils.sync import _convert_manifest_data_items_to_string_list + + keys = list(DEFAULT_GENERATED_MANIFEST_CSV_KEYS) + annotation_keys: set = set() + data = [] + for entity in all_files: + row: Dict = { + "path": entity.path, + "parentId": entity.parent_id, + "name": entity.name, + "ID": entity.id, + "synapseStore": entity.synapse_store, + "contentType": entity.content_type, + } + if entity.annotations: + for key, val in entity.annotations.items(): + annotation_keys.add(key) + row[key] = ( + _convert_manifest_data_items_to_string_list(val) + if isinstance(val, list) + else val + ) + row.update(_get_entity_provenance_dict_for_manifest(entity=entity)) + data.append(row) + keys.extend(annotation_keys) + return keys, data + + +def _write_manifest_data_csv(filename: str, keys: List[str], data: List[Dict]) -> None: + """ + Write the manifest data to a CSV file. + Arguments: + filename: The name of the file to write to. + keys: The keys of the manifest. + data: The data to write to the manifest. This should be a list of dicts where + each dict represents a row of data. + Returns: + None + """ + from synapseutils.sync import _convert_manifest_data_row_to_dict + + with io.open(filename, "w", encoding="utf8", newline="") as fp: + writer = csv.DictWriter( + fp, + fieldnames=keys, + restval="", + extrasaction="ignore", + quoting=csv.QUOTE_NONE, + ) + writer.writeheader() + for row in data: + writer.writerow(_convert_manifest_data_row_to_dict(row, keys)) + + +def generate_manifest_csv(all_files: List["File"], path: str) -> None: + """Generates a manifest.csv file based on a list of File entities. + + The generated file uses CSV format with comma delimiter and is interoperable + with the Synapse UI download cart. Column names follow the new convention: + `parentId` (instead of `parent`) and `ID` (instead of `id`). + + Arguments: + all_files: A list of File model objects. + path: The directory path where manifest.csv will be written. + """ + if path and all_files: + filename = _manifest_csv_filename(path=path) + keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files) + _write_manifest_data_csv(filename, keys, data) diff --git a/tests/integration/synapseclient/models/async/test_folder_async.py b/tests/integration/synapseclient/models/async/test_folder_async.py index fc6075594..5399e5441 100644 --- a/tests/integration/synapseclient/models/async/test_folder_async.py +++ b/tests/integration/synapseclient/models/async/test_folder_async.py @@ -1,6 +1,8 @@ """Integration tests for the synapseclient.models.Folder class.""" +import csv import os +import tempfile import uuid from typing import Callable, List @@ -10,6 +12,7 @@ from synapseclient.core import utils from synapseclient.core.exceptions import SynapseHTTPError from synapseclient.models import ( + Activity, Column, ColumnType, Dataset, @@ -25,6 +28,7 @@ ViewTypeMask, VirtualTable, ) +from synapseclient.models.activity import UsedURL DESCRIPTION_FOLDER = "This is an example folder." DESCRIPTION_FILE = "This is an example file." @@ -811,3 +815,214 @@ async def test_walk_async_recursive_false(self, project_model: Project) -> None: assert hasattr(nondirs[0], "name") assert hasattr(nondirs[0], "id") assert hasattr(nondirs[0], "type") + + +class TestFolderManifestCSV: + """Integration tests for manifest CSV generation during sync_from_synapse_async.""" + + BOGUS_URL = "https://example.com" + + @pytest.fixture(autouse=True, scope="function") + def init(self, syn: Synapse, schedule_for_cleanup: Callable[..., None]) -> None: + self.syn = syn + self.schedule_for_cleanup = schedule_for_cleanup + + def create_file_instance(self) -> File: + filename = utils.make_bogus_uuid_file() + self.schedule_for_cleanup(filename) + return File( + path=filename, + content_type="text/plain", + ) + + async def test_manifest_all_creates_csv_per_directory( + self, project_model: Project + ) -> None: + # GIVEN a root folder with a file and a nested subfolder with its own file + root_folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + root_folder = await root_folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(root_folder.id) + + root_file = self.create_file_instance() + root_file.parent_id = root_folder.id + root_file = await root_file.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(root_file.id) + + sub_folder = Folder(name=str(uuid.uuid4()), parent_id=root_folder.id) + sub_folder = await sub_folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(sub_folder.id) + + sub_file = self.create_file_instance() + sub_file.parent_id = sub_folder.id + sub_file = await sub_file.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(sub_file.id) + + # WHEN I sync the root folder with manifest="all" + with tempfile.TemporaryDirectory() as tmpdir: + await root_folder.sync_from_synapse_async( + path=tmpdir, + manifest="all", + synapse_client=self.syn, + ) + + root_manifest = os.path.join(tmpdir, "manifest.csv") + sub_manifest = os.path.join(tmpdir, sub_folder.name, "manifest.csv") + + # THEN manifest.csv exists in the root directory + assert os.path.isfile(root_manifest) + + # AND manifest.csv exists in the subfolder directory + assert os.path.isfile(sub_manifest) + + # AND the root manifest contains the expected columns and the root file row + with open(root_manifest, newline="", encoding="utf8") as f: + reader = csv.DictReader(f) + rows = list(reader) + + assert len(rows) == 1 + row = rows[0] + assert row["name"] == root_file.name + assert row["parentId"] == root_folder.id + assert row["ID"] == root_file.id + + async def test_manifest_root_creates_csv_only_at_root( + self, project_model: Project + ) -> None: + # GIVEN a root folder with a file and a nested subfolder with its own file + root_folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + root_folder = await root_folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(root_folder.id) + + root_file = self.create_file_instance() + root_file.parent_id = root_folder.id + root_file = await root_file.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(root_file.id) + + sub_folder = Folder(name=str(uuid.uuid4()), parent_id=root_folder.id) + sub_folder = await sub_folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(sub_folder.id) + + sub_file = self.create_file_instance() + sub_file.parent_id = sub_folder.id + sub_file = await sub_file.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(sub_file.id) + + # WHEN I sync with manifest="root" + with tempfile.TemporaryDirectory() as tmpdir: + await root_folder.sync_from_synapse_async( + path=tmpdir, + manifest="root", + synapse_client=self.syn, + ) + + root_manifest = os.path.join(tmpdir, "manifest.csv") + sub_manifest = os.path.join(tmpdir, sub_folder.name, "manifest.csv") + + # THEN manifest.csv exists only at the root + assert os.path.isfile(root_manifest) + assert not os.path.isfile(sub_manifest) + + async def test_manifest_suppress_creates_no_csv( + self, project_model: Project + ) -> None: + # GIVEN a folder with a file + folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + folder = await folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(folder.id) + + f = self.create_file_instance() + f.parent_id = folder.id + f = await f.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(f.id) + + # WHEN I sync with manifest="suppress" + with tempfile.TemporaryDirectory() as tmpdir: + await folder.sync_from_synapse_async( + path=tmpdir, + manifest="suppress", + synapse_client=self.syn, + ) + + # THEN no manifest.csv is created + assert not os.path.isfile(os.path.join(tmpdir, "manifest.csv")) + + async def test_manifest_includes_annotations(self, project_model: Project) -> None: + # GIVEN a file with mixed-type annotations + folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + folder = await folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(folder.id) + + f = self.create_file_instance() + f.parent_id = folder.id + f.annotations = { + "single_str": ["hello"], + "multi_str": ["a", "b", "c"], + "str_with_comma": ["hello,world"], + "booleans": [True, False], + "integers": [1, 2, 3], + } + f = await f.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(f.id) + + # WHEN I sync with manifest generation + with tempfile.TemporaryDirectory() as tmpdir: + await folder.sync_from_synapse_async( + path=tmpdir, + manifest="root", + synapse_client=self.syn, + ) + + manifest_path = os.path.join(tmpdir, "manifest.csv") + assert os.path.isfile(manifest_path) + + with open(manifest_path, newline="", encoding="utf8") as mf: + reader = csv.DictReader(mf) + rows = list(reader) + + # THEN annotation columns are present and correctly serialized + assert len(rows) == 1 + row = rows[0] + assert row["single_str"] == "hello" + assert row["multi_str"] == "[a,b,c]" + assert row["str_with_comma"] == "hello,world" + assert row["booleans"] == "[True,False]" + assert row["integers"] == "[1,2,3]" + + async def test_manifest_includes_provenance(self, project_model: Project) -> None: + # GIVEN a file with activity (provenance) + folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + folder = await folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(folder.id) + + f = self.create_file_instance() + f.parent_id = folder.id + f.activity = Activity( + name="my_activity", + description="my_description", + used=[UsedURL(name="my_source", url=self.BOGUS_URL)], + ) + f = await f.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(f.id) + + # WHEN I sync with manifest generation and include_activity=True + with tempfile.TemporaryDirectory() as tmpdir: + await folder.sync_from_synapse_async( + path=tmpdir, + manifest="root", + include_activity=True, + synapse_client=self.syn, + ) + + manifest_path = os.path.join(tmpdir, "manifest.csv") + assert os.path.isfile(manifest_path) + + with open(manifest_path, newline="", encoding="utf8") as mf: + reader = csv.DictReader(mf) + rows = list(reader) + + # THEN provenance columns are populated + assert len(rows) == 1 + row = rows[0] + assert row["activityName"] == "my_activity" + assert row["activityDescription"] == "my_description" + assert row["used"] == "my_source" diff --git a/tests/unit/synapseclient/models/async/unit_test_folder_async.py b/tests/unit/synapseclient/models/async/unit_test_folder_async.py index d8f9596a5..5445f5d48 100644 --- a/tests/unit/synapseclient/models/async/unit_test_folder_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_folder_async.py @@ -15,7 +15,7 @@ from synapseclient.core.constants.concrete_types import FILE_ENTITY from synapseclient.core.exceptions import SynapseNotFoundError from synapseclient.models import Activity, FailureStrategy, File, Folder -from synapseclient.models.mixins.storable_container import ( +from synapseclient.models.services.manifest import ( _extract_entity_metadata_for_manifest_csv, _write_manifest_data_csv, generate_manifest_csv, From 6feab17f910dd357df24aea398371f514a8e5be2 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 9 Apr 2026 11:29:12 -0700 Subject: [PATCH 03/26] update unit test --- .../models/async/unit_test_folder_async.py | 94 ++++++++++--------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/tests/unit/synapseclient/models/async/unit_test_folder_async.py b/tests/unit/synapseclient/models/async/unit_test_folder_async.py index 5445f5d48..9187e105d 100644 --- a/tests/unit/synapseclient/models/async/unit_test_folder_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_folder_async.py @@ -1,5 +1,6 @@ """Tests for the Folder class.""" import csv +import datetime import io import os import tempfile @@ -972,13 +973,6 @@ def test_extract_entity_metadata_uses_parentId_and_ID_columns(self) -> None: # WHEN metadata is extracted keys, data = _extract_entity_metadata_for_manifest_csv([f]) - # THEN column names use new convention - assert "parentId" in keys - assert "ID" in keys - assert "parent" not in keys - assert "id" not in keys - - # AND values are correct assert data[0]["parentId"] == "syn456" assert data[0]["ID"] == "syn123" assert data[0]["path"] == "/data/file.txt" @@ -991,44 +985,61 @@ def test_extract_entity_metadata_includes_annotations(self) -> None: # WHEN metadata is extracted keys, data = _extract_entity_metadata_for_manifest_csv([f]) - # THEN annotation keys are included + # THEN annotation keys are included and single-item lists are serialized as scalars assert "tissue" in keys assert "count" in keys - assert data[0]["tissue"] == ["brain"] + assert data[0]["tissue"] == "brain" + assert data[0]["count"] == "42" def test_write_manifest_data_csv_produces_comma_separated_output(self) -> None: - # GIVEN some data with a value that contains a comma - keys = ["path", "parentId", "name", "ID"] - data = [ - { - "path": "/data/file.txt", - "parentId": "syn456", - "name": "a, b, c", - "ID": "syn123", - } - ] - - # WHEN written to a CSV buffer - buf = io.StringIO() - with patch("builtins.open", side_effect=lambda *a, **kw: buf): - pass # we call directly below - - output = io.StringIO() - writer = csv.DictWriter( - output, - keys, - restval="", - extrasaction="ignore", - quoting=csv.QUOTE_MINIMAL, + # GIVEN a File with a name containing a comma and mixed-type annotations + f = self._make_file( + name="a, b, c", + path="/data/file.txt", + annotations={ + "single_str": "hello", + "multi_str": ["a", "b", "c"], + "str_with_comma": ["hello,world", "plain"], + "booleans": [True, False], + "integers": [1], + "single_dt": [ + datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) + ], + "multi_dt": [ + datetime.datetime( + 2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + datetime.datetime( + 2021, 6, 15, 12, 30, 0, tzinfo=datetime.timezone.utc + ), + ], + }, ) - writer.writeheader() - writer.writerow(data[0]) - content = output.getvalue() - # THEN the header uses commas - assert "path,parentId,name,ID" in content - # AND the value with commas is quoted + with tempfile.TemporaryDirectory() as tmpdir: + # WHEN generate_manifest_csv is called + generate_manifest_csv(all_files=[f], path=tmpdir) + manifest_path = os.path.join(tmpdir, "manifest.csv") + content = open(manifest_path, encoding="utf8").read() + with open(manifest_path, newline="", encoding="utf8") as fp: + row = next(csv.DictReader(fp)) + + # THEN the name with a comma is quoted in the CSV output assert '"a, b, c"' in content + # AND single-item list is serialized as a plain scalar + assert row["single_str"] == "hello" + # AND multi-value list is serialized as a bracketed string + assert row["multi_str"] == "[a,b,c]" + # AND a value with a comma is quoted inside the brackets + assert row["str_with_comma"] == '["hello,world",plain]' + # AND booleans are serialized unquoted + assert row["booleans"] == "[True,False]" + # AND integers are serialized unquoted + assert row["integers"] == "1" + # AND a single datetime is serialized as ISO 8601 UTC + assert row["single_dt"] == "2020-01-01T00:00:00Z" + # AND multiple datetimes are serialized as a bracketed ISO 8601 list + assert row["multi_dt"] == "[2020-01-01T00:00:00Z,2021-06-15T12:30:00Z]" def test_generate_manifest_csv_creates_file(self) -> None: # GIVEN a list of File entities and a temp directory @@ -1050,12 +1061,6 @@ def test_generate_manifest_csv_creates_file(self) -> None: # AND it has the expected columns with new naming convention with open(manifest_path, newline="", encoding="utf8") as fp: reader = csv.DictReader(fp) - headers = reader.fieldnames - assert "parentId" in headers - assert "ID" in headers - assert "parent" not in headers - assert "id" not in headers - row = next(reader) assert row["parentId"] == "syn456" assert row["ID"] == "syn123" @@ -1077,6 +1082,5 @@ def test_generate_manifest_csv_quotes_values_with_commas(self) -> None: generate_manifest_csv(all_files=[f], path=tmpdir) manifest_path = os.path.join(tmpdir, "manifest.csv") content = open(manifest_path, encoding="utf8").read() - # THEN the comma-containing value is quoted in the CSV assert '"file, extra.txt"' in content From cb0a3177e67eaceac0499dcfc5613974cad69f59 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 9 Apr 2026 12:40:09 -0700 Subject: [PATCH 04/26] update quoting --- synapseclient/models/services/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index ca7f921b4..b17ea2e49 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -110,7 +110,7 @@ def _write_manifest_data_csv(filename: str, keys: List[str], data: List[Dict]) - fieldnames=keys, restval="", extrasaction="ignore", - quoting=csv.QUOTE_NONE, + quoting=csv.QUOTE_MINIMAL, ) writer.writeheader() for row in data: From fa1d02d38831a936a84ab353e2df1dba2a6151be Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 9 Apr 2026 13:05:05 -0700 Subject: [PATCH 05/26] update test case --- .../models/async/unit_test_folder_async.py | 73 ++++++++++++++++--- 1 file changed, 61 insertions(+), 12 deletions(-) diff --git a/tests/unit/synapseclient/models/async/unit_test_folder_async.py b/tests/unit/synapseclient/models/async/unit_test_folder_async.py index 9187e105d..991041b4f 100644 --- a/tests/unit/synapseclient/models/async/unit_test_folder_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_folder_async.py @@ -13,7 +13,7 @@ from synapseclient import Folder as Synapse_Folder from synapseclient import Synapse from synapseclient.core.constants import concrete_types -from synapseclient.core.constants.concrete_types import FILE_ENTITY +from synapseclient.core.constants.concrete_types import FILE_ENTITY, FOLDER_ENTITY from synapseclient.core.exceptions import SynapseNotFoundError from synapseclient.models import Activity, FailureStrategy, File, Folder from synapseclient.models.services.manifest import ( @@ -799,20 +799,61 @@ async def mock_get_children(*args, **kwargs): async def test_sync_from_synapse_manifest_all_generates_per_directory( self, ) -> None: - # GIVEN a Folder object with a path + SUB_FOLDER_ID = "syn789" + SUB_FOLDER_NAME = "sub_folder" + FILE_2_ID = "syn012" + FILE_2_NAME = "example_file_2" + + # GIVEN a root folder with one file and one subfolder containing one file folder = Folder(id=SYN_123) - children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}] + + root_children = [ + {"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}, + {"id": SUB_FOLDER_ID, "type": FOLDER_ENTITY, "name": SUB_FOLDER_NAME}, + ] + sub_children = [ + {"id": FILE_2_ID, "type": FILE_ENTITY, "name": FILE_2_NAME}, + ] + get_children_call_count = 0 async def mock_get_children(*args, **kwargs): + nonlocal get_children_call_count + children = root_children if get_children_call_count == 0 else sub_children + get_children_call_count += 1 for child in children: yield child - downloaded_file = File( + downloaded_file_1 = File( id=SYN_456, name="example_file_1", - path="/tmp/mydir/example_file_1.txt", parent_id=SYN_123, ) + downloaded_file_2 = File( + id=FILE_2_ID, + name=FILE_2_NAME, + parent_id=SUB_FOLDER_ID, + ) + file_map = {SYN_456: downloaded_file_1, FILE_2_ID: downloaded_file_2} + + async def mock_file_get(self_file, **kwargs): + return file_map[self_file.id] + + async def mock_get_entity_bundle(entity_id, *args, **kwargs): + if entity_id == SUB_FOLDER_ID: + return { + "entity": { + "concreteType": concrete_types.FOLDER_ENTITY, + "id": SUB_FOLDER_ID, + "name": SUB_FOLDER_NAME, + "parentId": SYN_123, + "etag": ETAG, + "createdOn": CREATED_ON, + "modifiedOn": MODIFIED_ON, + "createdBy": CREATED_BY, + "modifiedBy": MODIFIED_BY, + } + } + return self.get_example_rest_api_folder_output() # WHEN I call sync_from_synapse with manifest="all" and a path with patch( @@ -820,11 +861,13 @@ async def mock_get_children(*args, **kwargs): side_effect=mock_get_children, ), patch( "synapseclient.api.entity_factory.get_entity_id_bundle2", - new_callable=AsyncMock, - return_value=self.get_example_rest_api_folder_output(), + side_effect=mock_get_entity_bundle, ), patch( "synapseclient.models.file.File.get_async", - return_value=downloaded_file, + side_effect=mock_file_get, + ), patch( + "synapseclient.models.mixins.storable_container.os.path.exists", + return_value=True, ), patch( "synapseclient.models.mixins.storable_container.generate_manifest_csv", ) as mock_generate: @@ -832,10 +875,16 @@ async def mock_get_children(*args, **kwargs): path="/tmp/mydir", manifest="all", synapse_client=self.syn ) - # THEN generate_manifest_csv should be called for each directory - assert mock_generate.call_count >= 1 - call_paths = [c.kwargs["path"] for c in mock_generate.call_args_list] - assert "/tmp/mydir" in call_paths + # THEN generate_manifest_csv is called once per directory (root + subfolder) + assert mock_generate.call_count == 2 + calls_by_path = { + c.kwargs["path"]: c.kwargs["all_files"] + for c in mock_generate.call_args_list + } + assert any(f.id == SYN_456 for f in calls_by_path["/tmp/mydir"]) + assert any( + f.id == FILE_2_ID for f in calls_by_path[f"/tmp/mydir/{SUB_FOLDER_NAME}"] + ) async def test_sync_from_synapse_manifest_root_generates_only_at_root( self, From bf0ea5f0c906a05c255afaf1210c5e80a1269d3d Mon Sep 17 00:00:00 2001 From: danlu1 Date: Tue, 21 Apr 2026 12:14:44 -0700 Subject: [PATCH 06/26] add comments --- .../models/mixins/storable_container.py | 45 ++++++++++++++++--- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index 6f0458637..0d2867d67 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -8,6 +8,7 @@ Dict, Generator, List, + Literal, NoReturn, Optional, Tuple, @@ -161,7 +162,7 @@ async def sync_from_synapse_async( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, - manifest: str = "all", + manifest: Literal["all", "suppress", "root"] = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -321,8 +322,38 @@ async def my_function(): asyncio.run(my_function()) ``` + Suppose I want to download all the children of a Project and all sub-folders and files and generate a manifest file: + ```python + import asyncio + from synapseclient import Synapse + from synapseclient.models import Project + + async def my_function(): + syn = Synapse() + syn.login() + + my_project = Project(id="syn12345") + await my_project.sync_from_synapse_async(path="/path/to/folder", manifest="all") + + asyncio.run(my_function()) + ``` + Suppose I want to download a manifest file at the root path: + ```python + import asyncio + from synapseclient import Synapse + from synapseclient.models import Project + + async def my_function(): + syn = Synapse() + syn.login() + + my_project = Project(id="syn12345") + await my_project.sync_from_synapse_async(path="/path/to/folder", manifest="root", download_file=False) + + asyncio.run(my_function()) + ``` Raises: ValueError: If the folder does not have an id set. @@ -434,7 +465,7 @@ async def _sync_from_synapse_async( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, - manifest: str = "all", + manifest: Literal["all", "suppress", "root"] = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -451,8 +482,9 @@ async def _sync_from_synapse_async( ) if manifest not in ("all", "root", "suppress"): - syn.logger.error( - f"[{self.id}:{self.name}]: Invalid manifest value: {manifest}" + raise ValueError( + f"[{self.id}:{self.name}]: Invalid manifest value: {manifest}. " + "Must be one of: 'all', 'root', 'suppress'." ) path = os.path.expanduser(path) if path else None @@ -517,9 +549,12 @@ async def _sync_from_synapse_async( if create_workers: try: - # Wait until the queue is fully processed. + # Blocks until every queued item has been picked up and + # task_done() called by a worker. await queue.join() finally: + # Workers are now blocked waiting on an empty queue; cancel + # them so they don't hang the event loop. for task in worker_tasks: task.cancel() From c0b0acb8fbb1f2acab1441da9af2f5781b30cc07 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Wed, 22 Apr 2026 14:29:15 -0700 Subject: [PATCH 07/26] decouple create_task with manifest generation and suppress manifest generation for recursive calls --- synapseclient/models/mixins/storable_container.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index 0d2867d67..0694179bf 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -558,7 +558,7 @@ async def _sync_from_synapse_async( for task in worker_tasks: task.cancel() - if create_workers and path and manifest != "suppress": + if path and manifest != "suppress": if manifest == "all": for ( directory_path, @@ -1186,6 +1186,7 @@ async def _wrap_recursive_get_children( synapse_client=synapse_client, queue=queue, include_types=include_types, + manifest="suppress", # suppress manifest generation for recursive calls since the root covers the path already in map_directory_to_all_contained_files ) def _create_task_for_child( From 37784444358190f8d4a2ec2bc9a0511c8fd1be0b Mon Sep 17 00:00:00 2001 From: danlu1 Date: Wed, 22 Apr 2026 14:30:27 -0700 Subject: [PATCH 08/26] update test cases and relocation manifest generation tests to a separate file --- .../models/async/unit_test_folder_async.py | 163 +----------------- 1 file changed, 3 insertions(+), 160 deletions(-) diff --git a/tests/unit/synapseclient/models/async/unit_test_folder_async.py b/tests/unit/synapseclient/models/async/unit_test_folder_async.py index 991041b4f..74e4069dc 100644 --- a/tests/unit/synapseclient/models/async/unit_test_folder_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_folder_async.py @@ -1,12 +1,7 @@ """Tests for the Folder class.""" -import csv -import datetime -import io -import os -import tempfile import uuid from typing import Dict -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, patch import pytest @@ -15,12 +10,7 @@ from synapseclient.core.constants import concrete_types from synapseclient.core.constants.concrete_types import FILE_ENTITY, FOLDER_ENTITY from synapseclient.core.exceptions import SynapseNotFoundError -from synapseclient.models import Activity, FailureStrategy, File, Folder -from synapseclient.models.services.manifest import ( - _extract_entity_metadata_for_manifest_csv, - _write_manifest_data_csv, - generate_manifest_csv, -) +from synapseclient.models import FailureStrategy, File, Folder SYN_123 = "syn123" SYN_456 = "syn456" @@ -925,6 +915,7 @@ async def mock_get_children(*args, **kwargs): # THEN generate_manifest_csv should be called exactly once with the root path mock_generate.assert_called_once() assert mock_generate.call_args.kwargs["path"] == "/tmp/mydir" + assert mock_generate.call_args.kwargs["all_files"][0].id == SYN_456 async def test_sync_from_synapse_manifest_suppress_skips_generation( self, @@ -985,151 +976,3 @@ async def mock_get_children(*args, **kwargs): # THEN generate_manifest_csv should not be called (no path to write to) mock_generate.assert_not_called() - - -class TestGenerateManifestCsv: - """Tests for the generate_manifest_csv and related helper functions.""" - - def _make_file( - self, - syn_id: str = "syn123", - name: str = "file.txt", - path: str = "/data/file.txt", - parent_id: str = "syn456", - content_type: str = "text/plain", - synapse_store: bool = True, - annotations: dict = None, - activity: Activity = None, - ) -> File: - f = File( - id=syn_id, - name=name, - path=path, - parent_id=parent_id, - content_type=content_type, - synapse_store=synapse_store, - ) - if annotations: - f.annotations = annotations - if activity: - f.activity = activity - return f - - def test_extract_entity_metadata_uses_parentId_and_ID_columns(self) -> None: - # GIVEN a File entity - f = self._make_file() - - # WHEN metadata is extracted - keys, data = _extract_entity_metadata_for_manifest_csv([f]) - - assert data[0]["parentId"] == "syn456" - assert data[0]["ID"] == "syn123" - assert data[0]["path"] == "/data/file.txt" - assert data[0]["name"] == "file.txt" - - def test_extract_entity_metadata_includes_annotations(self) -> None: - # GIVEN a File entity with annotations - f = self._make_file(annotations={"tissue": ["brain"], "count": [42]}) - - # WHEN metadata is extracted - keys, data = _extract_entity_metadata_for_manifest_csv([f]) - - # THEN annotation keys are included and single-item lists are serialized as scalars - assert "tissue" in keys - assert "count" in keys - assert data[0]["tissue"] == "brain" - assert data[0]["count"] == "42" - - def test_write_manifest_data_csv_produces_comma_separated_output(self) -> None: - # GIVEN a File with a name containing a comma and mixed-type annotations - f = self._make_file( - name="a, b, c", - path="/data/file.txt", - annotations={ - "single_str": "hello", - "multi_str": ["a", "b", "c"], - "str_with_comma": ["hello,world", "plain"], - "booleans": [True, False], - "integers": [1], - "single_dt": [ - datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) - ], - "multi_dt": [ - datetime.datetime( - 2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc - ), - datetime.datetime( - 2021, 6, 15, 12, 30, 0, tzinfo=datetime.timezone.utc - ), - ], - }, - ) - - with tempfile.TemporaryDirectory() as tmpdir: - # WHEN generate_manifest_csv is called - generate_manifest_csv(all_files=[f], path=tmpdir) - manifest_path = os.path.join(tmpdir, "manifest.csv") - content = open(manifest_path, encoding="utf8").read() - with open(manifest_path, newline="", encoding="utf8") as fp: - row = next(csv.DictReader(fp)) - - # THEN the name with a comma is quoted in the CSV output - assert '"a, b, c"' in content - # AND single-item list is serialized as a plain scalar - assert row["single_str"] == "hello" - # AND multi-value list is serialized as a bracketed string - assert row["multi_str"] == "[a,b,c]" - # AND a value with a comma is quoted inside the brackets - assert row["str_with_comma"] == '["hello,world",plain]' - # AND booleans are serialized unquoted - assert row["booleans"] == "[True,False]" - # AND integers are serialized unquoted - assert row["integers"] == "1" - # AND a single datetime is serialized as ISO 8601 UTC - assert row["single_dt"] == "2020-01-01T00:00:00Z" - # AND multiple datetimes are serialized as a bracketed ISO 8601 list - assert row["multi_dt"] == "[2020-01-01T00:00:00Z,2021-06-15T12:30:00Z]" - - def test_generate_manifest_csv_creates_file(self) -> None: - # GIVEN a list of File entities and a temp directory - f = self._make_file( - syn_id="syn123", - name="data.csv", - path="/tmp/data.csv", - parent_id="syn456", - ) - - with tempfile.TemporaryDirectory() as tmpdir: - # WHEN generate_manifest_csv is called - generate_manifest_csv(all_files=[f], path=tmpdir) - - # THEN manifest.csv is created in the directory - manifest_path = os.path.join(tmpdir, "manifest.csv") - assert os.path.exists(manifest_path) - - # AND it has the expected columns with new naming convention - with open(manifest_path, newline="", encoding="utf8") as fp: - reader = csv.DictReader(fp) - row = next(reader) - assert row["parentId"] == "syn456" - assert row["ID"] == "syn123" - - def test_generate_manifest_csv_skips_when_no_files(self) -> None: - # GIVEN an empty file list - with tempfile.TemporaryDirectory() as tmpdir: - # WHEN generate_manifest_csv is called with no files - generate_manifest_csv(all_files=[], path=tmpdir) - - # THEN no manifest.csv is created - assert not os.path.exists(os.path.join(tmpdir, "manifest.csv")) - - def test_generate_manifest_csv_quotes_values_with_commas(self) -> None: - # GIVEN a File whose name contains a comma - f = self._make_file(name="file, extra.txt", path="/tmp/file, extra.txt") - - with tempfile.TemporaryDirectory() as tmpdir: - generate_manifest_csv(all_files=[f], path=tmpdir) - manifest_path = os.path.join(tmpdir, "manifest.csv") - content = open(manifest_path, encoding="utf8").read() - # THEN the comma-containing value is quoted in the CSV - assert '"file, extra.txt"' in content From 3d2889a6bebc3af5b50f41e3aed3a3fac017f1c0 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 23 Apr 2026 14:27:11 -0700 Subject: [PATCH 09/26] update integration tests --- .../models/async/test_folder_async.py | 48 ++++++++++++++----- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/tests/integration/synapseclient/models/async/test_folder_async.py b/tests/integration/synapseclient/models/async/test_folder_async.py index 5399e5441..cc04238fb 100644 --- a/tests/integration/synapseclient/models/async/test_folder_async.py +++ b/tests/integration/synapseclient/models/async/test_folder_async.py @@ -1,6 +1,7 @@ """Integration tests for the synapseclient.models.Folder class.""" import csv +import datetime import os import tempfile import uuid @@ -868,22 +869,28 @@ async def test_manifest_all_creates_csv_per_directory( root_manifest = os.path.join(tmpdir, "manifest.csv") sub_manifest = os.path.join(tmpdir, sub_folder.name, "manifest.csv") - # THEN manifest.csv exists in the root directory assert os.path.isfile(root_manifest) - - # AND manifest.csv exists in the subfolder directory assert os.path.isfile(sub_manifest) - # AND the root manifest contains the expected columns and the root file row with open(root_manifest, newline="", encoding="utf8") as f: reader = csv.DictReader(f) rows = list(reader) - - assert len(rows) == 1 - row = rows[0] - assert row["name"] == root_file.name - assert row["parentId"] == root_folder.id - assert row["ID"] == root_file.id + assert len(rows) == 2 + rows_by_id = {row["ID"]: row for row in rows} + root_row = rows_by_id[root_file.id] + assert root_row["name"] == root_file.name + assert root_row["parentId"] == root_folder.id + sub_row = rows_by_id[sub_file.id] + assert sub_row["name"] == sub_file.name + assert sub_row["parentId"] == sub_folder.id + + with open(sub_manifest, newline="", encoding="utf8") as f: + reader = csv.DictReader(f) + rows = list(reader) + assert len(rows) == 1 + sub_row = rows[0] + assert sub_row["name"] == sub_file.name + assert sub_row["parentId"] == sub_folder.id async def test_manifest_root_creates_csv_only_at_root( self, project_model: Project @@ -921,6 +928,17 @@ async def test_manifest_root_creates_csv_only_at_root( # THEN manifest.csv exists only at the root assert os.path.isfile(root_manifest) assert not os.path.isfile(sub_manifest) + with open(root_manifest, newline="", encoding="utf8") as f: + reader = csv.DictReader(f) + rows = list(reader) + assert len(rows) == 2 + rows_by_id = {row["ID"]: row for row in rows} + root_row = rows[0] + assert root_row["name"] == root_file.name + assert root_row["parentId"] == root_folder.id + sub_row = rows_by_id[sub_file.id] + assert sub_row["name"] == sub_file.name + assert sub_row["parentId"] == sub_folder.id async def test_manifest_suppress_creates_no_csv( self, project_model: Project @@ -957,9 +975,13 @@ async def test_manifest_includes_annotations(self, project_model: Project) -> No f.annotations = { "single_str": ["hello"], "multi_str": ["a", "b", "c"], - "str_with_comma": ["hello,world"], + "str_with_comma": ["hello,world", "plain text"], "booleans": [True, False], "integers": [1, 2, 3], + "floats": [1.0], + "datetimes": [ + datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc) + ], } f = await f.store_async(synapse_client=self.syn) self.schedule_for_cleanup(f.id) @@ -984,9 +1006,11 @@ async def test_manifest_includes_annotations(self, project_model: Project) -> No row = rows[0] assert row["single_str"] == "hello" assert row["multi_str"] == "[a,b,c]" - assert row["str_with_comma"] == "hello,world" + assert row["str_with_comma"] == '["hello,world",plain text]' assert row["booleans"] == "[True,False]" assert row["integers"] == "[1,2,3]" + assert row["floats"] == "1.0" + assert row["datetimes"] == "2020-01-01T00:00:00Z" async def test_manifest_includes_provenance(self, project_model: Project) -> None: # GIVEN a file with activity (provenance) From 22b4cd6211c6af968f6f931c78b1ccf041204ccf Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 23 Apr 2026 16:53:20 -0700 Subject: [PATCH 10/26] copy sync functions over --- synapseclient/models/services/manifest.py | 155 ++++++++++++++++++---- 1 file changed, 128 insertions(+), 27 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index b17ea2e49..d01fb0860 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -1,9 +1,12 @@ """Functions for generating manifest CSV files from File entities.""" import csv +import datetime import io import os -from typing import TYPE_CHECKING, Dict, List, Tuple +from typing import TYPE_CHECKING, Any, Union + +from synapseclient.core import utils if TYPE_CHECKING: from synapseclient.models import File @@ -27,14 +30,15 @@ def _manifest_csv_filename(path: str) -> str: return os.path.join(os.path.expanduser(path), MANIFEST_CSV_FILENAME) -def _get_entity_provenance_dict_for_manifest(entity: "File") -> Dict[str, str]: +def _get_entity_provenance_dict_for_manifest(entity: File) -> dict[str, str]: """ - Get the provenance metadata for the entity. + Gets the provenance metadata for the entity. + Arguments: - entity: Entity object + entity: A File entity object Returns: - Dict[str, str]: a dictionary with a subset of the provenance metadata for the entity. + dict[str, str]: a dictionary with a subset of the provenance metadata for the entity. An empty dictionary is returned if the metadata does not have a provenance record. """ if not entity.activity: @@ -49,27 +53,102 @@ def _get_entity_provenance_dict_for_manifest(entity: "File") -> Dict[str, str]: } -def _extract_entity_metadata_for_manifest_csv( - all_files: List["File"], -) -> Tuple[List[str], List[Dict]]: +def _convert_manifest_data_items_to_string_list( + items: list[Union[str, datetime.datetime, bool, int, float]], +) -> Union[str, list[str]]: + """ + Handle coverting an individual key that contains a possible list of data into a + list of strings or objects that can be written to the manifest file. + + This has specific logic around how to handle datetime fields. + + When working with datetime fields we are printing the ISO 8601 UTC representation of + the datetime. + + When working with non strings we are printing the non-quoted version of the object. + + Example: Examples + Several examples of how this function works. + + >>> _convert_manifest_data_items_to_string_list(["a", "b", "c"]) + '[a,b,c]' + >>> _convert_manifest_data_items_to_string_list(["string,with,commas", "string without commas"]) + '["string,with,commas",string without commas]' + >>> _convert_manifest_data_items_to_string_list(["string,with,commas"]) + 'string,with,commas' + >>> _convert_manifest_data_items_to_string_list + ([datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)]) + '2020-01-01T00:00:00Z' + >>> _convert_manifest_data_items_to_string_list([True]) + 'True' + >>> _convert_manifest_data_items_to_string_list([1]) + '1' + >>> _convert_manifest_data_items_to_string_list([1.0]) + '1.0' + >>> _convert_manifest_data_items_to_string_list + ([datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc), + datetime.datetime(2021, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)]) + '[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]' + + + Args: + items: The list of items to convert. + + Returns: + The list of items converted to strings. """ - Extracts metadata from the list of File Entities and returns them in a form - usable by csv.DictWriter + items_to_write = [] + for item in items: + if isinstance(item, datetime.datetime): + items_to_write.append( + utils.datetime_to_iso(dt=item, include_milliseconds_if_zero=False) + ) + else: + # If a string based annotation has a comma in it + # this will wrap the string in quotes so it won't be parsed + # as multiple values. For example this is an annotation with 2 values: + # [my first annotation, "my, second, annotation"] + # This is an annotation with 4 value: + # [my first annotation, my, second, annotation] + if isinstance(item, str): + if len(items) > 1 and "," in item: + items_to_write.append(f'"{item}"') + else: + items_to_write.append(item) + else: + items_to_write.append(repr(item)) + + if len(items_to_write) > 1: + return f'[{",".join(items_to_write)}]' + elif len(items_to_write) == 1: + return items_to_write[0] + else: + return "" + + +def _extract_entity_metadata_for_manifest_csv( + all_files: list[File], +) -> tuple[list[str], list[dict[str, Any]]]: + """Extracts metadata from a list of File entities into a form usable by csv.DictWriter. + + Builds the column header list starting from DEFAULT_GENERATED_MANIFEST_CSV_KEYS, then + appends any annotation keys discovered across all files. Each row dict contains the + standard fields plus annotation values (serialized via + _convert_manifest_data_items_to_string_list) and provenance fields from + _get_entity_provenance_dict_for_manifest. Arguments: - all_files: an iterable that provides File entities + all_files: A list of File model objects to extract metadata from. Returns: - keys: a list column headers - data: a list of dicts containing data from each row + A tuple of (keys, data) where keys is the ordered list of column headers and + data is a list of row dicts, one per file. """ - from synapseutils.sync import _convert_manifest_data_items_to_string_list - keys = list(DEFAULT_GENERATED_MANIFEST_CSV_KEYS) annotation_keys: set = set() data = [] for entity in all_files: - row: Dict = { + row: dict = { "path": entity.path, "parentId": entity.parent_id, "name": entity.name, @@ -91,20 +170,42 @@ def _extract_entity_metadata_for_manifest_csv( return keys, data -def _write_manifest_data_csv(filename: str, keys: List[str], data: List[Dict]) -> None: +def _convert_manifest_data_row_to_dict(row: dict, keys: list[str]) -> dict: """ - Write the manifest data to a CSV file. - Arguments: - filename: The name of the file to write to. - keys: The keys of the manifest. - data: The data to write to the manifest. This should be a list of dicts where - each dict represents a row of data. + Convert a row of data to a dict that can be written to a manifest file. + + Args: + row: The row of data to convert. + keys: The keys of the manifest. Used to select the rows of data. + Returns: - None + The dict representation of the row. """ - from synapseutils.sync import _convert_manifest_data_row_to_dict + data_to_write = {} + for key in keys: + data_for_key = row.get(key, "") + if isinstance(data_for_key, list): + items_to_write = _convert_manifest_data_items_to_string_list(data_for_key) + data_to_write[key] = items_to_write + else: + data_to_write[key] = data_for_key + return data_to_write + - with io.open(filename, "w", encoding="utf8", newline="") as fp: +def _write_manifest_data_csv(path: str, keys: list[str], data: list[dict]) -> None: + """Writes manifest data to a CSV file using csv.DictWriter with QUOTE_MINIMAL that automatically quotes any cell containing a comma, newline, or the quote character. + + Each row dict is normalized via _convert_manifest_data_row_to_dict so that + list-valued annotation fields are serialized to strings before writing. Missing + fields default to an empty string; extra keys not in fieldnames are silently ignored. + + Arguments: + path: Absolute path of the CSV file to create or overwrite. + keys: Ordered list of column headers used as DictWriter fieldnames. + data: List of row dicts, one per file. Keys absent from a row are written as + empty strings; keys not in fieldnames are ignored. + """ + with io.open(path, "w", encoding="utf8", newline="") as fp: writer = csv.DictWriter( fp, fieldnames=keys, @@ -117,7 +218,7 @@ def _write_manifest_data_csv(filename: str, keys: List[str], data: List[Dict]) - writer.writerow(_convert_manifest_data_row_to_dict(row, keys)) -def generate_manifest_csv(all_files: List["File"], path: str) -> None: +def generate_manifest_csv(all_files: list["File"], path: str) -> None: """Generates a manifest.csv file based on a list of File entities. The generated file uses CSV format with comma delimiter and is interoperable From 573d386ef036b3ce23d39978a36d6f19b10602cb Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 23 Apr 2026 16:53:40 -0700 Subject: [PATCH 11/26] update version number --- synapseutils/sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapseutils/sync.py b/synapseutils/sync.py index a7c7764b1..41c6eeac7 100644 --- a/synapseutils/sync.py +++ b/synapseutils/sync.py @@ -79,7 +79,7 @@ @deprecated( - version="4.11.0", + version="4.12.0", reason=( "To be removed in 5.0.0. Use StorableContainer.sync_from_synapse instead, " "which generates a manifest.csv file interoperable with the Synapse UI download cart." From 1f036c25c8dc8bdd6c66238b48b5907f9194da29 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 23 Apr 2026 16:54:18 -0700 Subject: [PATCH 12/26] add unit tests for manifest related functions --- .../services/unit_test_manifest.py | 300 ++++++++++++++++++ 1 file changed, 300 insertions(+) create mode 100644 tests/unit/synapseclient/services/unit_test_manifest.py diff --git a/tests/unit/synapseclient/services/unit_test_manifest.py b/tests/unit/synapseclient/services/unit_test_manifest.py new file mode 100644 index 000000000..db8322b0d --- /dev/null +++ b/tests/unit/synapseclient/services/unit_test_manifest.py @@ -0,0 +1,300 @@ +import csv +import datetime +import os +import tempfile + +from synapseclient.models import Activity, FailureStrategy, File, Folder +from synapseclient.models.activity import UsedEntity, UsedURL +from synapseclient.models.services.manifest import ( + MANIFEST_CSV_FILENAME, + _extract_entity_metadata_for_manifest_csv, + _manifest_csv_filename, + _write_manifest_data_csv, + generate_manifest_csv, +) + + +class TestManifestCsvFilename: + """Tests for the _manifest_csv_filename helper.""" + + def test_plain_directory(self) -> None: + # GIVEN a plain absolute path + # WHEN _manifest_csv_filename is called + result = _manifest_csv_filename("/tmp/mydir") + + # THEN it joins the path with the manifest filename + assert result == os.path.join("/tmp/mydir", MANIFEST_CSV_FILENAME) + + def test_tilde_is_expanded(self) -> None: + # GIVEN a path starting with ~ + # WHEN _manifest_csv_filename is called + result = _manifest_csv_filename("~/mydir") + + # THEN ~ is expanded to the user's home directory + assert result == os.path.join( + os.path.expanduser("~/mydir"), MANIFEST_CSV_FILENAME + ) + assert "~" not in result + + def test_filename_is_manifest_csv(self) -> None: + # GIVEN any directory path + # WHEN _manifest_csv_filename is called + result = _manifest_csv_filename("/some/path") + + # THEN the basename of the result is MANIFEST_CSV_FILENAME + assert os.path.basename(result) == MANIFEST_CSV_FILENAME + + +class TestGenerateManifestCsv: + """Tests for the generate_manifest_csv and related helper functions.""" + + def _make_file( + self, + syn_id: str = "syn123", + name: str = "file.txt", + path: str = "/data/file.txt", + parent_id: str = "syn456", + content_type: str = "text/plain", + synapse_store: bool = True, + annotations: dict = None, + activity: Activity = None, + ) -> File: + f = File( + id=syn_id, + name=name, + path=path, + parent_id=parent_id, + content_type=content_type, + synapse_store=synapse_store, + ) + if annotations: + f.annotations = annotations + if activity: + f.activity = activity + return f + + def test_extract_entity_metadata_includes_annotations_and_activity(self) -> None: + # GIVEN a File entity with provenance + activity = Activity( + name="My Pipeline", + description="Run analysis", + used=[UsedEntity(target_id="syn111", target_version_number=1)], + executed=[UsedURL(url="https://github.com/example/pipeline")], + ) + f = self._make_file( + activity=activity, annotations={"tissue": ["brain"], "count": [42]} + ) + + # WHEN metadata is extracted + keys, data = _extract_entity_metadata_for_manifest_csv([f]) + + # THEN provenance keys are present in the column list + assert { + "used", + "executed", + "activityName", + "activityDescription", + "tissue", + "count", + }.issubset(keys) + + assert data[0]["parentId"] == "syn456" + assert data[0]["ID"] == "syn123" + assert data[0]["path"] == "/data/file.txt" + assert data[0]["name"] == "file.txt" + assert data[0]["activityName"] == "My Pipeline" + assert data[0]["activityDescription"] == "Run analysis" + assert data[0]["used"] == "syn111.1" + assert data[0]["executed"] == "https://github.com/example/pipeline" + assert data[0]["tissue"] == "brain" + assert data[0]["count"] == "42" + + def test_generate_manifest_csv_data_items_are_converted_to_strings(self) -> None: + # GIVEN a File with a name containing a comma and mixed-type annotations + f = self._make_file( + name="a, b, c", + path="/data/file.txt", + annotations={ + "single_str": "hello", + "multi_str": ["a", "b", "c"], + "str_with_comma": ["hello,world", "plain text"], + "booleans": [True, False], + "integers": [1], + "floats": [1.0], + "single_dt": [ + datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) + ], + "multi_dt": [ + datetime.datetime( + 2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + datetime.datetime( + 2021, 6, 15, 12, 30, 0, tzinfo=datetime.timezone.utc + ), + ], + }, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + # WHEN generate_manifest_csv is called + generate_manifest_csv(all_files=[f], path=tmpdir) + manifest_path = os.path.join(tmpdir, "manifest.csv") + content = open(manifest_path, encoding="utf8").read() + with open(manifest_path, newline="", encoding="utf8") as fp: + row = next(csv.DictReader(fp)) + + assert '"a, b, c"' in content + assert row["single_str"] == "hello" + assert row["multi_str"] == "[a,b,c]" + assert row["str_with_comma"] == '["hello,world",plain text]' + assert row["booleans"] == "[True,False]" + assert row["integers"] == "1" + assert row["floats"] == "1.0" + assert row["single_dt"] == "2020-01-01T00:00:00Z" + assert row["multi_dt"] == "[2020-01-01T00:00:00Z,2021-06-15T12:30:00Z]" + + def test_generate_manifest_csv_skips_when_no_files(self) -> None: + # GIVEN an empty file list + with tempfile.TemporaryDirectory() as tmpdir: + # WHEN generate_manifest_csv is called with no files + generate_manifest_csv(all_files=[], path=tmpdir) + + # THEN no manifest.csv is created + assert not os.path.exists(os.path.join(tmpdir, "manifest.csv")) + + def test_generate_manifest_csv_quotes_values_with_commas(self) -> None: + # GIVEN a File whose name contains a comma + f = self._make_file(name="file, extra.txt", path="/tmp/file, extra.txt") + + with tempfile.TemporaryDirectory() as tmpdir: + generate_manifest_csv(all_files=[f], path=tmpdir) + manifest_path = os.path.join(tmpdir, "manifest.csv") + content = open(manifest_path, encoding="utf8").read() + # THEN the comma-containing value is quoted in the CSV + assert '"file, extra.txt"' in content + + +class TestWriteManifestDataCsv: + """Tests for the _write_manifest_data_csv helper.""" + + def test_writes_header_and_rows(self) -> None: + # GIVEN keys and one row of data + keys = ["path", "parentId", "name"] + data = [{"path": "/data/f.txt", "parentId": "syn1", "name": "f.txt"}] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + # WHEN _write_manifest_data_csv is called + _write_manifest_data_csv(filename, keys, data) + + with open(filename, newline="", encoding="utf8") as fp: + rows = list(csv.DictReader(fp)) + + # THEN header and row values are written correctly + assert len(rows) == 1 + assert rows[0]["path"] == "/data/f.txt" + assert rows[0]["parentId"] == "syn1" + assert rows[0]["name"] == "f.txt" + + def test_missing_keys_use_empty_string(self) -> None: + # GIVEN a row missing the "name" key + keys = ["path", "parentId", "name"] + data = [{"path": "/data/f.txt", "parentId": "syn1"}] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + _write_manifest_data_csv(filename, keys, data) + + with open(filename, newline="", encoding="utf8") as fp: + rows = list(csv.DictReader(fp)) + + # THEN the missing field is written as an empty string + assert rows[0]["name"] == "" + + def test_extra_keys_in_row_are_ignored(self) -> None: + # GIVEN a row with a key not in the fieldnames list + keys = ["path", "name"] + data = [{"path": "/data/f.txt", "name": "f.txt", "extra": "ignored"}] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + # WHEN _write_manifest_data_csv is called + # THEN no exception is raised and only declared keys appear + _write_manifest_data_csv(filename, keys, data) + + with open(filename, newline="", encoding="utf8") as fp: + reader = csv.DictReader(fp) + rows = list(reader) + assert "extra" not in reader.fieldnames + + assert rows[0]["path"] == "/data/f.txt" + + def test_values_with_commas_are_quoted(self) -> None: + # GIVEN a value that contains a comma + keys = ["name", "parentId"] + data = [{"name": "file, with comma.txt", "parentId": "syn1"}] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + _write_manifest_data_csv(filename, keys, data) + content = open(filename, encoding="utf8").read() + + with open(filename, newline="", encoding="utf8") as fp: + rows = list(csv.DictReader(fp)) + + # THEN the comma-containing value is quoted in the raw CSV + assert '"file, with comma.txt"' in content + # AND reads back correctly + assert rows[0]["name"] == "file, with comma.txt" + + def test_empty_data_writes_header_only(self) -> None: + # GIVEN no data rows + keys = ["path", "parentId", "name"] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + _write_manifest_data_csv(filename, keys, []) + + with open(filename, newline="", encoding="utf8") as fp: + reader = csv.DictReader(fp) + rows = list(reader) + header = reader.fieldnames + + # THEN the file exists with only the header + assert rows == [] + assert header == keys + + def test_unicode_values_are_written_correctly(self) -> None: + # GIVEN a value with non-ASCII characters + keys = ["name", "parentId"] + data = [{"name": "données_été.txt", "parentId": "syn1"}] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + _write_manifest_data_csv(filename, keys, data) + + with open(filename, newline="", encoding="utf8") as fp: + rows = list(csv.DictReader(fp)) + + # THEN unicode characters round-trip correctly + assert rows[0]["name"] == "données_été.txt" + + def test_multiple_rows_written_in_order(self) -> None: + # GIVEN multiple rows + keys = ["name", "parentId"] + data = [ + {"name": "a.txt", "parentId": "syn1"}, + {"name": "b.txt", "parentId": "syn2"}, + {"name": "c.txt", "parentId": "syn3"}, + ] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + _write_manifest_data_csv(filename, keys, data) + + with open(filename, newline="", encoding="utf8") as fp: + rows = list(csv.DictReader(fp)) + + # THEN all rows are present and in order + assert len(rows) == 3 + assert [r["name"] for r in rows] == ["a.txt", "b.txt", "c.txt"] From 04c5347a2938b4362bd4156f129a0f02eada08f3 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Fri, 24 Apr 2026 12:09:27 -0700 Subject: [PATCH 13/26] add more test cases --- .../services/unit_test_manifest.py | 150 +++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/tests/unit/synapseclient/services/unit_test_manifest.py b/tests/unit/synapseclient/services/unit_test_manifest.py index db8322b0d..fe5295d7c 100644 --- a/tests/unit/synapseclient/services/unit_test_manifest.py +++ b/tests/unit/synapseclient/services/unit_test_manifest.py @@ -3,11 +3,16 @@ import os import tempfile -from synapseclient.models import Activity, FailureStrategy, File, Folder +import pytest + +from synapseclient.models import Activity, File from synapseclient.models.activity import UsedEntity, UsedURL from synapseclient.models.services.manifest import ( MANIFEST_CSV_FILENAME, + _convert_manifest_data_items_to_string_list, + _convert_manifest_data_row_to_dict, _extract_entity_metadata_for_manifest_csv, + _get_entity_provenance_dict_for_manifest, _manifest_csv_filename, _write_manifest_data_csv, generate_manifest_csv, @@ -298,3 +303,146 @@ def test_multiple_rows_written_in_order(self) -> None: # THEN all rows are present and in order assert len(rows) == 3 assert [r["name"] for r in rows] == ["a.txt", "b.txt", "c.txt"] + + +class TestGetEntityProvenanceDictForManifest: + """Tests for _get_entity_provenance_dict_for_manifest.""" + + def _make_file_with_activity(self, activity: Activity = None) -> File: + f = File(id="syn1", name="f.txt", path="/f.txt", parent_id="syn2") + if activity: + f.activity = activity + return f + + def test_returns_empty_dict_when_no_activity(self) -> None: + f = self._make_file_with_activity() + result = _get_entity_provenance_dict_for_manifest(f) + assert result == {} + + def test_returns_all_provenance_keys_with_activity(self) -> None: + activity = Activity( + name="Pipeline", + description="Runs analysis", + used=[UsedEntity(target_id="syn10", target_version_number=2)], + executed=[UsedURL(url="https://github.com/example/run")], + ) + f = self._make_file_with_activity(activity) + + result = _get_entity_provenance_dict_for_manifest(f) + assert result["used"] == "syn10.2" + assert result["executed"] == "https://github.com/example/run" + assert result["activityName"] == "Pipeline" + assert result["activityDescription"] == "Runs analysis" + + def test_activity_name_and_description_default_to_empty_string(self) -> None: + activity = Activity(name=None, description=None) + f = self._make_file_with_activity(activity) + + result = _get_entity_provenance_dict_for_manifest(f) + assert result["activityName"] == "" + assert result["activityDescription"] == "" + + def test_empty_used_and_executed_lists(self) -> None: + activity = Activity(name="minimal", used=[], executed=[]) + f = self._make_file_with_activity(activity) + + result = _get_entity_provenance_dict_for_manifest(f) + + assert result["activityName"] == "minimal" + assert result["used"] == "" + assert result["executed"] == "" + + def test_multiple_used_and_executed_are_semicolon_joined(self) -> None: + # GIVEN an activity with multiple used and executed entries + activity = Activity( + name="multi", + used=[ + UsedEntity(target_id="syn1", target_version_number=1), + UsedEntity(target_id="syn2", target_version_number=3), + ], + executed=[ + UsedURL(url="https://github.com/a"), + UsedURL(url="https://github.com/b"), + ], + ) + f = self._make_file_with_activity(activity) + + result = _get_entity_provenance_dict_for_manifest(f) + + assert result["activityName"] == "multi" + assert result["used"] == "syn1.1;syn2.3" + assert result["executed"] == "https://github.com/a;https://github.com/b" + + +_UTC = datetime.timezone.utc + + +class TestConvertManifestDataItemsToStringList: + """Tests for _convert_manifest_data_items_to_string_list.""" + + @pytest.mark.parametrize( + "items,expected", + [ + ([], ""), + (["hello"], "hello"), + # single item with comma is NOT quoted — quoting only applies in multi-item lists + (["hello,world"], "hello,world"), + (["a", "b", "c"], "[a,b,c]"), + (["hello,world", "plain"], '["hello,world",plain]'), + ([True], "True"), + ([True, False], "[True,False]"), + ([42], "42"), + ([1, 2, 3], "[1,2,3]"), + ([1.5], "1.5"), + ( + [datetime.datetime(2020, 1, 1, tzinfo=_UTC)], + "2020-01-01T00:00:00Z", + ), + ( + [ + datetime.datetime(2020, 1, 1, tzinfo=_UTC), + datetime.datetime(2021, 6, 15, 12, 30, tzinfo=_UTC), + ], + "[2020-01-01T00:00:00Z,2021-06-15T12:30:00Z]", + ), + ], + ) + def test_converts_items(self, items: list, expected: str) -> None: + assert _convert_manifest_data_items_to_string_list(items) == expected + + +class TestConvertManifestDataRowToDict: + """Tests for _convert_manifest_data_row_to_dict.""" + + def test_all_keys_present_passes_through(self) -> None: + row = {"path": "/f.txt", "parentId": "syn1", "name": "f.txt"} + keys = ["path", "parentId", "name"] + + result = _convert_manifest_data_row_to_dict(row, keys) + + assert result == {"path": "/f.txt", "parentId": "syn1", "name": "f.txt"} + + def test_missing_key_defaults_to_empty_string(self) -> None: + row = {"path": "/f.txt", "parentId": "syn1"} + keys = ["path", "parentId", "name"] + + result = _convert_manifest_data_row_to_dict(row, keys) + + assert result["name"] == "" + + def test_list_value_converted_to_string(self) -> None: + row = {"tags": ["a", "b", "c"]} + keys = ["tags"] + + result = _convert_manifest_data_row_to_dict(row, keys) + + assert result["tags"] == "[a,b,c]" + + def test_extra_keys_in_row_are_not_included_in_output(self) -> None: + row = {"path": "/f.txt", "extra": "ignored"} + keys = ["path"] + + result = _convert_manifest_data_row_to_dict(row, keys) + + assert "extra" not in result + assert result == {"path": "/f.txt"} From f759bf3d9c35752930905baad281646fe8c0a412 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 27 Apr 2026 12:00:48 -0700 Subject: [PATCH 14/26] reformatting --- docs/explanations/manifest_csv.md | 12 +-- .../tutorials/python/download_data_in_bulk.md | 76 +++++++++++++------ .../tutorial_scripts/download_data_in_bulk.py | 38 ++++++++-- mkdocs.yml | 2 +- 4 files changed, 90 insertions(+), 38 deletions(-) diff --git a/docs/explanations/manifest_csv.md b/docs/explanations/manifest_csv.md index ae542a800..e46d52f4a 100644 --- a/docs/explanations/manifest_csv.md +++ b/docs/explanations/manifest_csv.md @@ -17,7 +17,7 @@ The format of the manifest file is a comma-separated value (CSV) file with one r | parentId | Synapse ID of parent | syn1235 | !!! note - The legacy TSV manifest used a column named `parent`. The CSV manifest uses `parentId` instead, which is consistent with the Synapse REST API field name. If you are migrating an existing TSV manifest to CSV, rename the `parent` column to `parentId`. + The legacy TSV manifest used the columns `parent` and `id`, while the CSV manifest uses `parentId` and `ID` to align with Synapse REST API field names. If you’re migrating a TSV manifest to CSV, you’ll need to rename `parent` to `parentId` and `id` to `ID`. ### Standard fields @@ -60,11 +60,11 @@ Any columns that are not in the standard or metadata fields described above will Adding annotations to each row: -| path | parentId | annot1 | annot2 | annot3 | annot4 | annot5 | -| --- | --- | --- | --- | --- | --- | --- | -| /path/file1.txt | syn1243 | bar | 3.1415 | "aaaa, bbbb" | "[14,27,30]" | "Annotation, with a comma" | -| /path/file2.txt | syn12433 | baz | 2.71 | value_1 | "[1,2,3]" | test 123 | -| /path/file3.txt | syn12455 | zzz | 3.52 | value_3 | "[42,56,77]" | a single annotation | +| path | parentId | annot1 | annot2 | annot3 | annot4 | annot5 | annot6 | +| --- | --- | --- | --- | --- | --- | --- | --- | +| /path/file1.txt | syn1243 | bar | 3.1415 | "aaaa, bbbb" | "[14,27,30]" | "Annotation, with a comma" | "True" | +| /path/file2.txt | syn12433 | baz | 2.71 | value_1 | "[1,2,3]" | string without commas | "[True,False]" | +| /path/file3.txt | syn12455 | zzz | 3.52 | value_3 | "[42,56,77]" | a_single_string | | #### Multiple values of annotations per key diff --git a/docs/tutorials/python/download_data_in_bulk.md b/docs/tutorials/python/download_data_in_bulk.md index 04e27ec92..3534e141a 100644 --- a/docs/tutorials/python/download_data_in_bulk.md +++ b/docs/tutorials/python/download_data_in_bulk.md @@ -28,6 +28,7 @@ With a project that has this example layout: In this tutorial you will: 1. Download all files/folder from a project +1. Control manifest CSV generation during download 1. Download all files/folders for a specific folder within the project 1. Loop over all files/folders on the project/folder object instances @@ -44,48 +45,75 @@ another desired directory exists. #### First let's set up some constants we'll use in this script ```python -{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=5-19} +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:setup" ``` #### Next we'll create an instance of the Project we are going to sync ```python -{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=20-22} +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:get_project" ``` #### Finally we'll sync the project from synapse to your local machine ```python -{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=23-28} +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:sync_project" ```
While syncing your project you'll see results like: ``` -Syncing Project (syn53185532:My uniquely named project about Alzheimer's Disease) from Synapse. -Syncing Folder (syn53205630:experiment_notes) from Synapse. -Syncing Folder (syn53205632:notes_2022) from Synapse. -Syncing Folder (syn53205629:single_cell_RNAseq_batch_1) from Synapse. -Syncing Folder (syn53205656:single_cell_RNAseq_batch_2) from Synapse. -Syncing Folder (syn53205631:notes_2023) from Synapse. -Downloading [####################]100.00% 4.0bytes/4.0bytes (1.8kB/s) fileA.txt Done... -Downloading [####################]100.00% 3.0bytes/3.0bytes (1.1kB/s) SRR92345678_R1.fastq.gz Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (1.7kB/s) SRR12345678_R1.fastq.gz Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (1.9kB/s) fileC.txt Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (2.7kB/s) fileB.txt Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (2.7kB/s) SRR12345678_R2.fastq.gz Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (2.6kB/s) SRR12345678_R2.fastq.gz Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (1.8kB/s) SRR12345678_R1.fastq.gz Done... -Downloading [####################]100.00% 3.0bytes/3.0bytes (1.5kB/s) SRR92345678_R2.fastq.gz Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (1.6kB/s) fileD.txt Done... -['single_cell_RNAseq_batch_2', 'single_cell_RNAseq_batch_1', 'experiment_notes'] +[syn74583648:My uniquely named project about Alzheimer's Disease]: Syncing Project from Synapse. +[syn74584000:biospecimen_experiment_1]: Syncing Folder from Synapse. +[syn74584007:single_cell_RNAseq_batch_2]: Syncing Folder from Synapse. +[syn74584001:biospecimen_experiment_2]: Syncing Folder from Synapse. +[syn74584006:single_cell_RNAseq_batch_1]: Syncing Folder from Synapse. +[syn74584146]: Downloaded to /biospecimen_experiment_1/fileB.png +[syn74584154]: Downloaded to /biospecimen_experiment_2/fileD.png +[syn74584155]: Downloaded to /biospecimen_experiment_2/fileC.png +[syn74584188]: Downloaded to /single_cell_RNAseq_batch_1/SRR12345678_R1.fastq.png +[syn74584147]: Downloaded to /biospecimen_experiment_1/fileA.png +[syn74584206]: Downloaded to /single_cell_RNAseq_batch_2/SRR12345678_R1.fastq.png +[syn74584189]: Downloaded to /single_cell_RNAseq_batch_1/SRR12345678_R2.fastq.png +[syn74584207]: Downloaded to /single_cell_RNAseq_batch_2/SRR12345678_R2.fastq.png +Downloading files: 100%|████████████████████| 1.31M/1.31M [00:02<00:00, 606kB/s] +Project(id='syn74583648', name="My uniquely named project about Alzheimer's Disease", files=[], folders=[ + Folder(id='syn74584000', name='biospecimen_experiment_1', parent_id='syn74583648', files=[ + File(id='syn74584147', name='fileA.png', path='/biospecimen_experiment_1/fileA.png', parent_id='syn74584000', ...), + File(id='syn74584146', name='fileB.png', path='/biospecimen_experiment_1/fileB.png', parent_id='syn74584000', ...) + ], folders=[], ...), + Folder(id='syn74584001', name='biospecimen_experiment_2', parent_id='syn74583648', files=[ + File(id='syn74584155', name='fileC.png', path='/biospecimen_experiment_2/fileC.png', parent_id='syn74584001', ...), + File(id='syn74584154', name='fileD.png', path='/biospecimen_experiment_2/fileD.png', parent_id='syn74584001', ...) + ], folders=[], ...), + Folder(id='syn74584006', name='single_cell_RNAseq_batch_1', parent_id='syn74583648', files=[ + File(id='syn74584188', name='SRR12345678_R1.fastq.png', path='/single_cell_RNAseq_batch_1/SRR12345678_R1.fastq.png', parent_id='syn74584006', ...), + File(id='syn74584189', name='SRR12345678_R2.fastq.png', path='/single_cell_RNAseq_batch_1/SRR12345678_R2.fastq.png', parent_id='syn74584006', ...) + ], folders=[], ...), + Folder(id='syn74584007', name='single_cell_RNAseq_batch_2', parent_id='syn74583648', files=[ + File(id='syn74584206', name='SRR12345678_R1.fastq.png', path='/single_cell_RNAseq_batch_2/SRR12345678_R1.fastq.png', parent_id='syn74584007', ...), + File(id='syn74584207', name='SRR12345678_R2.fastq.png', path='/single_cell_RNAseq_batch_2/SRR12345678_R2.fastq.png', parent_id='syn74584007', ...) + ], folders=[], ...) +], ...) ```
-## 2. Download all files/folders for a specific folder within the project +## 2. Control manifest CSV generation during download + +By default (`manifest="all"`), `sync_from_synapse` writes a `manifest.csv` into every +synced directory. The manifest.csv is interoperable with sync_to_synapse, the Synapse UI download cart, and `download_list_files`. + +Use `manifest="root"` to write a single manifest at the root path, or +`manifest="suppress"` to skip manifest generation entirely. + +```python +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:sync_project_with_root_manifest" +``` + +## 3. Download all files/folders for a specific folder within the project Following the same set of steps let's sync a specific folder ```python -{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=30-36} +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:sync_folder" ```
@@ -105,12 +133,12 @@ download the content again. If you were to use an `if_collision` of `"overwrite. you would see that when the content on your machine does not match Synapse the file will be overwritten. -## 3. Loop over all files/folders on the project/folder object instances +## 4. Loop over all files/folders on the project/folder object instances Using `sync_from_synapse` will load into memory the state of all Folders and Files retrieved from Synapse. This will allow you to loop over the contents of your container. ```python -{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=37-47} +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:loop_over_project_folder" ```
diff --git a/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py b/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py index 0ca946f13..1757c1037 100644 --- a/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py +++ b/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py @@ -2,6 +2,7 @@ Here is where you'll find the code for the downloading data in bulk tutorial. """ +# --8<-- [start:setup] import os import synapseclient @@ -16,32 +17,55 @@ DIRECTORY_TO_SYNC_FOLDER_TO = os.path.join( DIRECTORY_TO_SYNC_PROJECT_TO, FOLDER_NAME_TO_SYNC ) +# --8<-- [end:setup] -# Step 1: Create an instance of the container I want to sync the data from and sync -project = Project(name="My uniquely named project about Alzheimer's Disease") -# We'll set the `if_collision` to `keep.local` so that we don't overwrite any files +# Step 1: Get an instance of the container I want to sync the data from and sync +# --8<-- [start:get_project] +project = Project(name="My uniquely named project about Alzheimer's Disease").get() +# --8<-- [end:get_project] + +# By default, sync_from_synapse generates a manifest.csv in each synced directory. +# The manifest.csv is interoperable with sync_to_synapse, the Synapse +# UI download cart, and `download_list_files`. +# --8<-- [start:sync_project] +# We'll set the `if_collision` to `keep.local` so that we don't overwrite any files. project.sync_from_synapse(path=DIRECTORY_TO_SYNC_PROJECT_TO, if_collision="keep.local") # Print out the contents of the directory where the data was synced to # Explore the directory to see the contents have been recursively synced. print(os.listdir(DIRECTORY_TO_SYNC_PROJECT_TO)) +# --8<-- [end:sync_project] +# Or, use `manifest="root"` to generate a single manifest.csv at the root directory +# instead of one in each sub-directory. Use `manifest="suppress"` to skip +# manifest generation entirely. + +# --8<-- [start:sync_project_with_root_manifest] +project.sync_from_synapse( + path=DIRECTORY_TO_SYNC_PROJECT_TO, + if_collision="keep.local", + manifest="root", +) +print(os.listdir(DIRECTORY_TO_SYNC_PROJECT_TO)) +# --8<-- [end:sync_project_with_root_manifest] -# Step 2: The same as step 1, but for a single folder +# Step 3: The same as step 1, but for a single folder +# --8<-- [start:sync_folder] folder = Folder(name=FOLDER_NAME_TO_SYNC, parent_id=project.id) folder.sync_from_synapse(path=DIRECTORY_TO_SYNC_FOLDER_TO, if_collision="keep.local") print(os.listdir(os.path.expanduser(DIRECTORY_TO_SYNC_FOLDER_TO))) +# --8<-- [end:sync_folder] -# Step 3: Loop over all files/folders on the project/folder object instances +# Step 4: Loop over all files/folders on the project/folder object instances +# --8<-- [start:loop_over_project_folder] for folder_at_root in project.folders: print(f"Folder at root: {folder_at_root.name}") - for file_in_root_folder in folder_at_root.files: print(f"File in {folder_at_root.name}: {file_in_root_folder.name}") - for folder_in_folder in folder_at_root.folders: print(f"Folder in {folder_at_root.name}: {folder_in_folder.name}") for file_in_folder in folder_in_folder.files: print(f"File in {folder_in_folder.name}: {file_in_folder.name}") +# --8<-- [end:loop_over_project_folder] diff --git a/mkdocs.yml b/mkdocs.yml index f61d9ec6e..1638308ea 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -164,7 +164,7 @@ nav: - Domain Models of Synapse: explanations/domain_models_of_synapse.md - Access Control: explanations/access_control.md - Properties vs Annotations: explanations/properties_vs_annotations.md - - Manifest TSV: explanations/manifest_tsv.md + - Manifest CSV: explanations/manifest_csv.md - Benchmarking: explanations/benchmarking.md - Structuring Your Project: explanations/structuring_your_project.md - Asyncio Changes in Python 3.14: explanations/asyncio_in_python_3_14.md From df4e111060e25d0fbb96f9c9fe029bda6ecbd866 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 27 Apr 2026 14:07:53 -0700 Subject: [PATCH 15/26] reorg manifest.py --- synapseclient/models/services/manifest.py | 157 +++++++++++----------- 1 file changed, 76 insertions(+), 81 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 9f6484add..6ab63b3d6 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -1,15 +1,46 @@ -"""Functions for generating manifest CSV files from File entities.""" +"""Services for reading and writing Synapse manifest CSV files. +This includes reading a manifest CSV file and preparing it for upload, as well +as writing a manifest CSV file from a list of File entities. +""" + +from __future__ import annotations + +import ast +import asyncio import csv import datetime import io import os -from typing import TYPE_CHECKING, Any, Union +import re +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Union +from synapseclient import Synapse from synapseclient.core import utils +from synapseclient.core.exceptions import ( + SynapseFileNotFoundError, + SynapseHTTPError, + SynapseProvenanceError, +) +from synapseclient.core.utils import ( + bool_or_none, + datetime_or_none, + get_synid_and_version, + is_synapse_id_str, + is_url, + test_import_pandas, + topolgical_sort, +) +from synapseclient.operations.factory_operations import FileOptions +from synapseclient.operations.factory_operations import get_async as factory_get_async if TYPE_CHECKING: - from synapseclient.models import File + from pandas import DataFrame, Series + + from synapseclient.models import UsedEntity, UsedURL + from synapseclient.models.file import File + MANIFEST_CSV_FILENAME = "manifest.csv" DEFAULT_GENERATED_MANIFEST_CSV_KEYS = [ @@ -24,6 +55,47 @@ "activityName", "activityDescription", ] +#: Scalar types that Synapse supports as annotation values. +SynapseAnnotationType = datetime.datetime | float | int | bool | str + +# Columns that are NOT annotations — stripped before building File.annotations. +# Covers the standard manifest columns plus the extra metadata columns produced +# by the Synapse UI download cart and synapse get-download-list CLI. +NON_ANNOTATION_COLUMNS = frozenset( + [ + # Standard manifest columns used directly during upload + "path", + "parentId", + "ID", + "name", + "synapseStore", + "contentType", + "activityName", + "activityDescription", + "forceVersion", + "used", + "executed", + # Download-list / Synapse UI informational columns — ignore for upload + "error", + "versionNumber", + "dataFileSizeBytes", + "createdBy", + "createdOn", + "modifiedBy", + "modifiedOn", + "synapseURL", + "dataFileMD5Hex", + ] +) + +# Regex patterns used when parsing annotation cell values. +# Matches a cell that is a bracket-delimited list, e.g. "[a, b, c]". +# Disallows ']' inside to avoid matching adjacent lists like "[a][b]". +_ARRAY_BRACKET_PATTERN = re.compile(r"^\[[^\]]*\]$") +# https://stackoverflow.com/questions/18893390/splitting-on-comma-outside-quotes +_COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)") +# Valid Synapse file name characters (1–256 chars). +_FILE_NAME_PATTERN = re.compile(r"^[`\w \-\+\.\(\)]{1,256}$") def _manifest_csv_filename(path: str) -> str: @@ -218,7 +290,7 @@ def _write_manifest_data_csv(path: str, keys: list[str], data: list[dict]) -> No writer.writerow(_convert_manifest_data_row_to_dict(row, keys)) -def generate_manifest_csv(all_files: list["File"], path: str) -> None: +def generate_manifest_csv(all_files: list[File], path: str) -> None: """Generates a manifest.csv file based on a list of File entities. The generated file uses CSV format with comma delimiter and is interoperable @@ -233,83 +305,6 @@ def generate_manifest_csv(all_files: list["File"], path: str) -> None: filename = _manifest_csv_filename(path=path) keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files) _write_manifest_data_csv(filename, keys, data) -"""Services for reading a Synapse manifest CSV file and preparing it for upload.""" - -from __future__ import annotations - -import ast -import asyncio -import datetime -import os -import re -from dataclasses import dataclass -from typing import TYPE_CHECKING, Iterable, NamedTuple - -from synapseclient import Synapse -from synapseclient.core.exceptions import ( - SynapseFileNotFoundError, - SynapseHTTPError, - SynapseProvenanceError, -) -from synapseclient.core.utils import ( - bool_or_none, - datetime_or_none, - get_synid_and_version, - is_synapse_id_str, - is_url, - test_import_pandas, - topolgical_sort, -) -from synapseclient.operations.factory_operations import FileOptions -from synapseclient.operations.factory_operations import get_async as factory_get_async - -if TYPE_CHECKING: - from pandas import DataFrame, Series - - from synapseclient.models import UsedEntity, UsedURL - from synapseclient.models.file import File - -#: Scalar types that Synapse supports as annotation values. -SynapseAnnotationType = datetime.datetime | float | int | bool | str - -# Columns that are NOT annotations — stripped before building File.annotations. -# Covers the standard manifest columns plus the extra metadata columns produced -# by the Synapse UI download cart and synapse get-download-list CLI. -NON_ANNOTATION_COLUMNS = frozenset( - [ - # Standard manifest columns used directly during upload - "path", - "parentId", - "ID", - "name", - "synapseStore", - "contentType", - "activityName", - "activityDescription", - "forceVersion", - "used", - "executed", - # Download-list / Synapse UI informational columns — ignore for upload - "error", - "versionNumber", - "dataFileSizeBytes", - "createdBy", - "createdOn", - "modifiedBy", - "modifiedOn", - "synapseURL", - "dataFileMD5Hex", - ] -) - -# Regex patterns used when parsing annotation cell values. -# Matches a cell that is a bracket-delimited list, e.g. "[a, b, c]". -# Disallows ']' inside to avoid matching adjacent lists like "[a][b]". -_ARRAY_BRACKET_PATTERN = re.compile(r"^\[[^\]]*\]$") -# https://stackoverflow.com/questions/18893390/splitting-on-comma-outside-quotes -_COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)") -# Valid Synapse file name characters (1–256 chars). -_FILE_NAME_PATTERN = re.compile(r"^[`\w \-\+\.\(\)]{1,256}$") class UploadSyncFile(NamedTuple): From b4f4a2a3b67496ac7dafe8c064673d73617ac430 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 27 Apr 2026 14:37:19 -0700 Subject: [PATCH 16/26] reformat --- .../models/async/unit_test_folder_async.py | 130 ++++++++++-------- 1 file changed, 76 insertions(+), 54 deletions(-) diff --git a/tests/unit/synapseclient/models/async/unit_test_folder_async.py b/tests/unit/synapseclient/models/async/unit_test_folder_async.py index 801fcdaa6..16de385dc 100644 --- a/tests/unit/synapseclient/models/async/unit_test_folder_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_folder_async.py @@ -884,21 +884,27 @@ async def mock_get_entity_bundle(entity_id, *args, **kwargs): return self.get_example_rest_api_folder_output() # WHEN I call sync_from_synapse with manifest="all" and a path - with patch( - "synapseclient.models.mixins.storable_container.get_children", - side_effect=mock_get_children, - ), patch( - "synapseclient.api.entity_factory.get_entity_id_bundle2", - side_effect=mock_get_entity_bundle, - ), patch( - "synapseclient.models.file.File.get_async", - side_effect=mock_file_get, - ), patch( - "synapseclient.models.mixins.storable_container.os.path.exists", - return_value=True, - ), patch( - "synapseclient.models.mixins.storable_container.generate_manifest_csv", - ) as mock_generate: + with ( + patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), + patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + side_effect=mock_get_entity_bundle, + ), + patch( + "synapseclient.models.file.File.get_async", + side_effect=mock_file_get, + ), + patch( + "synapseclient.models.mixins.storable_container.os.path.exists", + return_value=True, + ), + patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate, + ): await folder.sync_from_synapse_async( path="/tmp/mydir", manifest="all", synapse_client=self.syn ) @@ -933,19 +939,24 @@ async def mock_get_children(*args, **kwargs): ) # WHEN I call sync_from_synapse with manifest="root" and a path - with patch( - "synapseclient.models.mixins.storable_container.get_children", - side_effect=mock_get_children, - ), patch( - "synapseclient.api.entity_factory.get_entity_id_bundle2", - new_callable=AsyncMock, - return_value=self.get_example_rest_api_folder_output(), - ), patch( - "synapseclient.models.file.File.get_async", - return_value=downloaded_file, - ), patch( - "synapseclient.models.mixins.storable_container.generate_manifest_csv", - ) as mock_generate: + with ( + patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), + patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value=self.get_example_rest_api_folder_output(), + ), + patch( + "synapseclient.models.file.File.get_async", + return_value=downloaded_file, + ), + patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate, + ): await folder.sync_from_synapse_async( path="/tmp/mydir", manifest="root", synapse_client=self.syn ) @@ -967,19 +978,24 @@ async def mock_get_children(*args, **kwargs): yield child # WHEN I call sync_from_synapse with manifest="suppress" - with patch( - "synapseclient.models.mixins.storable_container.get_children", - side_effect=mock_get_children, - ), patch( - "synapseclient.api.entity_factory.get_entity_id_bundle2", - new_callable=AsyncMock, - return_value=self.get_example_rest_api_folder_output(), - ), patch( - "synapseclient.models.file.File.get_async", - return_value=(File(id=SYN_456, name="example_file_1")), - ), patch( - "synapseclient.models.mixins.storable_container.generate_manifest_csv", - ) as mock_generate: + with ( + patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), + patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value=self.get_example_rest_api_folder_output(), + ), + patch( + "synapseclient.models.file.File.get_async", + return_value=(File(id=SYN_456, name="example_file_1")), + ), + patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate, + ): await folder.sync_from_synapse_async( path="/tmp/mydir", manifest="suppress", synapse_client=self.syn ) @@ -997,24 +1013,30 @@ async def mock_get_children(*args, **kwargs): yield child # WHEN I call sync_from_synapse with no path (default manifest="all") - with patch( - "synapseclient.models.mixins.storable_container.get_children", - side_effect=mock_get_children, - ), patch( - "synapseclient.api.entity_factory.get_entity_id_bundle2", - new_callable=AsyncMock, - return_value=self.get_example_rest_api_folder_output(), - ), patch( - "synapseclient.models.file.File.get_async", - return_value=(File(id=SYN_456, name="example_file_1")), - ), patch( - "synapseclient.models.mixins.storable_container.generate_manifest_csv", - ) as mock_generate: + with ( + patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), + patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value=self.get_example_rest_api_folder_output(), + ), + patch( + "synapseclient.models.file.File.get_async", + return_value=(File(id=SYN_456, name="example_file_1")), + ), + patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate, + ): await folder.sync_from_synapse_async(synapse_client=self.syn) # THEN generate_manifest_csv should not be called (no path to write to) mock_generate.assert_not_called() + class TestStorageLocationMixin: """Tests for ProjectSettingsMixin methods on Folder.""" From 0210ecf5f56d1214351838784463d018be42a9e1 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 27 Apr 2026 14:50:13 -0700 Subject: [PATCH 17/26] fix path issue on windows --- .../unit/synapseclient/models/async/unit_test_folder_async.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/synapseclient/models/async/unit_test_folder_async.py b/tests/unit/synapseclient/models/async/unit_test_folder_async.py index 16de385dc..c296496f9 100644 --- a/tests/unit/synapseclient/models/async/unit_test_folder_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_folder_async.py @@ -1,5 +1,6 @@ """Tests for the Folder class.""" +import os import uuid from typing import Dict from unittest.mock import AsyncMock, MagicMock, patch @@ -917,7 +918,8 @@ async def mock_get_entity_bundle(entity_id, *args, **kwargs): } assert any(f.id == SYN_456 for f in calls_by_path["/tmp/mydir"]) assert any( - f.id == FILE_2_ID for f in calls_by_path[f"/tmp/mydir/{SUB_FOLDER_NAME}"] + f.id == FILE_2_ID + for f in calls_by_path[os.path.join("/tmp/mydir", SUB_FOLDER_NAME)] ) async def test_sync_from_synapse_manifest_root_generates_only_at_root( From 7dff9afe4de6799b0b779720923261075b2f9f85 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 30 Apr 2026 13:21:23 -0700 Subject: [PATCH 18/26] add typing hint and update warning message --- .../models/mixins/storable_container.py | 16 +++++++--------- .../protocols/storable_container_protocol.py | 4 ++-- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index 14f1b84b6..c0ef3c82c 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -49,6 +49,7 @@ StorableContainerSynchronousProtocol, ) from synapseclient.models.services.manifest import ( + generate_manifest_csv, read_manifest_for_upload, upload_sync_files, ) @@ -72,8 +73,6 @@ VirtualTable, ) -from synapseclient.models.services.manifest import generate_manifest_csv - @async_to_sync class StorableContainer(StorableContainerSynchronousProtocol): @@ -482,6 +481,11 @@ async def _sync_from_synapse_async( All arguments are passed through from the wrapper function. """ + if manifest not in ("all", "root", "suppress"): + raise ValueError( + f"Invalid manifest value: {manifest}. Must be one of: 'all', 'root', 'suppress'." + ) + syn = Synapse.get_client(synapse_client=synapse_client) if not self._last_persistent_instance: await self.get_async(synapse_client=syn) @@ -489,12 +493,6 @@ async def _sync_from_synapse_async( f"[{self.id}:{self.name}]: Syncing {self.__class__.__name__} from Synapse." ) - if manifest not in ("all", "root", "suppress"): - raise ValueError( - f"[{self.id}:{self.name}]: Invalid manifest value: {manifest}. " - "Must be one of: 'all', 'root', 'suppress'." - ) - path = os.path.expanduser(path) if path else None children = await self._retrieve_children( @@ -1312,7 +1310,7 @@ async def _wrap_recursive_get_children( synapse_client=synapse_client, queue=queue, include_types=include_types, - manifest="suppress", # suppress manifest generation for recursive calls since the root covers the path already in map_directory_to_all_contained_files + manifest="suppress", # The manifest is suppressed for child folders because they’re already accounted for when iterating through their parent folder. This is handled in the map_directory_to_all_contained_files function, which returns all files in the directory, including those in its child directories. ) def _create_task_for_child( diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py index 160d00d9b..413b3dfd0 100644 --- a/synapseclient/models/protocols/storable_container_protocol.py +++ b/synapseclient/models/protocols/storable_container_protocol.py @@ -2,7 +2,7 @@ generated at runtime.""" import asyncio -from typing import TYPE_CHECKING, List, Optional, Protocol +from typing import TYPE_CHECKING, List, Literal, Optional, Protocol from typing_extensions import Self @@ -35,7 +35,7 @@ def sync_from_synapse( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, - manifest: str = "all", + manifest: Literal["all", "root", "suppress"] = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: From 74f1978f3f1ac95cf43934e2a0967e34df29d7df Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 30 Apr 2026 14:21:46 -0700 Subject: [PATCH 19/26] Create a manifest with just a header when all_files is empty --- synapseclient/models/services/manifest.py | 16 ++++++++-- .../services/unit_test_manifest.py | 32 +++++++++++++++++-- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 6ab63b3d6..0ea56669c 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -289,6 +289,9 @@ def _write_manifest_data_csv(path: str, keys: list[str], data: list[dict]) -> No for row in data: writer.writerow(_convert_manifest_data_row_to_dict(row, keys)) + # add a log message to the console + print(f"Manifest file {path} has been generated.") + def generate_manifest_csv(all_files: list[File], path: str) -> None: """Generates a manifest.csv file based on a list of File entities. @@ -296,12 +299,21 @@ def generate_manifest_csv(all_files: list[File], path: str) -> None: The generated file uses CSV format with comma delimiter and is interoperable with the Synapse UI download cart. Column names follow the new convention: `parentId` (instead of `parent`) and `ID` (instead of `id`). + If all_files is empty, a manifest.csv with only the header row will be generated. + If path is None, a ValueError will be raised. - Arguments: + Args: all_files: A list of File model objects. path: The directory path where manifest.csv will be written. + + raises: + ValueError: If path is None. """ - if path and all_files: + if not path: + raise ValueError( + "The path argument is required to generate a manifest.csv file." + ) + if path: filename = _manifest_csv_filename(path=path) keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files) _write_manifest_data_csv(filename, keys, data) diff --git a/tests/unit/synapseclient/services/unit_test_manifest.py b/tests/unit/synapseclient/services/unit_test_manifest.py index fe5295d7c..e13f8f574 100644 --- a/tests/unit/synapseclient/services/unit_test_manifest.py +++ b/tests/unit/synapseclient/services/unit_test_manifest.py @@ -158,14 +158,40 @@ def test_generate_manifest_csv_data_items_are_converted_to_strings(self) -> None assert row["single_dt"] == "2020-01-01T00:00:00Z" assert row["multi_dt"] == "[2020-01-01T00:00:00Z,2021-06-15T12:30:00Z]" - def test_generate_manifest_csv_skips_when_no_files(self) -> None: + def test_generate_manifest_csv_with_only_header_row(self) -> None: # GIVEN an empty file list with tempfile.TemporaryDirectory() as tmpdir: # WHEN generate_manifest_csv is called with no files generate_manifest_csv(all_files=[], path=tmpdir) - # THEN no manifest.csv is created - assert not os.path.exists(os.path.join(tmpdir, "manifest.csv")) + # THEN the manifest.csv file is created with only the header row and no data rows + manifest_path = os.path.join(tmpdir, "manifest.csv") + with open(manifest_path, newline="", encoding="utf8") as fp: + reader = csv.DictReader(fp) + rows = list(reader) + assert reader.fieldnames == [ + "path", + "parentId", + "name", + "ID", + "synapseStore", + "contentType", + "used", + "executed", + "activityName", + "activityDescription", + ] + assert rows == [] + + def test_generate_manifest_csv_with_path_None_raises_ValueError(self) -> None: + # GIVEN an empty file list + with tempfile.TemporaryDirectory() as tmpdir: + # WHEN generate_manifest_csv is called with path=None + with pytest.raises( + ValueError, + match="The path argument is required to generate a manifest.csv file.", + ): + generate_manifest_csv(all_files=[], path=None) def test_generate_manifest_csv_quotes_values_with_commas(self) -> None: # GIVEN a File whose name contains a comma From c0efb897c8331abfebe3ec9ad1a49b359098a0a7 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 30 Apr 2026 14:34:55 -0700 Subject: [PATCH 20/26] reorganize examples for _convert_manifest_data_items_to_string_list --- synapseclient/models/services/manifest.py | 51 +++++++++++++---------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 0ea56669c..e5a87beaa 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -139,29 +139,34 @@ def _convert_manifest_data_items_to_string_list( When working with non strings we are printing the non-quoted version of the object. - Example: Examples - Several examples of how this function works. - - >>> _convert_manifest_data_items_to_string_list(["a", "b", "c"]) - '[a,b,c]' - >>> _convert_manifest_data_items_to_string_list(["string,with,commas", "string without commas"]) - '["string,with,commas",string without commas]' - >>> _convert_manifest_data_items_to_string_list(["string,with,commas"]) - 'string,with,commas' - >>> _convert_manifest_data_items_to_string_list - ([datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)]) - '2020-01-01T00:00:00Z' - >>> _convert_manifest_data_items_to_string_list([True]) - 'True' - >>> _convert_manifest_data_items_to_string_list([1]) - '1' - >>> _convert_manifest_data_items_to_string_list([1.0]) - '1.0' - >>> _convert_manifest_data_items_to_string_list - ([datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc), - datetime.datetime(2021, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)]) - '[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]' - + Example: Single-element lists are unwrapped + A list with one item returns the item directly, not wrapped in brackets. + ```python + _convert_manifest_data_items_to_string_list(["string,with,commas"]) # 'string,with,commas' + _convert_manifest_data_items_to_string_list([True]) # 'True' + _convert_manifest_data_items_to_string_list([1]) # '1' + _convert_manifest_data_items_to_string_list([1.0]) # '1.0' + _convert_manifest_data_items_to_string_list( + [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)] + ) # '2020-01-01T00:00:00Z' + ``` + + Example: Multi-element lists are bracket-wrapped + Multiple items are joined with commas inside `[...]`. String items containing + commas are individually quoted. + ```python + _convert_manifest_data_items_to_string_list(["a", "b", "c"]) + # '[a,b,c]' + _convert_manifest_data_items_to_string_list([True, False]) + # '[True,False]' + _convert_manifest_data_items_to_string_list(["string,with,commas", "string without commas"]) + # '["string,with,commas",string without commas]' + _convert_manifest_data_items_to_string_list( + [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc), + datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc)] + ) + # '[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]' + ``` Args: items: The list of items to convert. From dc2c9a8660ac6941fd6f5896e1398fed43ad8a82 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 4 May 2026 23:34:15 -0700 Subject: [PATCH 21/26] add manifest setting global variable --- synapseclient/models/mixins/storable_container.py | 7 +++++-- .../models/protocols/storable_container_protocol.py | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index c0ef3c82c..f61f6fa6b 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -74,6 +74,9 @@ ) +ManifestSetting = Literal["all", "suppress", "root"] + + @async_to_sync class StorableContainer(StorableContainerSynchronousProtocol): """ @@ -169,7 +172,7 @@ async def sync_from_synapse_async( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, - manifest: Literal["all", "suppress", "root"] = "all", + manifest: ManifestSetting = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -472,7 +475,7 @@ async def _sync_from_synapse_async( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, - manifest: Literal["all", "suppress", "root"] = "all", + manifest: ManifestSetting = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py index 413b3dfd0..f98ab11a7 100644 --- a/synapseclient/models/protocols/storable_container_protocol.py +++ b/synapseclient/models/protocols/storable_container_protocol.py @@ -2,12 +2,13 @@ generated at runtime.""" import asyncio -from typing import TYPE_CHECKING, List, Literal, Optional, Protocol +from typing import TYPE_CHECKING, List, Optional, Protocol from typing_extensions import Self from synapseclient import Synapse from synapseclient.core.constants.method_flags import COLLISION_OVERWRITE_LOCAL +from synapseclient.models.mixins.storable_container import ManifestSetting from synapseclient.models.services.storable_entity_components import ( MANIFEST_UPLOAD_MAX_RETRIES, FailureStrategy, @@ -35,7 +36,7 @@ def sync_from_synapse( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, - manifest: Literal["all", "root", "suppress"] = "all", + manifest: ManifestSetting = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: From a7f2863590fe76ab6fe1f68d9caa6c690ee6ea87 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Tue, 5 May 2026 10:25:06 -0700 Subject: [PATCH 22/26] add manifest setting global variable --- synapseclient/models/mixins/storable_container.py | 5 +---- .../models/protocols/storable_container_protocol.py | 5 +++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index f61f6fa6b..17ea07cf0 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -8,7 +8,6 @@ Dict, Generator, List, - Literal, NoReturn, Optional, Tuple, @@ -46,6 +45,7 @@ shared_progress_bar as upload_shared_progress_bar, ) from synapseclient.models.protocols.storable_container_protocol import ( + ManifestSetting, StorableContainerSynchronousProtocol, ) from synapseclient.models.services.manifest import ( @@ -74,9 +74,6 @@ ) -ManifestSetting = Literal["all", "suppress", "root"] - - @async_to_sync class StorableContainer(StorableContainerSynchronousProtocol): """ diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py index f98ab11a7..c86a099f8 100644 --- a/synapseclient/models/protocols/storable_container_protocol.py +++ b/synapseclient/models/protocols/storable_container_protocol.py @@ -2,13 +2,12 @@ generated at runtime.""" import asyncio -from typing import TYPE_CHECKING, List, Optional, Protocol +from typing import TYPE_CHECKING, List, Literal, Optional, Protocol from typing_extensions import Self from synapseclient import Synapse from synapseclient.core.constants.method_flags import COLLISION_OVERWRITE_LOCAL -from synapseclient.models.mixins.storable_container import ManifestSetting from synapseclient.models.services.storable_entity_components import ( MANIFEST_UPLOAD_MAX_RETRIES, FailureStrategy, @@ -17,6 +16,8 @@ if TYPE_CHECKING: from synapseclient.models.file import File +ManifestSetting = Literal["all", "suppress", "root"] + class StorableContainerSynchronousProtocol(Protocol): """ From 209b6464a58dde35215e363c0f8c520c56a2dc4c Mon Sep 17 00:00:00 2001 From: danlu1 Date: Tue, 5 May 2026 10:29:44 -0700 Subject: [PATCH 23/26] update docstring --- synapseclient/models/services/manifest.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index e5a87beaa..4a5a5f0e8 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -127,10 +127,10 @@ def _get_entity_provenance_dict_for_manifest(entity: File) -> dict[str, str]: def _convert_manifest_data_items_to_string_list( items: list[Union[str, datetime.datetime, bool, int, float]], -) -> Union[str, list[str]]: +) -> str: """ Handle coverting an individual key that contains a possible list of data into a - list of strings or objects that can be written to the manifest file. + string representation of the items that can be written to the manifest file. This has specific logic around how to handle datetime fields. @@ -172,7 +172,7 @@ def _convert_manifest_data_items_to_string_list( items: The list of items to convert. Returns: - The list of items converted to strings. + The string representation of the items. """ items_to_write = [] for item in items: @@ -269,7 +269,9 @@ def _convert_manifest_data_row_to_dict(row: dict, keys: list[str]) -> dict: return data_to_write -def _write_manifest_data_csv(path: str, keys: list[str], data: list[dict]) -> None: +def _write_manifest_data_csv( + path: str, keys: list[str], data: list[dict], syn: Synapse +) -> None: """Writes manifest data to a CSV file using csv.DictWriter with QUOTE_MINIMAL that automatically quotes any cell containing a comma, newline, or the quote character. Each row dict is normalized via _convert_manifest_data_row_to_dict so that @@ -294,8 +296,7 @@ def _write_manifest_data_csv(path: str, keys: list[str], data: list[dict]) -> No for row in data: writer.writerow(_convert_manifest_data_row_to_dict(row, keys)) - # add a log message to the console - print(f"Manifest file {path} has been generated.") + syn.logger.info(f"Manifest file {path} has been generated.") def generate_manifest_csv(all_files: list[File], path: str) -> None: From c89a62e3d62750a8fe4016e84cd39d382054e379 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Tue, 5 May 2026 10:47:31 -0700 Subject: [PATCH 24/26] remove redundant if statement --- synapseclient/models/services/manifest.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 4a5a5f0e8..5088c4eea 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -319,10 +319,9 @@ def generate_manifest_csv(all_files: list[File], path: str) -> None: raise ValueError( "The path argument is required to generate a manifest.csv file." ) - if path: - filename = _manifest_csv_filename(path=path) - keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files) - _write_manifest_data_csv(filename, keys, data) + filename = _manifest_csv_filename(path=path) + keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files) + _write_manifest_data_csv(filename, keys, data) class UploadSyncFile(NamedTuple): From 21de74062c6c2f4badf9ec4a9e5b765b34921085 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Tue, 5 May 2026 12:35:31 -0700 Subject: [PATCH 25/26] replace print with logging.info --- .../models/mixins/storable_container.py | 6 +- synapseclient/models/services/manifest.py | 6 +- .../services/unit_test_manifest.py | 111 ++++++++++++++---- 3 files changed, 97 insertions(+), 26 deletions(-) diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py index 17ea07cf0..54123f89b 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -425,11 +425,11 @@ async def my_function(): opt manifest != "suppress" and path is set alt manifest == "all" loop For each directory path - sync_from_synapse->>manifest: call `generate_manifest_csv(files, dir_path)` + sync_from_synapse->>manifest: call `generate_manifest_csv(files, dir_path, syn)` manifest-->>sync_from_synapse: manifest.csv written to dir_path end else manifest == "root" - sync_from_synapse->>manifest: call `generate_manifest_csv(all_files, root_path)` + sync_from_synapse->>manifest: call `generate_manifest_csv(all_files, root_path, syn)` manifest-->>sync_from_synapse: manifest.csv written to root_path end end @@ -573,11 +573,13 @@ async def _sync_from_synapse_async( generate_manifest_csv( all_files=file_entities, path=directory_path, + syn=syn, ) elif manifest == "root": generate_manifest_csv( all_files=self.flatten_file_list(), path=path, + syn=syn, ) return self diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 5088c4eea..bd81bde6c 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -299,7 +299,7 @@ def _write_manifest_data_csv( syn.logger.info(f"Manifest file {path} has been generated.") -def generate_manifest_csv(all_files: list[File], path: str) -> None: +def generate_manifest_csv(all_files: list[File], path: str, syn: Synapse) -> None: """Generates a manifest.csv file based on a list of File entities. The generated file uses CSV format with comma delimiter and is interoperable @@ -311,7 +311,7 @@ def generate_manifest_csv(all_files: list[File], path: str) -> None: Args: all_files: A list of File model objects. path: The directory path where manifest.csv will be written. - + syn: The Synapse client. raises: ValueError: If path is None. """ @@ -321,7 +321,7 @@ def generate_manifest_csv(all_files: list[File], path: str) -> None: ) filename = _manifest_csv_filename(path=path) keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files) - _write_manifest_data_csv(filename, keys, data) + _write_manifest_data_csv(path=filename, keys=keys, data=data, syn=syn) class UploadSyncFile(NamedTuple): diff --git a/tests/unit/synapseclient/services/unit_test_manifest.py b/tests/unit/synapseclient/services/unit_test_manifest.py index e13f8f574..ef276aeaf 100644 --- a/tests/unit/synapseclient/services/unit_test_manifest.py +++ b/tests/unit/synapseclient/services/unit_test_manifest.py @@ -2,6 +2,7 @@ import datetime import os import tempfile +from unittest.mock import patch import pytest @@ -53,6 +54,10 @@ def test_filename_is_manifest_csv(self) -> None: class TestGenerateManifestCsv: """Tests for the generate_manifest_csv and related helper functions.""" + @pytest.fixture(scope="function", autouse=True) + def setup_method(self, syn) -> None: + self.syn = syn + def _make_file( self, syn_id: str = "syn123", @@ -140,14 +145,20 @@ def test_generate_manifest_csv_data_items_are_converted_to_strings(self) -> None }, ) - with tempfile.TemporaryDirectory() as tmpdir: + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch.object(self.syn.logger, "info") as mock_logger_info, + ): # WHEN generate_manifest_csv is called - generate_manifest_csv(all_files=[f], path=tmpdir) + generate_manifest_csv(all_files=[f], path=tmpdir, syn=self.syn) manifest_path = os.path.join(tmpdir, "manifest.csv") content = open(manifest_path, encoding="utf8").read() with open(manifest_path, newline="", encoding="utf8") as fp: row = next(csv.DictReader(fp)) + mock_logger_info.assert_called_once_with( + f"Manifest file {manifest_path} has been generated." + ) assert '"a, b, c"' in content assert row["single_str"] == "hello" assert row["multi_str"] == "[a,b,c]" @@ -160,9 +171,12 @@ def test_generate_manifest_csv_data_items_are_converted_to_strings(self) -> None def test_generate_manifest_csv_with_only_header_row(self) -> None: # GIVEN an empty file list - with tempfile.TemporaryDirectory() as tmpdir: + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch.object(self.syn.logger, "info") as mock_logger_info, + ): # WHEN generate_manifest_csv is called with no files - generate_manifest_csv(all_files=[], path=tmpdir) + generate_manifest_csv(all_files=[], path=tmpdir, syn=self.syn) # THEN the manifest.csv file is created with only the header row and no data rows manifest_path = os.path.join(tmpdir, "manifest.csv") @@ -182,6 +196,9 @@ def test_generate_manifest_csv_with_only_header_row(self) -> None: "activityDescription", ] assert rows == [] + mock_logger_info.assert_called_once_with( + f"Manifest file {manifest_path} has been generated." + ) def test_generate_manifest_csv_with_path_None_raises_ValueError(self) -> None: # GIVEN an empty file list @@ -191,32 +208,45 @@ def test_generate_manifest_csv_with_path_None_raises_ValueError(self) -> None: ValueError, match="The path argument is required to generate a manifest.csv file.", ): - generate_manifest_csv(all_files=[], path=None) + generate_manifest_csv(all_files=[], path=None, syn=self.syn) def test_generate_manifest_csv_quotes_values_with_commas(self) -> None: # GIVEN a File whose name contains a comma f = self._make_file(name="file, extra.txt", path="/tmp/file, extra.txt") - with tempfile.TemporaryDirectory() as tmpdir: - generate_manifest_csv(all_files=[f], path=tmpdir) + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch.object(self.syn.logger, "info") as mock_logger_info, + ): + generate_manifest_csv(all_files=[f], path=tmpdir, syn=self.syn) manifest_path = os.path.join(tmpdir, "manifest.csv") content = open(manifest_path, encoding="utf8").read() # THEN the comma-containing value is quoted in the CSV assert '"file, extra.txt"' in content + mock_logger_info.assert_called_once_with( + f"Manifest file {manifest_path} has been generated." + ) class TestWriteManifestDataCsv: """Tests for the _write_manifest_data_csv helper.""" + @pytest.fixture(scope="function", autouse=True) + def setup_method(self, syn) -> None: + self.syn = syn + def test_writes_header_and_rows(self) -> None: # GIVEN keys and one row of data keys = ["path", "parentId", "name"] data = [{"path": "/data/f.txt", "parentId": "syn1", "name": "f.txt"}] - with tempfile.TemporaryDirectory() as tmpdir: + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch.object(self.syn.logger, "info") as mock_logger_info, + ): filename = os.path.join(tmpdir, "manifest.csv") # WHEN _write_manifest_data_csv is called - _write_manifest_data_csv(filename, keys, data) + _write_manifest_data_csv(filename, keys, data, syn=self.syn) with open(filename, newline="", encoding="utf8") as fp: rows = list(csv.DictReader(fp)) @@ -226,32 +256,44 @@ def test_writes_header_and_rows(self) -> None: assert rows[0]["path"] == "/data/f.txt" assert rows[0]["parentId"] == "syn1" assert rows[0]["name"] == "f.txt" + mock_logger_info.assert_called_once_with( + f"Manifest file {filename} has been generated." + ) def test_missing_keys_use_empty_string(self) -> None: # GIVEN a row missing the "name" key keys = ["path", "parentId", "name"] data = [{"path": "/data/f.txt", "parentId": "syn1"}] - with tempfile.TemporaryDirectory() as tmpdir: + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch.object(self.syn.logger, "info") as mock_logger_info, + ): filename = os.path.join(tmpdir, "manifest.csv") - _write_manifest_data_csv(filename, keys, data) + _write_manifest_data_csv(filename, keys, data, syn=self.syn) with open(filename, newline="", encoding="utf8") as fp: rows = list(csv.DictReader(fp)) # THEN the missing field is written as an empty string assert rows[0]["name"] == "" + mock_logger_info.assert_called_once_with( + f"Manifest file {filename} has been generated." + ) def test_extra_keys_in_row_are_ignored(self) -> None: # GIVEN a row with a key not in the fieldnames list keys = ["path", "name"] data = [{"path": "/data/f.txt", "name": "f.txt", "extra": "ignored"}] - with tempfile.TemporaryDirectory() as tmpdir: + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch.object(self.syn.logger, "info") as mock_logger_info, + ): filename = os.path.join(tmpdir, "manifest.csv") # WHEN _write_manifest_data_csv is called # THEN no exception is raised and only declared keys appear - _write_manifest_data_csv(filename, keys, data) + _write_manifest_data_csv(filename, keys, data, syn=self.syn) with open(filename, newline="", encoding="utf8") as fp: reader = csv.DictReader(fp) @@ -259,15 +301,21 @@ def test_extra_keys_in_row_are_ignored(self) -> None: assert "extra" not in reader.fieldnames assert rows[0]["path"] == "/data/f.txt" + mock_logger_info.assert_called_once_with( + f"Manifest file {filename} has been generated." + ) def test_values_with_commas_are_quoted(self) -> None: # GIVEN a value that contains a comma keys = ["name", "parentId"] data = [{"name": "file, with comma.txt", "parentId": "syn1"}] - with tempfile.TemporaryDirectory() as tmpdir: + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch.object(self.syn.logger, "info") as mock_logger_info, + ): filename = os.path.join(tmpdir, "manifest.csv") - _write_manifest_data_csv(filename, keys, data) + _write_manifest_data_csv(filename, keys, data, syn=self.syn) content = open(filename, encoding="utf8").read() with open(filename, newline="", encoding="utf8") as fp: @@ -277,14 +325,20 @@ def test_values_with_commas_are_quoted(self) -> None: assert '"file, with comma.txt"' in content # AND reads back correctly assert rows[0]["name"] == "file, with comma.txt" + mock_logger_info.assert_called_once_with( + f"Manifest file {filename} has been generated." + ) def test_empty_data_writes_header_only(self) -> None: # GIVEN no data rows keys = ["path", "parentId", "name"] - with tempfile.TemporaryDirectory() as tmpdir: + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch.object(self.syn.logger, "info") as mock_logger_info, + ): filename = os.path.join(tmpdir, "manifest.csv") - _write_manifest_data_csv(filename, keys, []) + _write_manifest_data_csv(filename, keys, [], syn=self.syn) with open(filename, newline="", encoding="utf8") as fp: reader = csv.DictReader(fp) @@ -294,21 +348,30 @@ def test_empty_data_writes_header_only(self) -> None: # THEN the file exists with only the header assert rows == [] assert header == keys + mock_logger_info.assert_called_once_with( + f"Manifest file {filename} has been generated." + ) def test_unicode_values_are_written_correctly(self) -> None: # GIVEN a value with non-ASCII characters keys = ["name", "parentId"] data = [{"name": "données_été.txt", "parentId": "syn1"}] - with tempfile.TemporaryDirectory() as tmpdir: + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch.object(self.syn.logger, "info") as mock_logger_info, + ): filename = os.path.join(tmpdir, "manifest.csv") - _write_manifest_data_csv(filename, keys, data) + _write_manifest_data_csv(filename, keys, data, syn=self.syn) with open(filename, newline="", encoding="utf8") as fp: rows = list(csv.DictReader(fp)) # THEN unicode characters round-trip correctly assert rows[0]["name"] == "données_été.txt" + mock_logger_info.assert_called_once_with( + f"Manifest file {filename} has been generated." + ) def test_multiple_rows_written_in_order(self) -> None: # GIVEN multiple rows @@ -319,9 +382,12 @@ def test_multiple_rows_written_in_order(self) -> None: {"name": "c.txt", "parentId": "syn3"}, ] - with tempfile.TemporaryDirectory() as tmpdir: + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch.object(self.syn.logger, "info") as mock_logger_info, + ): filename = os.path.join(tmpdir, "manifest.csv") - _write_manifest_data_csv(filename, keys, data) + _write_manifest_data_csv(filename, keys, data, syn=self.syn) with open(filename, newline="", encoding="utf8") as fp: rows = list(csv.DictReader(fp)) @@ -329,6 +395,9 @@ def test_multiple_rows_written_in_order(self) -> None: # THEN all rows are present and in order assert len(rows) == 3 assert [r["name"] for r in rows] == ["a.txt", "b.txt", "c.txt"] + mock_logger_info.assert_called_once_with( + f"Manifest file {filename} has been generated." + ) class TestGetEntityProvenanceDictForManifest: From 7f829a2ed575813f89602ada939ca60107da37f6 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Tue, 5 May 2026 13:28:33 -0700 Subject: [PATCH 26/26] add test cases for logging info --- .../models/async/test_folder_async.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/integration/synapseclient/models/async/test_folder_async.py b/tests/integration/synapseclient/models/async/test_folder_async.py index cc04238fb..067ceb989 100644 --- a/tests/integration/synapseclient/models/async/test_folder_async.py +++ b/tests/integration/synapseclient/models/async/test_folder_async.py @@ -6,6 +6,7 @@ import tempfile import uuid from typing import Callable, List +from unittest.mock import patch import pytest @@ -1050,3 +1051,46 @@ async def test_manifest_includes_provenance(self, project_model: Project) -> Non assert row["activityName"] == "my_activity" assert row["activityDescription"] == "my_description" assert row["used"] == "my_source" + + async def test_manifest_generation_logs_info_per_directory( + self, project_model: Project + ) -> None: + # GIVEN a root folder with a file and a nested subfolder with its own file + root_folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + root_folder = await root_folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(root_folder.id) + + root_file = self.create_file_instance() + root_file.parent_id = root_folder.id + root_file = await root_file.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(root_file.id) + + sub_folder = Folder(name=str(uuid.uuid4()), parent_id=root_folder.id) + sub_folder = await sub_folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(sub_folder.id) + + sub_file = self.create_file_instance() + sub_file.parent_id = sub_folder.id + sub_file = await sub_file.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(sub_file.id) + + with tempfile.TemporaryDirectory() as tmpdir: + root_manifest = os.path.join(tmpdir, "manifest.csv") + sub_manifest = os.path.join(tmpdir, sub_folder.name, "manifest.csv") + + with patch.object(self.syn.logger, "info") as mock_logger_info: + # WHEN I sync with manifest="all" + await root_folder.sync_from_synapse_async( + path=tmpdir, + manifest="all", + synapse_client=self.syn, + ) + + # THEN _write_manifest_data_csv logs one message per generated manifest + logged_messages = [call.args[0] for call in mock_logger_info.call_args_list] + assert ( + f"Manifest file {root_manifest} has been generated." in logged_messages + ) + assert ( + f"Manifest file {sub_manifest} has been generated." in logged_messages + )