diff --git a/docs/explanations/manifest_csv.md b/docs/explanations/manifest_csv.md
index ae542a800..e46d52f4a 100644
--- a/docs/explanations/manifest_csv.md
+++ b/docs/explanations/manifest_csv.md
@@ -17,7 +17,7 @@ The format of the manifest file is a comma-separated value (CSV) file with one r
| parentId | Synapse ID of parent | syn1235 |
!!! note
- The legacy TSV manifest used a column named `parent`. The CSV manifest uses `parentId` instead, which is consistent with the Synapse REST API field name. If you are migrating an existing TSV manifest to CSV, rename the `parent` column to `parentId`.
+ The legacy TSV manifest used the columns `parent` and `id`, while the CSV manifest uses `parentId` and `ID` to align with Synapse REST API field names. If you’re migrating a TSV manifest to CSV, you’ll need to rename `parent` to `parentId` and `id` to `ID`.
### Standard fields
@@ -60,11 +60,11 @@ Any columns that are not in the standard or metadata fields described above will
Adding annotations to each row:
-| path | parentId | annot1 | annot2 | annot3 | annot4 | annot5 |
-| --- | --- | --- | --- | --- | --- | --- |
-| /path/file1.txt | syn1243 | bar | 3.1415 | "aaaa, bbbb" | "[14,27,30]" | "Annotation, with a comma" |
-| /path/file2.txt | syn12433 | baz | 2.71 | value_1 | "[1,2,3]" | test 123 |
-| /path/file3.txt | syn12455 | zzz | 3.52 | value_3 | "[42,56,77]" | a single annotation |
+| path | parentId | annot1 | annot2 | annot3 | annot4 | annot5 | annot6 |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| /path/file1.txt | syn1243 | bar | 3.1415 | "aaaa, bbbb" | "[14,27,30]" | "Annotation, with a comma" | "True" |
+| /path/file2.txt | syn12433 | baz | 2.71 | value_1 | "[1,2,3]" | string without commas | "[True,False]" |
+| /path/file3.txt | syn12455 | zzz | 3.52 | value_3 | "[42,56,77]" | a_single_string | |
#### Multiple values of annotations per key
diff --git a/docs/tutorials/python/download_data_in_bulk.md b/docs/tutorials/python/download_data_in_bulk.md
index 04e27ec92..3534e141a 100644
--- a/docs/tutorials/python/download_data_in_bulk.md
+++ b/docs/tutorials/python/download_data_in_bulk.md
@@ -28,6 +28,7 @@ With a project that has this example layout:
In this tutorial you will:
1. Download all files/folder from a project
+1. Control manifest CSV generation during download
1. Download all files/folders for a specific folder within the project
1. Loop over all files/folders on the project/folder object instances
@@ -44,48 +45,75 @@ another desired directory exists.
#### First let's set up some constants we'll use in this script
```python
-{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=5-19}
+--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:setup"
```
#### Next we'll create an instance of the Project we are going to sync
```python
-{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=20-22}
+--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:get_project"
```
#### Finally we'll sync the project from synapse to your local machine
```python
-{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=23-28}
+--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:sync_project"
```
While syncing your project you'll see results like:
```
-Syncing Project (syn53185532:My uniquely named project about Alzheimer's Disease) from Synapse.
-Syncing Folder (syn53205630:experiment_notes) from Synapse.
-Syncing Folder (syn53205632:notes_2022) from Synapse.
-Syncing Folder (syn53205629:single_cell_RNAseq_batch_1) from Synapse.
-Syncing Folder (syn53205656:single_cell_RNAseq_batch_2) from Synapse.
-Syncing Folder (syn53205631:notes_2023) from Synapse.
-Downloading [####################]100.00% 4.0bytes/4.0bytes (1.8kB/s) fileA.txt Done...
-Downloading [####################]100.00% 3.0bytes/3.0bytes (1.1kB/s) SRR92345678_R1.fastq.gz Done...
-Downloading [####################]100.00% 4.0bytes/4.0bytes (1.7kB/s) SRR12345678_R1.fastq.gz Done...
-Downloading [####################]100.00% 4.0bytes/4.0bytes (1.9kB/s) fileC.txt Done...
-Downloading [####################]100.00% 4.0bytes/4.0bytes (2.7kB/s) fileB.txt Done...
-Downloading [####################]100.00% 4.0bytes/4.0bytes (2.7kB/s) SRR12345678_R2.fastq.gz Done...
-Downloading [####################]100.00% 4.0bytes/4.0bytes (2.6kB/s) SRR12345678_R2.fastq.gz Done...
-Downloading [####################]100.00% 4.0bytes/4.0bytes (1.8kB/s) SRR12345678_R1.fastq.gz Done...
-Downloading [####################]100.00% 3.0bytes/3.0bytes (1.5kB/s) SRR92345678_R2.fastq.gz Done...
-Downloading [####################]100.00% 4.0bytes/4.0bytes (1.6kB/s) fileD.txt Done...
-['single_cell_RNAseq_batch_2', 'single_cell_RNAseq_batch_1', 'experiment_notes']
+[syn74583648:My uniquely named project about Alzheimer's Disease]: Syncing Project from Synapse.
+[syn74584000:biospecimen_experiment_1]: Syncing Folder from Synapse.
+[syn74584007:single_cell_RNAseq_batch_2]: Syncing Folder from Synapse.
+[syn74584001:biospecimen_experiment_2]: Syncing Folder from Synapse.
+[syn74584006:single_cell_RNAseq_batch_1]: Syncing Folder from Synapse.
+[syn74584146]: Downloaded to /biospecimen_experiment_1/fileB.png
+[syn74584154]: Downloaded to /biospecimen_experiment_2/fileD.png
+[syn74584155]: Downloaded to /biospecimen_experiment_2/fileC.png
+[syn74584188]: Downloaded to /single_cell_RNAseq_batch_1/SRR12345678_R1.fastq.png
+[syn74584147]: Downloaded to /biospecimen_experiment_1/fileA.png
+[syn74584206]: Downloaded to /single_cell_RNAseq_batch_2/SRR12345678_R1.fastq.png
+[syn74584189]: Downloaded to /single_cell_RNAseq_batch_1/SRR12345678_R2.fastq.png
+[syn74584207]: Downloaded to /single_cell_RNAseq_batch_2/SRR12345678_R2.fastq.png
+Downloading files: 100%|████████████████████| 1.31M/1.31M [00:02<00:00, 606kB/s]
+Project(id='syn74583648', name="My uniquely named project about Alzheimer's Disease", files=[], folders=[
+ Folder(id='syn74584000', name='biospecimen_experiment_1', parent_id='syn74583648', files=[
+ File(id='syn74584147', name='fileA.png', path='/biospecimen_experiment_1/fileA.png', parent_id='syn74584000', ...),
+ File(id='syn74584146', name='fileB.png', path='/biospecimen_experiment_1/fileB.png', parent_id='syn74584000', ...)
+ ], folders=[], ...),
+ Folder(id='syn74584001', name='biospecimen_experiment_2', parent_id='syn74583648', files=[
+ File(id='syn74584155', name='fileC.png', path='/biospecimen_experiment_2/fileC.png', parent_id='syn74584001', ...),
+ File(id='syn74584154', name='fileD.png', path='/biospecimen_experiment_2/fileD.png', parent_id='syn74584001', ...)
+ ], folders=[], ...),
+ Folder(id='syn74584006', name='single_cell_RNAseq_batch_1', parent_id='syn74583648', files=[
+ File(id='syn74584188', name='SRR12345678_R1.fastq.png', path='/single_cell_RNAseq_batch_1/SRR12345678_R1.fastq.png', parent_id='syn74584006', ...),
+ File(id='syn74584189', name='SRR12345678_R2.fastq.png', path='/single_cell_RNAseq_batch_1/SRR12345678_R2.fastq.png', parent_id='syn74584006', ...)
+ ], folders=[], ...),
+ Folder(id='syn74584007', name='single_cell_RNAseq_batch_2', parent_id='syn74583648', files=[
+ File(id='syn74584206', name='SRR12345678_R1.fastq.png', path='/single_cell_RNAseq_batch_2/SRR12345678_R1.fastq.png', parent_id='syn74584007', ...),
+ File(id='syn74584207', name='SRR12345678_R2.fastq.png', path='/single_cell_RNAseq_batch_2/SRR12345678_R2.fastq.png', parent_id='syn74584007', ...)
+ ], folders=[], ...)
+], ...)
```
-## 2. Download all files/folders for a specific folder within the project
+## 2. Control manifest CSV generation during download
+
+By default (`manifest="all"`), `sync_from_synapse` writes a `manifest.csv` into every
+synced directory. The `manifest.csv` is interoperable with `sync_to_synapse`, the Synapse UI download cart, and `download_list_files`.
+
+Use `manifest="root"` to write a single manifest at the root path, or
+`manifest="suppress"` to skip manifest generation entirely.
+
+```python
+--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:sync_project_with_root_manifest"
+```
+
+## 3. Download all files/folders for a specific folder within the project
Following the same set of steps let's sync a specific folder
```python
-{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=30-36}
+--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:sync_folder"
```
@@ -105,12 +133,12 @@ download the content again. If you were to use an `if_collision` of `"overwrite.
you would see that when the content on your machine does not match Synapse the file
will be overwritten.
-## 3. Loop over all files/folders on the project/folder object instances
+## 4. Loop over all files/folders on the project/folder object instances
Using `sync_from_synapse` will load into memory the state of all Folders and Files
retrieved from Synapse. This will allow you to loop over the contents of your container.
```python
-{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=37-47}
+--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:loop_over_project_folder"
```
diff --git a/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py b/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py
index 0ca946f13..1757c1037 100644
--- a/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py
+++ b/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py
@@ -2,6 +2,7 @@
Here is where you'll find the code for the downloading data in bulk tutorial.
"""
+# --8<-- [start:setup]
import os
import synapseclient
@@ -16,32 +17,55 @@
DIRECTORY_TO_SYNC_FOLDER_TO = os.path.join(
DIRECTORY_TO_SYNC_PROJECT_TO, FOLDER_NAME_TO_SYNC
)
+# --8<-- [end:setup]
-# Step 1: Create an instance of the container I want to sync the data from and sync
-project = Project(name="My uniquely named project about Alzheimer's Disease")
-# We'll set the `if_collision` to `keep.local` so that we don't overwrite any files
+# Step 1: Get an instance of the container I want to sync the data from and sync
+# --8<-- [start:get_project]
+project = Project(name="My uniquely named project about Alzheimer's Disease").get()
+# --8<-- [end:get_project]
+
+# By default, sync_from_synapse generates a manifest.csv in each synced directory.
+# The manifest.csv is interoperable with sync_to_synapse, the Synapse
+# UI download cart, and `download_list_files`.
+# --8<-- [start:sync_project]
+# We'll set the `if_collision` to `keep.local` so that we don't overwrite any files.
project.sync_from_synapse(path=DIRECTORY_TO_SYNC_PROJECT_TO, if_collision="keep.local")
# Print out the contents of the directory where the data was synced to
# Explore the directory to see the contents have been recursively synced.
print(os.listdir(DIRECTORY_TO_SYNC_PROJECT_TO))
+# --8<-- [end:sync_project]
+# Or, use `manifest="root"` to generate a single manifest.csv at the root directory
+# instead of one in each sub-directory. Use `manifest="suppress"` to skip
+# manifest generation entirely.
+
+# --8<-- [start:sync_project_with_root_manifest]
+project.sync_from_synapse(
+ path=DIRECTORY_TO_SYNC_PROJECT_TO,
+ if_collision="keep.local",
+ manifest="root",
+)
+print(os.listdir(DIRECTORY_TO_SYNC_PROJECT_TO))
+# --8<-- [end:sync_project_with_root_manifest]
-# Step 2: The same as step 1, but for a single folder
+# Step 3: The same as step 1, but for a single folder
+# --8<-- [start:sync_folder]
folder = Folder(name=FOLDER_NAME_TO_SYNC, parent_id=project.id)
folder.sync_from_synapse(path=DIRECTORY_TO_SYNC_FOLDER_TO, if_collision="keep.local")
print(os.listdir(os.path.expanduser(DIRECTORY_TO_SYNC_FOLDER_TO)))
+# --8<-- [end:sync_folder]
-# Step 3: Loop over all files/folders on the project/folder object instances
+# Step 4: Loop over all files/folders on the project/folder object instances
+# --8<-- [start:loop_over_project_folder]
for folder_at_root in project.folders:
print(f"Folder at root: {folder_at_root.name}")
-
for file_in_root_folder in folder_at_root.files:
print(f"File in {folder_at_root.name}: {file_in_root_folder.name}")
-
for folder_in_folder in folder_at_root.folders:
print(f"Folder in {folder_at_root.name}: {folder_in_folder.name}")
for file_in_folder in folder_in_folder.files:
print(f"File in {folder_in_folder.name}: {file_in_folder.name}")
+# --8<-- [end:loop_over_project_folder]
diff --git a/mkdocs.yml b/mkdocs.yml
index f61d9ec6e..1638308ea 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -164,7 +164,7 @@ nav:
- Domain Models of Synapse: explanations/domain_models_of_synapse.md
- Access Control: explanations/access_control.md
- Properties vs Annotations: explanations/properties_vs_annotations.md
- - Manifest TSV: explanations/manifest_tsv.md
+ - Manifest CSV: explanations/manifest_csv.md
- Benchmarking: explanations/benchmarking.md
- Structuring Your Project: explanations/structuring_your_project.md
- Asyncio Changes in Python 3.14: explanations/asyncio_in_python_3_14.md
diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py
index c19ee7ed0..c0ef3c82c 100644
--- a/synapseclient/models/mixins/storable_container.py
+++ b/synapseclient/models/mixins/storable_container.py
@@ -8,6 +8,7 @@
Dict,
Generator,
List,
+ Literal,
NoReturn,
Optional,
Tuple,
@@ -48,6 +49,7 @@
StorableContainerSynchronousProtocol,
)
from synapseclient.models.services.manifest import (
+ generate_manifest_csv,
read_manifest_for_upload,
upload_sync_files,
)
@@ -167,6 +169,7 @@ async def sync_from_synapse_async(
link_hops: int = 1,
queue: asyncio.Queue = None,
include_types: Optional[List[str]] = None,
+ manifest: Literal["all", "suppress", "root"] = "all",
*,
synapse_client: Optional[Synapse] = None,
) -> Self:
@@ -178,9 +181,10 @@ async def sync_from_synapse_async(
If you only want to retrieve the full tree of metadata about your
container specify `download_file` as False.
- This works similar to [synapseutils.syncFromSynapse][], however, this does not
- currently support the writing of data to a manifest TSV file. This will be a
- future enhancement.
+ This works similar to [synapseutils.syncFromSynapse][], and generates a
+ `manifest.csv` file in each synced directory. The manifest uses CSV format
+ with `parentId` and `ID` columns, interoperable with the Synapse UI download
+ cart and `synapse get-download-list` CLI output.
Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets,
DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The
@@ -216,6 +220,11 @@ async def sync_from_synapse_async(
`["folder", "file", "table", "entityview", "dockerrepo",
"submissionview", "dataset", "datasetcollection", "materializedview",
"virtualtable"]`.
+ manifest: Determines whether to generate a manifest CSV file. Options are:
+
+ - `all` (default): generate `manifest.csv` in every synced directory
+ - `root`: generate `manifest.csv` only in the root `path` directory
+ - `suppress`: do not generate any manifest file
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
@@ -320,8 +329,38 @@ async def my_function():
asyncio.run(my_function())
```
+ Suppose I want to download all the children of a Project and all sub-folders and files and generate a manifest file:
+
+ ```python
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Project
+
+ async def my_function():
+ syn = Synapse()
+ syn.login()
+
+ my_project = Project(id="syn12345")
+ await my_project.sync_from_synapse_async(path="/path/to/folder", manifest="all")
+
+ asyncio.run(my_function())
+ ```
+ Suppose I want to download a manifest file at the root path:
+
+ ```python
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Project
+
+ async def my_function():
+ syn = Synapse()
+ syn.login()
+ my_project = Project(id="syn12345")
+ await my_project.sync_from_synapse_async(path="/path/to/folder", manifest="root", download_file=False)
+ asyncio.run(my_function())
+ ```
Raises:
ValueError: If the folder does not have an id set.
@@ -383,6 +422,18 @@ async def my_function():
end
end
+ opt manifest != "suppress" and path is set
+ alt manifest == "all"
+ loop For each directory path
+ sync_from_synapse->>manifest: call `generate_manifest_csv(files, dir_path)`
+ manifest-->>sync_from_synapse: manifest.csv written to dir_path
+ end
+ else manifest == "root"
+ sync_from_synapse->>manifest: call `generate_manifest_csv(all_files, root_path)`
+ manifest-->>sync_from_synapse: manifest.csv written to root_path
+ end
+ end
+
deactivate sync_from_synapse
deactivate project_or_folder
```
@@ -405,6 +456,7 @@ async def my_function():
link_hops=link_hops,
queue=queue,
include_types=include_types,
+ manifest=manifest,
synapse_client=syn,
)
@@ -420,6 +472,7 @@ async def _sync_from_synapse_async(
link_hops: int = 1,
queue: asyncio.Queue = None,
include_types: Optional[List[str]] = None,
+ manifest: Literal["all", "suppress", "root"] = "all",
*,
synapse_client: Optional[Synapse] = None,
) -> Self:
@@ -428,12 +481,18 @@ async def _sync_from_synapse_async(
All arguments are passed through from the wrapper function.
"""
+ if manifest not in ("all", "root", "suppress"):
+ raise ValueError(
+ f"Invalid manifest value: {manifest}. Must be one of: 'all', 'root', 'suppress'."
+ )
+
syn = Synapse.get_client(synapse_client=synapse_client)
if not self._last_persistent_instance:
await self.get_async(synapse_client=syn)
syn.logger.info(
f"[{self.id}:{self.name}]: Syncing {self.__class__.__name__} from Synapse."
)
+
path = os.path.expanduser(path) if path else None
children = await self._retrieve_children(
@@ -496,12 +555,31 @@ async def _sync_from_synapse_async(
if create_workers:
try:
- # Wait until the queue is fully processed.
+ # Blocks until every queued item has been picked up and
+ # task_done() called by a worker.
await queue.join()
finally:
+ # Workers are now blocked waiting on an empty queue; cancel
+ # them so they don't hang the event loop.
for task in worker_tasks:
task.cancel()
+ if path and manifest != "suppress":
+ if manifest == "all":
+ for (
+ directory_path,
+ file_entities,
+ ) in self.map_directory_to_all_contained_files(root_path=path).items():
+ generate_manifest_csv(
+ all_files=file_entities,
+ path=directory_path,
+ )
+ elif manifest == "root":
+ generate_manifest_csv(
+ all_files=self.flatten_file_list(),
+ path=path,
+ )
+
return self
@otel_trace_method(
@@ -1232,6 +1310,7 @@ async def _wrap_recursive_get_children(
synapse_client=synapse_client,
queue=queue,
include_types=include_types,
+            manifest="suppress", # The manifest is suppressed for child folders because they're already accounted for when iterating through their parent folder. This is handled in the map_directory_to_all_contained_files function, which returns all files in the directory, including those in its child directories.
)
def _create_task_for_child(
diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py
index cfbdf5f1d..413b3dfd0 100644
--- a/synapseclient/models/protocols/storable_container_protocol.py
+++ b/synapseclient/models/protocols/storable_container_protocol.py
@@ -2,7 +2,7 @@
generated at runtime."""
import asyncio
-from typing import TYPE_CHECKING, List, Optional, Protocol
+from typing import TYPE_CHECKING, List, Literal, Optional, Protocol
from typing_extensions import Self
@@ -35,6 +35,7 @@ def sync_from_synapse(
link_hops: int = 1,
queue: asyncio.Queue = None,
include_types: Optional[List[str]] = None,
+ manifest: Literal["all", "root", "suppress"] = "all",
*,
synapse_client: Optional[Synapse] = None,
) -> Self:
@@ -46,9 +47,10 @@ def sync_from_synapse(
If you only want to retrieve the full tree of metadata about your
container specify `download_file` as False.
- This works similar to [synapseutils.syncFromSynapse][], however, this does not
- currently support the writing of data to a manifest TSV file. This will be a
- future enhancement.
+ This works similar to [synapseutils.syncFromSynapse][], and generates a
+ `manifest.csv` file in each synced directory. The manifest uses CSV format
+ with `parentId` and `ID` columns, interoperable with the Synapse UI download
+ cart and `synapse get-download-list` CLI output.
Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets,
DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The
@@ -80,6 +82,11 @@ def sync_from_synapse(
include_types: Must be a list of entity types (ie. ["folder","file"]) which
can be found
[here](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/EntityType.html)
+ manifest: Determines whether to generate a manifest CSV file. Options are:
+
+ - `all` (default): generate `manifest.csv` in every synced directory
+ - `root`: generate `manifest.csv` only in the root `path` directory
+ - `suppress`: do not generate any manifest file
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py
index 0a9547ee3..e5a87beaa 100644
--- a/synapseclient/models/services/manifest.py
+++ b/synapseclient/models/services/manifest.py
@@ -1,16 +1,23 @@
-"""Services for reading a Synapse manifest CSV file and preparing it for upload."""
+"""Services for reading and writing Synapse manifest CSV files.
+
+This includes reading a manifest CSV file and preparing it for upload, as well
+as writing a manifest CSV file from a list of File entities.
+"""
from __future__ import annotations
import ast
import asyncio
+import csv
import datetime
+import io
import os
import re
from dataclasses import dataclass
-from typing import TYPE_CHECKING, Iterable, NamedTuple
+from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Union
from synapseclient import Synapse
+from synapseclient.core import utils
from synapseclient.core.exceptions import (
SynapseFileNotFoundError,
SynapseHTTPError,
@@ -34,6 +41,20 @@
from synapseclient.models import UsedEntity, UsedURL
from synapseclient.models.file import File
+
+MANIFEST_CSV_FILENAME = "manifest.csv"
+DEFAULT_GENERATED_MANIFEST_CSV_KEYS = [
+ "path",
+ "parentId",
+ "name",
+ "ID",
+ "synapseStore",
+ "contentType",
+ "used",
+ "executed",
+ "activityName",
+ "activityDescription",
+]
#: Scalar types that Synapse supports as annotation values.
SynapseAnnotationType = datetime.datetime | float | int | bool | str
@@ -77,6 +98,232 @@
_FILE_NAME_PATTERN = re.compile(r"^[`\w \-\+\.\(\)]{1,256}$")
+def _manifest_csv_filename(path: str) -> str:
+ return os.path.join(os.path.expanduser(path), MANIFEST_CSV_FILENAME)
+
+
+def _get_entity_provenance_dict_for_manifest(entity: File) -> dict[str, str]:
+ """
+ Gets the provenance metadata for the entity.
+
+ Arguments:
+ entity: A File entity object
+
+ Returns:
+ dict[str, str]: a dictionary with a subset of the provenance metadata for the entity.
+ An empty dictionary is returned if the metadata does not have a provenance record.
+ """
+ if not entity.activity:
+ return {}
+ used = [a.format_for_manifest() for a in entity.activity.used]
+ executed = [a.format_for_manifest() for a in entity.activity.executed]
+ return {
+ "used": ";".join(used),
+ "executed": ";".join(executed),
+ "activityName": entity.activity.name or "",
+ "activityDescription": entity.activity.description or "",
+ }
+
+
+def _convert_manifest_data_items_to_string_list(
+ items: list[Union[str, datetime.datetime, bool, int, float]],
+) -> Union[str, list[str]]:
+ """
+ Handle coverting an individual key that contains a possible list of data into a
+ list of strings or objects that can be written to the manifest file.
+
+ This has specific logic around how to handle datetime fields.
+
+ When working with datetime fields we are printing the ISO 8601 UTC representation of
+ the datetime.
+
+ When working with non strings we are printing the non-quoted version of the object.
+
+ Example: Single-element lists are unwrapped
+ A list with one item returns the item directly, not wrapped in brackets.
+ ```python
+ _convert_manifest_data_items_to_string_list(["string,with,commas"]) # 'string,with,commas'
+ _convert_manifest_data_items_to_string_list([True]) # 'True'
+ _convert_manifest_data_items_to_string_list([1]) # '1'
+ _convert_manifest_data_items_to_string_list([1.0]) # '1.0'
+ _convert_manifest_data_items_to_string_list(
+ [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)]
+ ) # '2020-01-01T00:00:00Z'
+ ```
+
+ Example: Multi-element lists are bracket-wrapped
+ Multiple items are joined with commas inside `[...]`. String items containing
+ commas are individually quoted.
+ ```python
+ _convert_manifest_data_items_to_string_list(["a", "b", "c"])
+ # '[a,b,c]'
+ _convert_manifest_data_items_to_string_list([True, False])
+ # '[True,False]'
+ _convert_manifest_data_items_to_string_list(["string,with,commas", "string without commas"])
+ # '["string,with,commas",string without commas]'
+ _convert_manifest_data_items_to_string_list(
+ [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc),
+ datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc)]
+ )
+ # '[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]'
+ ```
+
+ Args:
+ items: The list of items to convert.
+
+ Returns:
+ The list of items converted to strings.
+ """
+ items_to_write = []
+ for item in items:
+ if isinstance(item, datetime.datetime):
+ items_to_write.append(
+ utils.datetime_to_iso(dt=item, include_milliseconds_if_zero=False)
+ )
+ else:
+ # If a string based annotation has a comma in it
+ # this will wrap the string in quotes so it won't be parsed
+ # as multiple values. For example this is an annotation with 2 values:
+ # [my first annotation, "my, second, annotation"]
+ # This is an annotation with 4 value:
+ # [my first annotation, my, second, annotation]
+ if isinstance(item, str):
+ if len(items) > 1 and "," in item:
+ items_to_write.append(f'"{item}"')
+ else:
+ items_to_write.append(item)
+ else:
+ items_to_write.append(repr(item))
+
+ if len(items_to_write) > 1:
+ return f'[{",".join(items_to_write)}]'
+ elif len(items_to_write) == 1:
+ return items_to_write[0]
+ else:
+ return ""
+
+
+def _extract_entity_metadata_for_manifest_csv(
+ all_files: list[File],
+) -> tuple[list[str], list[dict[str, Any]]]:
+ """Extracts metadata from a list of File entities into a form usable by csv.DictWriter.
+
+ Builds the column header list starting from DEFAULT_GENERATED_MANIFEST_CSV_KEYS, then
+ appends any annotation keys discovered across all files. Each row dict contains the
+ standard fields plus annotation values (serialized via
+ _convert_manifest_data_items_to_string_list) and provenance fields from
+ _get_entity_provenance_dict_for_manifest.
+
+ Arguments:
+ all_files: A list of File model objects to extract metadata from.
+
+ Returns:
+ A tuple of (keys, data) where keys is the ordered list of column headers and
+ data is a list of row dicts, one per file.
+ """
+ keys = list(DEFAULT_GENERATED_MANIFEST_CSV_KEYS)
+ annotation_keys: set = set()
+ data = []
+ for entity in all_files:
+ row: dict = {
+ "path": entity.path,
+ "parentId": entity.parent_id,
+ "name": entity.name,
+ "ID": entity.id,
+ "synapseStore": entity.synapse_store,
+ "contentType": entity.content_type,
+ }
+ if entity.annotations:
+ for key, val in entity.annotations.items():
+ annotation_keys.add(key)
+ row[key] = (
+ _convert_manifest_data_items_to_string_list(val)
+ if isinstance(val, list)
+ else val
+ )
+ row.update(_get_entity_provenance_dict_for_manifest(entity=entity))
+ data.append(row)
+ keys.extend(annotation_keys)
+ return keys, data
+
+
+def _convert_manifest_data_row_to_dict(row: dict, keys: list[str]) -> dict:
+ """
+ Convert a row of data to a dict that can be written to a manifest file.
+
+ Args:
+ row: The row of data to convert.
+ keys: The keys of the manifest. Used to select the rows of data.
+
+ Returns:
+ The dict representation of the row.
+ """
+ data_to_write = {}
+ for key in keys:
+ data_for_key = row.get(key, "")
+ if isinstance(data_for_key, list):
+ items_to_write = _convert_manifest_data_items_to_string_list(data_for_key)
+ data_to_write[key] = items_to_write
+ else:
+ data_to_write[key] = data_for_key
+ return data_to_write
+
+
+def _write_manifest_data_csv(path: str, keys: list[str], data: list[dict]) -> None:
+ """Writes manifest data to a CSV file using csv.DictWriter with QUOTE_MINIMAL that automatically quotes any cell containing a comma, newline, or the quote character.
+
+ Each row dict is normalized via _convert_manifest_data_row_to_dict so that
+ list-valued annotation fields are serialized to strings before writing. Missing
+ fields default to an empty string; extra keys not in fieldnames are silently ignored.
+
+ Arguments:
+ path: Absolute path of the CSV file to create or overwrite.
+ keys: Ordered list of column headers used as DictWriter fieldnames.
+ data: List of row dicts, one per file. Keys absent from a row are written as
+ empty strings; keys not in fieldnames are ignored.
+ """
+ with io.open(path, "w", encoding="utf8", newline="") as fp:
+ writer = csv.DictWriter(
+ fp,
+ fieldnames=keys,
+ restval="",
+ extrasaction="ignore",
+ quoting=csv.QUOTE_MINIMAL,
+ )
+ writer.writeheader()
+ for row in data:
+ writer.writerow(_convert_manifest_data_row_to_dict(row, keys))
+
+    # Notify the user on stdout that the manifest was written.
+ print(f"Manifest file {path} has been generated.")
+
+
+def generate_manifest_csv(all_files: list[File], path: str) -> None:
+ """Generates a manifest.csv file based on a list of File entities.
+
+ The generated file uses CSV format with comma delimiter and is interoperable
+ with the Synapse UI download cart. Column names follow the new convention:
+ `parentId` (instead of `parent`) and `ID` (instead of `id`).
+ If all_files is empty, a manifest.csv with only the header row will be generated.
+ If path is None, a ValueError will be raised.
+
+ Args:
+ all_files: A list of File model objects.
+ path: The directory path where manifest.csv will be written.
+
+    Raises:
+ ValueError: If path is None.
+ """
+ if not path:
+ raise ValueError(
+ "The path argument is required to generate a manifest.csv file."
+ )
+ if path:
+ filename = _manifest_csv_filename(path=path)
+ keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files)
+ _write_manifest_data_csv(filename, keys, data)
+
+
class UploadSyncFile(NamedTuple):
"""Represents a single file being uploaded.
diff --git a/synapseutils/sync.py b/synapseutils/sync.py
index b7fd34023..47f4a22a1 100644
--- a/synapseutils/sync.py
+++ b/synapseutils/sync.py
@@ -78,6 +78,13 @@
COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)")
+@deprecated(
+ version="4.12.0",
+ reason=(
+ "To be removed in 5.0.0. Use StorableContainer.sync_from_synapse instead, "
+ "which generates a manifest.csv file interoperable with the Synapse UI download cart."
+ ),
+)
def syncFromSynapse(
syn: Synapse,
entity: Union[str, SynapseFile, SynapseProject, SynapseFolder],
diff --git a/tests/integration/synapseclient/models/async/test_folder_async.py b/tests/integration/synapseclient/models/async/test_folder_async.py
index fc6075594..cc04238fb 100644
--- a/tests/integration/synapseclient/models/async/test_folder_async.py
+++ b/tests/integration/synapseclient/models/async/test_folder_async.py
@@ -1,6 +1,9 @@
"""Integration tests for the synapseclient.models.Folder class."""
+import csv
+import datetime
import os
+import tempfile
import uuid
from typing import Callable, List
@@ -10,6 +13,7 @@
from synapseclient.core import utils
from synapseclient.core.exceptions import SynapseHTTPError
from synapseclient.models import (
+ Activity,
Column,
ColumnType,
Dataset,
@@ -25,6 +29,7 @@
ViewTypeMask,
VirtualTable,
)
+from synapseclient.models.activity import UsedURL
DESCRIPTION_FOLDER = "This is an example folder."
DESCRIPTION_FILE = "This is an example file."
@@ -811,3 +816,237 @@ async def test_walk_async_recursive_false(self, project_model: Project) -> None:
assert hasattr(nondirs[0], "name")
assert hasattr(nondirs[0], "id")
assert hasattr(nondirs[0], "type")
+
+
+class TestFolderManifestCSV:
+ """Integration tests for manifest CSV generation during sync_from_synapse_async."""
+
+ BOGUS_URL = "https://example.com"
+
+ @pytest.fixture(autouse=True, scope="function")
+ def init(self, syn: Synapse, schedule_for_cleanup: Callable[..., None]) -> None:
+ self.syn = syn
+ self.schedule_for_cleanup = schedule_for_cleanup
+
+ def create_file_instance(self) -> File:
+ filename = utils.make_bogus_uuid_file()
+ self.schedule_for_cleanup(filename)
+ return File(
+ path=filename,
+ content_type="text/plain",
+ )
+
+ async def test_manifest_all_creates_csv_per_directory(
+ self, project_model: Project
+ ) -> None:
+ # GIVEN a root folder with a file and a nested subfolder with its own file
+ root_folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id)
+ root_folder = await root_folder.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(root_folder.id)
+
+ root_file = self.create_file_instance()
+ root_file.parent_id = root_folder.id
+ root_file = await root_file.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(root_file.id)
+
+ sub_folder = Folder(name=str(uuid.uuid4()), parent_id=root_folder.id)
+ sub_folder = await sub_folder.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(sub_folder.id)
+
+ sub_file = self.create_file_instance()
+ sub_file.parent_id = sub_folder.id
+ sub_file = await sub_file.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(sub_file.id)
+
+ # WHEN I sync the root folder with manifest="all"
+ with tempfile.TemporaryDirectory() as tmpdir:
+ await root_folder.sync_from_synapse_async(
+ path=tmpdir,
+ manifest="all",
+ synapse_client=self.syn,
+ )
+
+ root_manifest = os.path.join(tmpdir, "manifest.csv")
+ sub_manifest = os.path.join(tmpdir, sub_folder.name, "manifest.csv")
+
+ assert os.path.isfile(root_manifest)
+ assert os.path.isfile(sub_manifest)
+
+ with open(root_manifest, newline="", encoding="utf8") as f:
+ reader = csv.DictReader(f)
+ rows = list(reader)
+ assert len(rows) == 2
+ rows_by_id = {row["ID"]: row for row in rows}
+ root_row = rows_by_id[root_file.id]
+ assert root_row["name"] == root_file.name
+ assert root_row["parentId"] == root_folder.id
+ sub_row = rows_by_id[sub_file.id]
+ assert sub_row["name"] == sub_file.name
+ assert sub_row["parentId"] == sub_folder.id
+
+ with open(sub_manifest, newline="", encoding="utf8") as f:
+ reader = csv.DictReader(f)
+ rows = list(reader)
+ assert len(rows) == 1
+ sub_row = rows[0]
+ assert sub_row["name"] == sub_file.name
+ assert sub_row["parentId"] == sub_folder.id
+
+ async def test_manifest_root_creates_csv_only_at_root(
+ self, project_model: Project
+ ) -> None:
+ # GIVEN a root folder with a file and a nested subfolder with its own file
+ root_folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id)
+ root_folder = await root_folder.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(root_folder.id)
+
+ root_file = self.create_file_instance()
+ root_file.parent_id = root_folder.id
+ root_file = await root_file.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(root_file.id)
+
+ sub_folder = Folder(name=str(uuid.uuid4()), parent_id=root_folder.id)
+ sub_folder = await sub_folder.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(sub_folder.id)
+
+ sub_file = self.create_file_instance()
+ sub_file.parent_id = sub_folder.id
+ sub_file = await sub_file.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(sub_file.id)
+
+ # WHEN I sync with manifest="root"
+ with tempfile.TemporaryDirectory() as tmpdir:
+ await root_folder.sync_from_synapse_async(
+ path=tmpdir,
+ manifest="root",
+ synapse_client=self.syn,
+ )
+
+ root_manifest = os.path.join(tmpdir, "manifest.csv")
+ sub_manifest = os.path.join(tmpdir, sub_folder.name, "manifest.csv")
+
+ # THEN manifest.csv exists only at the root
+ assert os.path.isfile(root_manifest)
+ assert not os.path.isfile(sub_manifest)
+ with open(root_manifest, newline="", encoding="utf8") as f:
+ reader = csv.DictReader(f)
+ rows = list(reader)
+ assert len(rows) == 2
+ rows_by_id = {row["ID"]: row for row in rows}
+            root_row = rows_by_id[root_file.id]
+ assert root_row["name"] == root_file.name
+ assert root_row["parentId"] == root_folder.id
+ sub_row = rows_by_id[sub_file.id]
+ assert sub_row["name"] == sub_file.name
+ assert sub_row["parentId"] == sub_folder.id
+
+ async def test_manifest_suppress_creates_no_csv(
+ self, project_model: Project
+ ) -> None:
+ # GIVEN a folder with a file
+ folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id)
+ folder = await folder.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(folder.id)
+
+ f = self.create_file_instance()
+ f.parent_id = folder.id
+ f = await f.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(f.id)
+
+ # WHEN I sync with manifest="suppress"
+ with tempfile.TemporaryDirectory() as tmpdir:
+ await folder.sync_from_synapse_async(
+ path=tmpdir,
+ manifest="suppress",
+ synapse_client=self.syn,
+ )
+
+ # THEN no manifest.csv is created
+ assert not os.path.isfile(os.path.join(tmpdir, "manifest.csv"))
+
+ async def test_manifest_includes_annotations(self, project_model: Project) -> None:
+ # GIVEN a file with mixed-type annotations
+ folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id)
+ folder = await folder.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(folder.id)
+
+ f = self.create_file_instance()
+ f.parent_id = folder.id
+ f.annotations = {
+ "single_str": ["hello"],
+ "multi_str": ["a", "b", "c"],
+ "str_with_comma": ["hello,world", "plain text"],
+ "booleans": [True, False],
+ "integers": [1, 2, 3],
+ "floats": [1.0],
+ "datetimes": [
+ datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)
+ ],
+ }
+ f = await f.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(f.id)
+
+ # WHEN I sync with manifest generation
+ with tempfile.TemporaryDirectory() as tmpdir:
+ await folder.sync_from_synapse_async(
+ path=tmpdir,
+ manifest="root",
+ synapse_client=self.syn,
+ )
+
+ manifest_path = os.path.join(tmpdir, "manifest.csv")
+ assert os.path.isfile(manifest_path)
+
+ with open(manifest_path, newline="", encoding="utf8") as mf:
+ reader = csv.DictReader(mf)
+ rows = list(reader)
+
+ # THEN annotation columns are present and correctly serialized
+ assert len(rows) == 1
+ row = rows[0]
+ assert row["single_str"] == "hello"
+ assert row["multi_str"] == "[a,b,c]"
+ assert row["str_with_comma"] == '["hello,world",plain text]'
+ assert row["booleans"] == "[True,False]"
+ assert row["integers"] == "[1,2,3]"
+ assert row["floats"] == "1.0"
+ assert row["datetimes"] == "2020-01-01T00:00:00Z"
+
+ async def test_manifest_includes_provenance(self, project_model: Project) -> None:
+ # GIVEN a file with activity (provenance)
+ folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id)
+ folder = await folder.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(folder.id)
+
+ f = self.create_file_instance()
+ f.parent_id = folder.id
+ f.activity = Activity(
+ name="my_activity",
+ description="my_description",
+ used=[UsedURL(name="my_source", url=self.BOGUS_URL)],
+ )
+ f = await f.store_async(synapse_client=self.syn)
+ self.schedule_for_cleanup(f.id)
+
+ # WHEN I sync with manifest generation and include_activity=True
+ with tempfile.TemporaryDirectory() as tmpdir:
+ await folder.sync_from_synapse_async(
+ path=tmpdir,
+ manifest="root",
+ include_activity=True,
+ synapse_client=self.syn,
+ )
+
+ manifest_path = os.path.join(tmpdir, "manifest.csv")
+ assert os.path.isfile(manifest_path)
+
+ with open(manifest_path, newline="", encoding="utf8") as mf:
+ reader = csv.DictReader(mf)
+ rows = list(reader)
+
+ # THEN provenance columns are populated
+ assert len(rows) == 1
+ row = rows[0]
+ assert row["activityName"] == "my_activity"
+ assert row["activityDescription"] == "my_description"
+ assert row["used"] == "my_source"
diff --git a/tests/unit/synapseclient/models/async/unit_test_folder_async.py b/tests/unit/synapseclient/models/async/unit_test_folder_async.py
index f465206a2..c296496f9 100644
--- a/tests/unit/synapseclient/models/async/unit_test_folder_async.py
+++ b/tests/unit/synapseclient/models/async/unit_test_folder_async.py
@@ -1,5 +1,6 @@
"""Tests for the Folder class."""
+import os
import uuid
from typing import Dict
from unittest.mock import AsyncMock, MagicMock, patch
@@ -9,7 +10,7 @@
from synapseclient import Folder as Synapse_Folder
from synapseclient import Synapse
from synapseclient.core.constants import concrete_types
-from synapseclient.core.constants.concrete_types import FILE_ENTITY
+from synapseclient.core.constants.concrete_types import FILE_ENTITY, FOLDER_ENTITY
from synapseclient.core.exceptions import SynapseNotFoundError
from synapseclient.models import FailureStrategy, File, Folder
from synapseclient.models.project_setting import ProjectSetting
@@ -824,6 +825,219 @@ async def mock_get_children(*args, **kwargs):
assert result.files[0].id == SYN_456
assert result.files[0].name == "example_file_1"
+ async def test_sync_from_synapse_manifest_all_generates_per_directory(
+ self,
+ ) -> None:
+ SUB_FOLDER_ID = "syn789"
+ SUB_FOLDER_NAME = "sub_folder"
+ FILE_2_ID = "syn012"
+ FILE_2_NAME = "example_file_2"
+
+ # GIVEN a root folder with one file and one subfolder containing one file
+ folder = Folder(id=SYN_123)
+
+ root_children = [
+ {"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"},
+ {"id": SUB_FOLDER_ID, "type": FOLDER_ENTITY, "name": SUB_FOLDER_NAME},
+ ]
+ sub_children = [
+ {"id": FILE_2_ID, "type": FILE_ENTITY, "name": FILE_2_NAME},
+ ]
+ get_children_call_count = 0
+
+ async def mock_get_children(*args, **kwargs):
+ nonlocal get_children_call_count
+ children = root_children if get_children_call_count == 0 else sub_children
+ get_children_call_count += 1
+ for child in children:
+ yield child
+
+ downloaded_file_1 = File(
+ id=SYN_456,
+ name="example_file_1",
+ parent_id=SYN_123,
+ )
+ downloaded_file_2 = File(
+ id=FILE_2_ID,
+ name=FILE_2_NAME,
+ parent_id=SUB_FOLDER_ID,
+ )
+ file_map = {SYN_456: downloaded_file_1, FILE_2_ID: downloaded_file_2}
+
+ async def mock_file_get(self_file, **kwargs):
+ return file_map[self_file.id]
+
+ async def mock_get_entity_bundle(entity_id, *args, **kwargs):
+ if entity_id == SUB_FOLDER_ID:
+ return {
+ "entity": {
+ "concreteType": concrete_types.FOLDER_ENTITY,
+ "id": SUB_FOLDER_ID,
+ "name": SUB_FOLDER_NAME,
+ "parentId": SYN_123,
+ "etag": ETAG,
+ "createdOn": CREATED_ON,
+ "modifiedOn": MODIFIED_ON,
+ "createdBy": CREATED_BY,
+ "modifiedBy": MODIFIED_BY,
+ }
+ }
+ return self.get_example_rest_api_folder_output()
+
+ # WHEN I call sync_from_synapse with manifest="all" and a path
+ with (
+ patch(
+ "synapseclient.models.mixins.storable_container.get_children",
+ side_effect=mock_get_children,
+ ),
+ patch(
+ "synapseclient.api.entity_factory.get_entity_id_bundle2",
+ side_effect=mock_get_entity_bundle,
+ ),
+ patch(
+ "synapseclient.models.file.File.get_async",
+ side_effect=mock_file_get,
+ ),
+ patch(
+ "synapseclient.models.mixins.storable_container.os.path.exists",
+ return_value=True,
+ ),
+ patch(
+ "synapseclient.models.mixins.storable_container.generate_manifest_csv",
+ ) as mock_generate,
+ ):
+ await folder.sync_from_synapse_async(
+ path="/tmp/mydir", manifest="all", synapse_client=self.syn
+ )
+
+ # THEN generate_manifest_csv is called once per directory (root + subfolder)
+ assert mock_generate.call_count == 2
+ calls_by_path = {
+ c.kwargs["path"]: c.kwargs["all_files"]
+ for c in mock_generate.call_args_list
+ }
+ assert any(f.id == SYN_456 for f in calls_by_path["/tmp/mydir"])
+ assert any(
+ f.id == FILE_2_ID
+ for f in calls_by_path[os.path.join("/tmp/mydir", SUB_FOLDER_NAME)]
+ )
+
+ async def test_sync_from_synapse_manifest_root_generates_only_at_root(
+ self,
+ ) -> None:
+ # GIVEN a Folder object with a path
+ folder = Folder(id=SYN_123)
+ children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}]
+
+ async def mock_get_children(*args, **kwargs):
+ for child in children:
+ yield child
+
+ downloaded_file = File(
+ id=SYN_456,
+ name="example_file_1",
+ path="/tmp/mydir/example_file_1.txt",
+ parent_id=SYN_123,
+ )
+
+ # WHEN I call sync_from_synapse with manifest="root" and a path
+ with (
+ patch(
+ "synapseclient.models.mixins.storable_container.get_children",
+ side_effect=mock_get_children,
+ ),
+ patch(
+ "synapseclient.api.entity_factory.get_entity_id_bundle2",
+ new_callable=AsyncMock,
+ return_value=self.get_example_rest_api_folder_output(),
+ ),
+ patch(
+ "synapseclient.models.file.File.get_async",
+ return_value=downloaded_file,
+ ),
+ patch(
+ "synapseclient.models.mixins.storable_container.generate_manifest_csv",
+ ) as mock_generate,
+ ):
+ await folder.sync_from_synapse_async(
+ path="/tmp/mydir", manifest="root", synapse_client=self.syn
+ )
+
+ # THEN generate_manifest_csv should be called exactly once with the root path
+ mock_generate.assert_called_once()
+ assert mock_generate.call_args.kwargs["path"] == "/tmp/mydir"
+ assert mock_generate.call_args.kwargs["all_files"][0].id == SYN_456
+
+ async def test_sync_from_synapse_manifest_suppress_skips_generation(
+ self,
+ ) -> None:
+ # GIVEN a Folder object with a path
+ folder = Folder(id=SYN_123)
+ children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}]
+
+ async def mock_get_children(*args, **kwargs):
+ for child in children:
+ yield child
+
+ # WHEN I call sync_from_synapse with manifest="suppress"
+ with (
+ patch(
+ "synapseclient.models.mixins.storable_container.get_children",
+ side_effect=mock_get_children,
+ ),
+ patch(
+ "synapseclient.api.entity_factory.get_entity_id_bundle2",
+ new_callable=AsyncMock,
+ return_value=self.get_example_rest_api_folder_output(),
+ ),
+ patch(
+ "synapseclient.models.file.File.get_async",
+ return_value=(File(id=SYN_456, name="example_file_1")),
+ ),
+ patch(
+ "synapseclient.models.mixins.storable_container.generate_manifest_csv",
+ ) as mock_generate,
+ ):
+ await folder.sync_from_synapse_async(
+ path="/tmp/mydir", manifest="suppress", synapse_client=self.syn
+ )
+
+ # THEN generate_manifest_csv should never be called
+ mock_generate.assert_not_called()
+
+ async def test_sync_from_synapse_no_manifest_without_path(self) -> None:
+ # GIVEN a Folder with no path specified
+ folder = Folder(id=SYN_123)
+ children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}]
+
+ async def mock_get_children(*args, **kwargs):
+ for child in children:
+ yield child
+
+ # WHEN I call sync_from_synapse with no path (default manifest="all")
+ with (
+ patch(
+ "synapseclient.models.mixins.storable_container.get_children",
+ side_effect=mock_get_children,
+ ),
+ patch(
+ "synapseclient.api.entity_factory.get_entity_id_bundle2",
+ new_callable=AsyncMock,
+ return_value=self.get_example_rest_api_folder_output(),
+ ),
+ patch(
+ "synapseclient.models.file.File.get_async",
+ return_value=(File(id=SYN_456, name="example_file_1")),
+ ),
+ patch(
+ "synapseclient.models.mixins.storable_container.generate_manifest_csv",
+ ) as mock_generate,
+ ):
+ await folder.sync_from_synapse_async(synapse_client=self.syn)
+
+ # THEN generate_manifest_csv should not be called (no path to write to)
+ mock_generate.assert_not_called()
+
class TestStorageLocationMixin:
"""Tests for ProjectSettingsMixin methods on Folder."""
diff --git a/tests/unit/synapseclient/services/unit_test_manifest.py b/tests/unit/synapseclient/services/unit_test_manifest.py
new file mode 100644
index 000000000..e13f8f574
--- /dev/null
+++ b/tests/unit/synapseclient/services/unit_test_manifest.py
@@ -0,0 +1,474 @@
+import csv
+import datetime
+import os
+import tempfile
+
+import pytest
+
+from synapseclient.models import Activity, File
+from synapseclient.models.activity import UsedEntity, UsedURL
+from synapseclient.models.services.manifest import (
+ MANIFEST_CSV_FILENAME,
+ _convert_manifest_data_items_to_string_list,
+ _convert_manifest_data_row_to_dict,
+ _extract_entity_metadata_for_manifest_csv,
+ _get_entity_provenance_dict_for_manifest,
+ _manifest_csv_filename,
+ _write_manifest_data_csv,
+ generate_manifest_csv,
+)
+
+
+class TestManifestCsvFilename:
+ """Tests for the _manifest_csv_filename helper."""
+
+ def test_plain_directory(self) -> None:
+ # GIVEN a plain absolute path
+ # WHEN _manifest_csv_filename is called
+ result = _manifest_csv_filename("/tmp/mydir")
+
+ # THEN it joins the path with the manifest filename
+ assert result == os.path.join("/tmp/mydir", MANIFEST_CSV_FILENAME)
+
+ def test_tilde_is_expanded(self) -> None:
+ # GIVEN a path starting with ~
+ # WHEN _manifest_csv_filename is called
+ result = _manifest_csv_filename("~/mydir")
+
+ # THEN ~ is expanded to the user's home directory
+ assert result == os.path.join(
+ os.path.expanduser("~/mydir"), MANIFEST_CSV_FILENAME
+ )
+ assert "~" not in result
+
+ def test_filename_is_manifest_csv(self) -> None:
+ # GIVEN any directory path
+ # WHEN _manifest_csv_filename is called
+ result = _manifest_csv_filename("/some/path")
+
+ # THEN the basename of the result is MANIFEST_CSV_FILENAME
+ assert os.path.basename(result) == MANIFEST_CSV_FILENAME
+
+
+class TestGenerateManifestCsv:
+ """Tests for the generate_manifest_csv and related helper functions."""
+
+ def _make_file(
+ self,
+ syn_id: str = "syn123",
+ name: str = "file.txt",
+ path: str = "/data/file.txt",
+ parent_id: str = "syn456",
+ content_type: str = "text/plain",
+ synapse_store: bool = True,
+ annotations: dict = None,
+ activity: Activity = None,
+ ) -> File:
+ f = File(
+ id=syn_id,
+ name=name,
+ path=path,
+ parent_id=parent_id,
+ content_type=content_type,
+ synapse_store=synapse_store,
+ )
+ if annotations:
+ f.annotations = annotations
+ if activity:
+ f.activity = activity
+ return f
+
+ def test_extract_entity_metadata_includes_annotations_and_activity(self) -> None:
+ # GIVEN a File entity with provenance
+ activity = Activity(
+ name="My Pipeline",
+ description="Run analysis",
+ used=[UsedEntity(target_id="syn111", target_version_number=1)],
+ executed=[UsedURL(url="https://github.com/example/pipeline")],
+ )
+ f = self._make_file(
+ activity=activity, annotations={"tissue": ["brain"], "count": [42]}
+ )
+
+ # WHEN metadata is extracted
+ keys, data = _extract_entity_metadata_for_manifest_csv([f])
+
+ # THEN provenance keys are present in the column list
+ assert {
+ "used",
+ "executed",
+ "activityName",
+ "activityDescription",
+ "tissue",
+ "count",
+ }.issubset(keys)
+
+ assert data[0]["parentId"] == "syn456"
+ assert data[0]["ID"] == "syn123"
+ assert data[0]["path"] == "/data/file.txt"
+ assert data[0]["name"] == "file.txt"
+ assert data[0]["activityName"] == "My Pipeline"
+ assert data[0]["activityDescription"] == "Run analysis"
+ assert data[0]["used"] == "syn111.1"
+ assert data[0]["executed"] == "https://github.com/example/pipeline"
+ assert data[0]["tissue"] == "brain"
+ assert data[0]["count"] == "42"
+
+ def test_generate_manifest_csv_data_items_are_converted_to_strings(self) -> None:
+ # GIVEN a File with a name containing a comma and mixed-type annotations
+ f = self._make_file(
+ name="a, b, c",
+ path="/data/file.txt",
+ annotations={
+ "single_str": "hello",
+ "multi_str": ["a", "b", "c"],
+ "str_with_comma": ["hello,world", "plain text"],
+ "booleans": [True, False],
+ "integers": [1],
+ "floats": [1.0],
+ "single_dt": [
+ datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
+ ],
+ "multi_dt": [
+ datetime.datetime(
+ 2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc
+ ),
+ datetime.datetime(
+ 2021, 6, 15, 12, 30, 0, tzinfo=datetime.timezone.utc
+ ),
+ ],
+ },
+ )
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ # WHEN generate_manifest_csv is called
+ generate_manifest_csv(all_files=[f], path=tmpdir)
+ manifest_path = os.path.join(tmpdir, "manifest.csv")
+ content = open(manifest_path, encoding="utf8").read()
+ with open(manifest_path, newline="", encoding="utf8") as fp:
+ row = next(csv.DictReader(fp))
+
+ assert '"a, b, c"' in content
+ assert row["single_str"] == "hello"
+ assert row["multi_str"] == "[a,b,c]"
+ assert row["str_with_comma"] == '["hello,world",plain text]'
+ assert row["booleans"] == "[True,False]"
+ assert row["integers"] == "1"
+ assert row["floats"] == "1.0"
+ assert row["single_dt"] == "2020-01-01T00:00:00Z"
+ assert row["multi_dt"] == "[2020-01-01T00:00:00Z,2021-06-15T12:30:00Z]"
+
+ def test_generate_manifest_csv_with_only_header_row(self) -> None:
+ # GIVEN an empty file list
+ with tempfile.TemporaryDirectory() as tmpdir:
+ # WHEN generate_manifest_csv is called with no files
+ generate_manifest_csv(all_files=[], path=tmpdir)
+
+ # THEN the manifest.csv file is created with only the header row and no data rows
+ manifest_path = os.path.join(tmpdir, "manifest.csv")
+ with open(manifest_path, newline="", encoding="utf8") as fp:
+ reader = csv.DictReader(fp)
+ rows = list(reader)
+ assert reader.fieldnames == [
+ "path",
+ "parentId",
+ "name",
+ "ID",
+ "synapseStore",
+ "contentType",
+ "used",
+ "executed",
+ "activityName",
+ "activityDescription",
+ ]
+ assert rows == []
+
+ def test_generate_manifest_csv_with_path_None_raises_ValueError(self) -> None:
+        # GIVEN no usable output path
+        # WHEN generate_manifest_csv is called with path=None
+        # THEN a ValueError is raised before any file I/O is attempted
+        with pytest.raises(
+            ValueError,
+            match="The path argument is required to generate a manifest.csv file.",
+        ):
+            generate_manifest_csv(all_files=[], path=None)
+
+ def test_generate_manifest_csv_quotes_values_with_commas(self) -> None:
+ # GIVEN a File whose name contains a comma
+ f = self._make_file(name="file, extra.txt", path="/tmp/file, extra.txt")
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ generate_manifest_csv(all_files=[f], path=tmpdir)
+ manifest_path = os.path.join(tmpdir, "manifest.csv")
+ content = open(manifest_path, encoding="utf8").read()
+ # THEN the comma-containing value is quoted in the CSV
+ assert '"file, extra.txt"' in content
+
+
+class TestWriteManifestDataCsv:
+ """Tests for the _write_manifest_data_csv helper."""
+
+ def test_writes_header_and_rows(self) -> None:
+ # GIVEN keys and one row of data
+ keys = ["path", "parentId", "name"]
+ data = [{"path": "/data/f.txt", "parentId": "syn1", "name": "f.txt"}]
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ filename = os.path.join(tmpdir, "manifest.csv")
+ # WHEN _write_manifest_data_csv is called
+ _write_manifest_data_csv(filename, keys, data)
+
+ with open(filename, newline="", encoding="utf8") as fp:
+ rows = list(csv.DictReader(fp))
+
+ # THEN header and row values are written correctly
+ assert len(rows) == 1
+ assert rows[0]["path"] == "/data/f.txt"
+ assert rows[0]["parentId"] == "syn1"
+ assert rows[0]["name"] == "f.txt"
+
+ def test_missing_keys_use_empty_string(self) -> None:
+ # GIVEN a row missing the "name" key
+ keys = ["path", "parentId", "name"]
+ data = [{"path": "/data/f.txt", "parentId": "syn1"}]
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ filename = os.path.join(tmpdir, "manifest.csv")
+ _write_manifest_data_csv(filename, keys, data)
+
+ with open(filename, newline="", encoding="utf8") as fp:
+ rows = list(csv.DictReader(fp))
+
+ # THEN the missing field is written as an empty string
+ assert rows[0]["name"] == ""
+
+ def test_extra_keys_in_row_are_ignored(self) -> None:
+ # GIVEN a row with a key not in the fieldnames list
+ keys = ["path", "name"]
+ data = [{"path": "/data/f.txt", "name": "f.txt", "extra": "ignored"}]
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ filename = os.path.join(tmpdir, "manifest.csv")
+ # WHEN _write_manifest_data_csv is called
+ # THEN no exception is raised and only declared keys appear
+ _write_manifest_data_csv(filename, keys, data)
+
+ with open(filename, newline="", encoding="utf8") as fp:
+ reader = csv.DictReader(fp)
+ rows = list(reader)
+ assert "extra" not in reader.fieldnames
+
+ assert rows[0]["path"] == "/data/f.txt"
+
+ def test_values_with_commas_are_quoted(self) -> None:
+ # GIVEN a value that contains a comma
+ keys = ["name", "parentId"]
+ data = [{"name": "file, with comma.txt", "parentId": "syn1"}]
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ filename = os.path.join(tmpdir, "manifest.csv")
+ _write_manifest_data_csv(filename, keys, data)
+ content = open(filename, encoding="utf8").read()
+
+ with open(filename, newline="", encoding="utf8") as fp:
+ rows = list(csv.DictReader(fp))
+
+ # THEN the comma-containing value is quoted in the raw CSV
+ assert '"file, with comma.txt"' in content
+ # AND reads back correctly
+ assert rows[0]["name"] == "file, with comma.txt"
+
+ def test_empty_data_writes_header_only(self) -> None:
+ # GIVEN no data rows
+ keys = ["path", "parentId", "name"]
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ filename = os.path.join(tmpdir, "manifest.csv")
+ _write_manifest_data_csv(filename, keys, [])
+
+ with open(filename, newline="", encoding="utf8") as fp:
+ reader = csv.DictReader(fp)
+ rows = list(reader)
+ header = reader.fieldnames
+
+ # THEN the file exists with only the header
+ assert rows == []
+ assert header == keys
+
+ def test_unicode_values_are_written_correctly(self) -> None:
+ # GIVEN a value with non-ASCII characters
+ keys = ["name", "parentId"]
+ data = [{"name": "données_été.txt", "parentId": "syn1"}]
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ filename = os.path.join(tmpdir, "manifest.csv")
+ _write_manifest_data_csv(filename, keys, data)
+
+ with open(filename, newline="", encoding="utf8") as fp:
+ rows = list(csv.DictReader(fp))
+
+ # THEN unicode characters round-trip correctly
+ assert rows[0]["name"] == "données_été.txt"
+
+ def test_multiple_rows_written_in_order(self) -> None:
+ # GIVEN multiple rows
+ keys = ["name", "parentId"]
+ data = [
+ {"name": "a.txt", "parentId": "syn1"},
+ {"name": "b.txt", "parentId": "syn2"},
+ {"name": "c.txt", "parentId": "syn3"},
+ ]
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ filename = os.path.join(tmpdir, "manifest.csv")
+ _write_manifest_data_csv(filename, keys, data)
+
+ with open(filename, newline="", encoding="utf8") as fp:
+ rows = list(csv.DictReader(fp))
+
+ # THEN all rows are present and in order
+ assert len(rows) == 3
+ assert [r["name"] for r in rows] == ["a.txt", "b.txt", "c.txt"]
+
+
+class TestGetEntityProvenanceDictForManifest:
+ """Tests for _get_entity_provenance_dict_for_manifest."""
+
+ def _make_file_with_activity(self, activity: Activity = None) -> File:
+ f = File(id="syn1", name="f.txt", path="/f.txt", parent_id="syn2")
+ if activity:
+ f.activity = activity
+ return f
+
+ def test_returns_empty_dict_when_no_activity(self) -> None:
+ f = self._make_file_with_activity()
+ result = _get_entity_provenance_dict_for_manifest(f)
+ assert result == {}
+
+ def test_returns_all_provenance_keys_with_activity(self) -> None:
+ activity = Activity(
+ name="Pipeline",
+ description="Runs analysis",
+ used=[UsedEntity(target_id="syn10", target_version_number=2)],
+ executed=[UsedURL(url="https://github.com/example/run")],
+ )
+ f = self._make_file_with_activity(activity)
+
+ result = _get_entity_provenance_dict_for_manifest(f)
+ assert result["used"] == "syn10.2"
+ assert result["executed"] == "https://github.com/example/run"
+ assert result["activityName"] == "Pipeline"
+ assert result["activityDescription"] == "Runs analysis"
+
+ def test_activity_name_and_description_default_to_empty_string(self) -> None:
+ activity = Activity(name=None, description=None)
+ f = self._make_file_with_activity(activity)
+
+ result = _get_entity_provenance_dict_for_manifest(f)
+ assert result["activityName"] == ""
+ assert result["activityDescription"] == ""
+
+ def test_empty_used_and_executed_lists(self) -> None:
+ activity = Activity(name="minimal", used=[], executed=[])
+ f = self._make_file_with_activity(activity)
+
+ result = _get_entity_provenance_dict_for_manifest(f)
+
+ assert result["activityName"] == "minimal"
+ assert result["used"] == ""
+ assert result["executed"] == ""
+
+ def test_multiple_used_and_executed_are_semicolon_joined(self) -> None:
+ # GIVEN an activity with multiple used and executed entries
+ activity = Activity(
+ name="multi",
+ used=[
+ UsedEntity(target_id="syn1", target_version_number=1),
+ UsedEntity(target_id="syn2", target_version_number=3),
+ ],
+ executed=[
+ UsedURL(url="https://github.com/a"),
+ UsedURL(url="https://github.com/b"),
+ ],
+ )
+ f = self._make_file_with_activity(activity)
+
+ result = _get_entity_provenance_dict_for_manifest(f)
+
+ assert result["activityName"] == "multi"
+ assert result["used"] == "syn1.1;syn2.3"
+ assert result["executed"] == "https://github.com/a;https://github.com/b"
+
+
+_UTC = datetime.timezone.utc
+
+
+class TestConvertManifestDataItemsToStringList:
+ """Tests for _convert_manifest_data_items_to_string_list."""
+
+ @pytest.mark.parametrize(
+ "items,expected",
+ [
+ ([], ""),
+ (["hello"], "hello"),
+ # single item with comma is NOT quoted — quoting only applies in multi-item lists
+ (["hello,world"], "hello,world"),
+ (["a", "b", "c"], "[a,b,c]"),
+ (["hello,world", "plain"], '["hello,world",plain]'),
+ ([True], "True"),
+ ([True, False], "[True,False]"),
+ ([42], "42"),
+ ([1, 2, 3], "[1,2,3]"),
+ ([1.5], "1.5"),
+ (
+ [datetime.datetime(2020, 1, 1, tzinfo=_UTC)],
+ "2020-01-01T00:00:00Z",
+ ),
+ (
+ [
+ datetime.datetime(2020, 1, 1, tzinfo=_UTC),
+ datetime.datetime(2021, 6, 15, 12, 30, tzinfo=_UTC),
+ ],
+ "[2020-01-01T00:00:00Z,2021-06-15T12:30:00Z]",
+ ),
+ ],
+ )
+ def test_converts_items(self, items: list, expected: str) -> None:
+ assert _convert_manifest_data_items_to_string_list(items) == expected
+
+
+class TestConvertManifestDataRowToDict:
+ """Tests for _convert_manifest_data_row_to_dict."""
+
+ def test_all_keys_present_passes_through(self) -> None:
+ row = {"path": "/f.txt", "parentId": "syn1", "name": "f.txt"}
+ keys = ["path", "parentId", "name"]
+
+ result = _convert_manifest_data_row_to_dict(row, keys)
+
+ assert result == {"path": "/f.txt", "parentId": "syn1", "name": "f.txt"}
+
+ def test_missing_key_defaults_to_empty_string(self) -> None:
+ row = {"path": "/f.txt", "parentId": "syn1"}
+ keys = ["path", "parentId", "name"]
+
+ result = _convert_manifest_data_row_to_dict(row, keys)
+
+ assert result["name"] == ""
+
+ def test_list_value_converted_to_string(self) -> None:
+ row = {"tags": ["a", "b", "c"]}
+ keys = ["tags"]
+
+ result = _convert_manifest_data_row_to_dict(row, keys)
+
+ assert result["tags"] == "[a,b,c]"
+
+ def test_extra_keys_in_row_are_not_included_in_output(self) -> None:
+ row = {"path": "/f.txt", "extra": "ignored"}
+ keys = ["path"]
+
+ result = _convert_manifest_data_row_to_dict(row, keys)
+
+ assert "extra" not in result
+ assert result == {"path": "/f.txt"}
diff --git a/tests/unit/synapseutils/unit_test_synapseutils_sync.py b/tests/unit/synapseutils/unit_test_synapseutils_sync.py
index a2fc2580b..8187c6802 100644
--- a/tests/unit/synapseutils/unit_test_synapseutils_sync.py
+++ b/tests/unit/synapseutils/unit_test_synapseutils_sync.py
@@ -1916,3 +1916,21 @@ def test_multiple_item(self) -> None:
"baz",
'"foo, bar, baz"',
]
+
+
+class TestSyncFromSynapseDeprecation:
+ """Tests for the deprecation of syncFromSynapse."""
+
+ def test_syncFromSynapse_emits_deprecation_warning(self, syn: Synapse) -> None:
+ # GIVEN the legacy syncFromSynapse function
+ # WHEN it is called
+ # THEN a DeprecationWarning is raised pointing to StorableContainer
+ with pytest.warns(
+ DeprecationWarning, match="StorableContainer.sync_from_synapse"
+ ):
+ with patch.object(
+ sync,
+ "syncFromSynapse_async",
+ return_value=AsyncMock(return_value=[])(),
+ ):
+ sync.syncFromSynapse(syn=syn, entity="syn123")