diff --git a/docs/explanations/manifest_csv.md b/docs/explanations/manifest_csv.md index ae542a800..e46d52f4a 100644 --- a/docs/explanations/manifest_csv.md +++ b/docs/explanations/manifest_csv.md @@ -17,7 +17,7 @@ The format of the manifest file is a comma-separated value (CSV) file with one r | parentId | Synapse ID of parent | syn1235 | !!! note - The legacy TSV manifest used a column named `parent`. The CSV manifest uses `parentId` instead, which is consistent with the Synapse REST API field name. If you are migrating an existing TSV manifest to CSV, rename the `parent` column to `parentId`. + The legacy TSV manifest used the columns `parent` and `id`, while the CSV manifest uses `parentId` and `ID` to align with Synapse REST API field names. If you’re migrating a TSV manifest to CSV, you’ll need to rename `parent` to `parentId` and `id` to `ID`. ### Standard fields @@ -60,11 +60,11 @@ Any columns that are not in the standard or metadata fields described above will Adding annotations to each row: -| path | parentId | annot1 | annot2 | annot3 | annot4 | annot5 | -| --- | --- | --- | --- | --- | --- | --- | -| /path/file1.txt | syn1243 | bar | 3.1415 | "aaaa, bbbb" | "[14,27,30]" | "Annotation, with a comma" | -| /path/file2.txt | syn12433 | baz | 2.71 | value_1 | "[1,2,3]" | test 123 | -| /path/file3.txt | syn12455 | zzz | 3.52 | value_3 | "[42,56,77]" | a single annotation | +| path | parentId | annot1 | annot2 | annot3 | annot4 | annot5 | annot6 | +| --- | --- | --- | --- | --- | --- | --- | --- | +| /path/file1.txt | syn1243 | bar | 3.1415 | "aaaa, bbbb" | "[14,27,30]" | "Annotation, with a comma" | "True" | +| /path/file2.txt | syn12433 | baz | 2.71 | value_1 | "[1,2,3]" | string without commas | "[True,False]" | +| /path/file3.txt | syn12455 | zzz | 3.52 | value_3 | "[42,56,77]" | a_single_string | | #### Multiple values of annotations per key diff --git a/docs/tutorials/python/download_data_in_bulk.md b/docs/tutorials/python/download_data_in_bulk.md index 
04e27ec92..3534e141a 100644 --- a/docs/tutorials/python/download_data_in_bulk.md +++ b/docs/tutorials/python/download_data_in_bulk.md @@ -28,6 +28,7 @@ With a project that has this example layout: In this tutorial you will: 1. Download all files/folder from a project +1. Control manifest CSV generation during download 1. Download all files/folders for a specific folder within the project 1. Loop over all files/folders on the project/folder object instances @@ -44,48 +45,75 @@ another desired directory exists. #### First let's set up some constants we'll use in this script ```python -{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=5-19} +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:setup" ``` #### Next we'll create an instance of the Project we are going to sync ```python -{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=20-22} +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:get_project" ``` #### Finally we'll sync the project from synapse to your local machine ```python -{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=23-28} +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:sync_project" ```
While syncing your project you'll see results like: ``` -Syncing Project (syn53185532:My uniquely named project about Alzheimer's Disease) from Synapse. -Syncing Folder (syn53205630:experiment_notes) from Synapse. -Syncing Folder (syn53205632:notes_2022) from Synapse. -Syncing Folder (syn53205629:single_cell_RNAseq_batch_1) from Synapse. -Syncing Folder (syn53205656:single_cell_RNAseq_batch_2) from Synapse. -Syncing Folder (syn53205631:notes_2023) from Synapse. -Downloading [####################]100.00% 4.0bytes/4.0bytes (1.8kB/s) fileA.txt Done... -Downloading [####################]100.00% 3.0bytes/3.0bytes (1.1kB/s) SRR92345678_R1.fastq.gz Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (1.7kB/s) SRR12345678_R1.fastq.gz Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (1.9kB/s) fileC.txt Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (2.7kB/s) fileB.txt Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (2.7kB/s) SRR12345678_R2.fastq.gz Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (2.6kB/s) SRR12345678_R2.fastq.gz Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (1.8kB/s) SRR12345678_R1.fastq.gz Done... -Downloading [####################]100.00% 3.0bytes/3.0bytes (1.5kB/s) SRR92345678_R2.fastq.gz Done... -Downloading [####################]100.00% 4.0bytes/4.0bytes (1.6kB/s) fileD.txt Done... -['single_cell_RNAseq_batch_2', 'single_cell_RNAseq_batch_1', 'experiment_notes'] +[syn74583648:My uniquely named project about Alzheimer's Disease]: Syncing Project from Synapse. +[syn74584000:biospecimen_experiment_1]: Syncing Folder from Synapse. +[syn74584007:single_cell_RNAseq_batch_2]: Syncing Folder from Synapse. +[syn74584001:biospecimen_experiment_2]: Syncing Folder from Synapse. +[syn74584006:single_cell_RNAseq_batch_1]: Syncing Folder from Synapse. 
+[syn74584146]: Downloaded to /biospecimen_experiment_1/fileB.png +[syn74584154]: Downloaded to /biospecimen_experiment_2/fileD.png +[syn74584155]: Downloaded to /biospecimen_experiment_2/fileC.png +[syn74584188]: Downloaded to /single_cell_RNAseq_batch_1/SRR12345678_R1.fastq.png +[syn74584147]: Downloaded to /biospecimen_experiment_1/fileA.png +[syn74584206]: Downloaded to /single_cell_RNAseq_batch_2/SRR12345678_R1.fastq.png +[syn74584189]: Downloaded to /single_cell_RNAseq_batch_1/SRR12345678_R2.fastq.png +[syn74584207]: Downloaded to /single_cell_RNAseq_batch_2/SRR12345678_R2.fastq.png +Downloading files: 100%|████████████████████| 1.31M/1.31M [00:02<00:00, 606kB/s] +Project(id='syn74583648', name="My uniquely named project about Alzheimer's Disease", files=[], folders=[ + Folder(id='syn74584000', name='biospecimen_experiment_1', parent_id='syn74583648', files=[ + File(id='syn74584147', name='fileA.png', path='/biospecimen_experiment_1/fileA.png', parent_id='syn74584000', ...), + File(id='syn74584146', name='fileB.png', path='/biospecimen_experiment_1/fileB.png', parent_id='syn74584000', ...) + ], folders=[], ...), + Folder(id='syn74584001', name='biospecimen_experiment_2', parent_id='syn74583648', files=[ + File(id='syn74584155', name='fileC.png', path='/biospecimen_experiment_2/fileC.png', parent_id='syn74584001', ...), + File(id='syn74584154', name='fileD.png', path='/biospecimen_experiment_2/fileD.png', parent_id='syn74584001', ...) + ], folders=[], ...), + Folder(id='syn74584006', name='single_cell_RNAseq_batch_1', parent_id='syn74583648', files=[ + File(id='syn74584188', name='SRR12345678_R1.fastq.png', path='/single_cell_RNAseq_batch_1/SRR12345678_R1.fastq.png', parent_id='syn74584006', ...), + File(id='syn74584189', name='SRR12345678_R2.fastq.png', path='/single_cell_RNAseq_batch_1/SRR12345678_R2.fastq.png', parent_id='syn74584006', ...) 
+ ], folders=[], ...), + Folder(id='syn74584007', name='single_cell_RNAseq_batch_2', parent_id='syn74583648', files=[ + File(id='syn74584206', name='SRR12345678_R1.fastq.png', path='/single_cell_RNAseq_batch_2/SRR12345678_R1.fastq.png', parent_id='syn74584007', ...), + File(id='syn74584207', name='SRR12345678_R2.fastq.png', path='/single_cell_RNAseq_batch_2/SRR12345678_R2.fastq.png', parent_id='syn74584007', ...) + ], folders=[], ...) +], ...) ```
-## 2. Download all files/folders for a specific folder within the project +## 2. Control manifest CSV generation during download + +By default (`manifest="all"`), `sync_from_synapse` writes a `manifest.csv` into every +synced directory. The `manifest.csv` is interoperable with `sync_to_synapse`, the Synapse UI download cart, and `download_list_files`. + +Use `manifest="root"` to write a single manifest at the root path, or +`manifest="suppress"` to skip manifest generation entirely. + +```python +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:sync_project_with_root_manifest" +``` + +## 3. Download all files/folders for a specific folder within the project Following the same set of steps let's sync a specific folder ```python -{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=30-36} +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:sync_folder" ```
@@ -105,12 +133,12 @@ download the content again. If you were to use an `if_collision` of `"overwrite. you would see that when the content on your machine does not match Synapse the file will be overwritten. -## 3. Loop over all files/folders on the project/folder object instances +## 4. Loop over all files/folders on the project/folder object instances Using `sync_from_synapse` will load into memory the state of all Folders and Files retrieved from Synapse. This will allow you to loop over the contents of your container. ```python -{!docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py!lines=37-47} +--8<-- "docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py:loop_over_project_folder" ```
diff --git a/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py b/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py index 0ca946f13..1757c1037 100644 --- a/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py +++ b/docs/tutorials/python/tutorial_scripts/download_data_in_bulk.py @@ -2,6 +2,7 @@ Here is where you'll find the code for the downloading data in bulk tutorial. """ +# --8<-- [start:setup] import os import synapseclient @@ -16,32 +17,55 @@ DIRECTORY_TO_SYNC_FOLDER_TO = os.path.join( DIRECTORY_TO_SYNC_PROJECT_TO, FOLDER_NAME_TO_SYNC ) +# --8<-- [end:setup] -# Step 1: Create an instance of the container I want to sync the data from and sync -project = Project(name="My uniquely named project about Alzheimer's Disease") -# We'll set the `if_collision` to `keep.local` so that we don't overwrite any files +# Step 1: Get an instance of the container I want to sync the data from and sync +# --8<-- [start:get_project] +project = Project(name="My uniquely named project about Alzheimer's Disease").get() +# --8<-- [end:get_project] + +# By default, sync_from_synapse generates a manifest.csv in each synced directory. +# The manifest.csv is interoperable with sync_to_synapse, the Synapse +# UI download cart, and `download_list_files`. +# --8<-- [start:sync_project] +# We'll set the `if_collision` to `keep.local` so that we don't overwrite any files. project.sync_from_synapse(path=DIRECTORY_TO_SYNC_PROJECT_TO, if_collision="keep.local") # Print out the contents of the directory where the data was synced to # Explore the directory to see the contents have been recursively synced. print(os.listdir(DIRECTORY_TO_SYNC_PROJECT_TO)) +# --8<-- [end:sync_project] +# Or, use `manifest="root"` to generate a single manifest.csv at the root directory +# instead of one in each sub-directory. Use `manifest="suppress"` to skip +# manifest generation entirely. 
+ +# --8<-- [start:sync_project_with_root_manifest] +project.sync_from_synapse( + path=DIRECTORY_TO_SYNC_PROJECT_TO, + if_collision="keep.local", + manifest="root", +) +print(os.listdir(DIRECTORY_TO_SYNC_PROJECT_TO)) +# --8<-- [end:sync_project_with_root_manifest] -# Step 2: The same as step 1, but for a single folder +# Step 3: The same as step 1, but for a single folder +# --8<-- [start:sync_folder] folder = Folder(name=FOLDER_NAME_TO_SYNC, parent_id=project.id) folder.sync_from_synapse(path=DIRECTORY_TO_SYNC_FOLDER_TO, if_collision="keep.local") print(os.listdir(os.path.expanduser(DIRECTORY_TO_SYNC_FOLDER_TO))) +# --8<-- [end:sync_folder] -# Step 3: Loop over all files/folders on the project/folder object instances +# Step 4: Loop over all files/folders on the project/folder object instances +# --8<-- [start:loop_over_project_folder] for folder_at_root in project.folders: print(f"Folder at root: {folder_at_root.name}") - for file_in_root_folder in folder_at_root.files: print(f"File in {folder_at_root.name}: {file_in_root_folder.name}") - for folder_in_folder in folder_at_root.folders: print(f"Folder in {folder_at_root.name}: {folder_in_folder.name}") for file_in_folder in folder_in_folder.files: print(f"File in {folder_in_folder.name}: {file_in_folder.name}") +# --8<-- [end:loop_over_project_folder] diff --git a/mkdocs.yml b/mkdocs.yml index f61d9ec6e..1638308ea 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -164,7 +164,7 @@ nav: - Domain Models of Synapse: explanations/domain_models_of_synapse.md - Access Control: explanations/access_control.md - Properties vs Annotations: explanations/properties_vs_annotations.md - - Manifest TSV: explanations/manifest_tsv.md + - Manifest CSV: explanations/manifest_csv.md - Benchmarking: explanations/benchmarking.md - Structuring Your Project: explanations/structuring_your_project.md - Asyncio Changes in Python 3.14: explanations/asyncio_in_python_3_14.md diff --git a/synapseclient/models/mixins/storable_container.py 
b/synapseclient/models/mixins/storable_container.py index c19ee7ed0..c0ef3c82c 100644 --- a/synapseclient/models/mixins/storable_container.py +++ b/synapseclient/models/mixins/storable_container.py @@ -8,6 +8,7 @@ Dict, Generator, List, + Literal, NoReturn, Optional, Tuple, @@ -48,6 +49,7 @@ StorableContainerSynchronousProtocol, ) from synapseclient.models.services.manifest import ( + generate_manifest_csv, read_manifest_for_upload, upload_sync_files, ) @@ -167,6 +169,7 @@ async def sync_from_synapse_async( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, + manifest: Literal["all", "suppress", "root"] = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -178,9 +181,10 @@ async def sync_from_synapse_async( If you only want to retrieve the full tree of metadata about your container specify `download_file` as False. - This works similar to [synapseutils.syncFromSynapse][], however, this does not - currently support the writing of data to a manifest TSV file. This will be a - future enhancement. + This works similar to [synapseutils.syncFromSynapse][], and generates a + `manifest.csv` file in each synced directory. The manifest uses CSV format + with `parentId` and `ID` columns, interoperable with the Synapse UI download + cart and `synapse get-download-list` CLI output. Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets, DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The @@ -216,6 +220,11 @@ async def sync_from_synapse_async( `["folder", "file", "table", "entityview", "dockerrepo", "submissionview", "dataset", "datasetcollection", "materializedview", "virtualtable"]`. + manifest: Determines whether to generate a manifest CSV file. 
Options are: + + - `all` (default): generate `manifest.csv` in every synced directory + - `root`: generate `manifest.csv` only in the root `path` directory + - `suppress`: do not generate any manifest file synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor. @@ -320,8 +329,38 @@ async def my_function(): asyncio.run(my_function()) ``` + Suppose I want to download all the children of a Project and all sub-folders and files and generate a manifest file: + + ```python + import asyncio + from synapseclient import Synapse + from synapseclient.models import Project + + async def my_function(): + syn = Synapse() + syn.login() + + my_project = Project(id="syn12345") + await my_project.sync_from_synapse_async(path="/path/to/folder", manifest="all") + + asyncio.run(my_function()) + ``` + Suppose I want to download a manifest file at the root path: + + ```python + import asyncio + from synapseclient import Synapse + from synapseclient.models import Project + + async def my_function(): + syn = Synapse() + syn.login() + my_project = Project(id="syn12345") + await my_project.sync_from_synapse_async(path="/path/to/folder", manifest="root", download_file=False) + asyncio.run(my_function()) + ``` Raises: ValueError: If the folder does not have an id set. 
@@ -383,6 +422,18 @@ async def my_function(): end end + opt manifest != "suppress" and path is set + alt manifest == "all" + loop For each directory path + sync_from_synapse->>manifest: call `generate_manifest_csv(files, dir_path)` + manifest-->>sync_from_synapse: manifest.csv written to dir_path + end + else manifest == "root" + sync_from_synapse->>manifest: call `generate_manifest_csv(all_files, root_path)` + manifest-->>sync_from_synapse: manifest.csv written to root_path + end + end + deactivate sync_from_synapse deactivate project_or_folder ``` @@ -405,6 +456,7 @@ async def my_function(): link_hops=link_hops, queue=queue, include_types=include_types, + manifest=manifest, synapse_client=syn, ) @@ -420,6 +472,7 @@ async def _sync_from_synapse_async( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, + manifest: Literal["all", "suppress", "root"] = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -428,12 +481,18 @@ async def _sync_from_synapse_async( All arguments are passed through from the wrapper function. """ + if manifest not in ("all", "root", "suppress"): + raise ValueError( + f"Invalid manifest value: {manifest}. Must be one of: 'all', 'root', 'suppress'." + ) + syn = Synapse.get_client(synapse_client=synapse_client) if not self._last_persistent_instance: await self.get_async(synapse_client=syn) syn.logger.info( f"[{self.id}:{self.name}]: Syncing {self.__class__.__name__} from Synapse." ) + path = os.path.expanduser(path) if path else None children = await self._retrieve_children( @@ -496,12 +555,31 @@ async def _sync_from_synapse_async( if create_workers: try: - # Wait until the queue is fully processed. + # Blocks until every queued item has been picked up and + # task_done() called by a worker. await queue.join() finally: + # Workers are now blocked waiting on an empty queue; cancel + # them so they don't hang the event loop. 
for task in worker_tasks: task.cancel() + if path and manifest != "suppress": + if manifest == "all": + for ( + directory_path, + file_entities, + ) in self.map_directory_to_all_contained_files(root_path=path).items(): + generate_manifest_csv( + all_files=file_entities, + path=directory_path, + ) + elif manifest == "root": + generate_manifest_csv( + all_files=self.flatten_file_list(), + path=path, + ) + return self @otel_trace_method( @@ -1232,6 +1310,7 @@ async def _wrap_recursive_get_children( synapse_client=synapse_client, queue=queue, include_types=include_types, + manifest="suppress", # The manifest is suppressed for child folders because they’re already accounted for when iterating through their parent folder. This is handled in the map_directory_to_all_contained_files function, which returns all files in the directory, including those in its child directories. ) def _create_task_for_child( diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py index cfbdf5f1d..413b3dfd0 100644 --- a/synapseclient/models/protocols/storable_container_protocol.py +++ b/synapseclient/models/protocols/storable_container_protocol.py @@ -2,7 +2,7 @@ generated at runtime.""" import asyncio -from typing import TYPE_CHECKING, List, Optional, Protocol +from typing import TYPE_CHECKING, List, Literal, Optional, Protocol from typing_extensions import Self @@ -35,6 +35,7 @@ def sync_from_synapse( link_hops: int = 1, queue: asyncio.Queue = None, include_types: Optional[List[str]] = None, + manifest: Literal["all", "root", "suppress"] = "all", *, synapse_client: Optional[Synapse] = None, ) -> Self: @@ -46,9 +47,10 @@ def sync_from_synapse( If you only want to retrieve the full tree of metadata about your container specify `download_file` as False. - This works similar to [synapseutils.syncFromSynapse][], however, this does not - currently support the writing of data to a manifest TSV file. 
This will be a - future enhancement. + This works similar to [synapseutils.syncFromSynapse][], and generates a + `manifest.csv` file in each synced directory. The manifest uses CSV format + with `parentId` and `ID` columns, interoperable with the Synapse UI download + cart and `synapse get-download-list` CLI output. Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets, DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The @@ -80,6 +82,11 @@ def sync_from_synapse( include_types: Must be a list of entity types (ie. ["folder","file"]) which can be found [here](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/EntityType.html) + manifest: Determines whether to generate a manifest CSV file. Options are: + + - `all` (default): generate `manifest.csv` in every synced directory + - `root`: generate `manifest.csv` only in the root `path` directory + - `suppress`: do not generate any manifest file synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor. diff --git a/synapseclient/models/services/manifest.py b/synapseclient/models/services/manifest.py index 0a9547ee3..e5a87beaa 100644 --- a/synapseclient/models/services/manifest.py +++ b/synapseclient/models/services/manifest.py @@ -1,16 +1,23 @@ -"""Services for reading a Synapse manifest CSV file and preparing it for upload.""" +"""Services for reading and writing Synapse manifest CSV files. + +This includes reading a manifest CSV file and preparing it for upload, as well +as writing a manifest CSV file from a list of File entities. 
+""" from __future__ import annotations import ast import asyncio +import csv import datetime +import io import os import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Iterable, NamedTuple +from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Union from synapseclient import Synapse +from synapseclient.core import utils from synapseclient.core.exceptions import ( SynapseFileNotFoundError, SynapseHTTPError, @@ -34,6 +41,20 @@ from synapseclient.models import UsedEntity, UsedURL from synapseclient.models.file import File + +MANIFEST_CSV_FILENAME = "manifest.csv" +DEFAULT_GENERATED_MANIFEST_CSV_KEYS = [ + "path", + "parentId", + "name", + "ID", + "synapseStore", + "contentType", + "used", + "executed", + "activityName", + "activityDescription", +] #: Scalar types that Synapse supports as annotation values. SynapseAnnotationType = datetime.datetime | float | int | bool | str @@ -77,6 +98,232 @@ _FILE_NAME_PATTERN = re.compile(r"^[`\w \-\+\.\(\)]{1,256}$") +def _manifest_csv_filename(path: str) -> str: + return os.path.join(os.path.expanduser(path), MANIFEST_CSV_FILENAME) + + +def _get_entity_provenance_dict_for_manifest(entity: File) -> dict[str, str]: + """ + Gets the provenance metadata for the entity. + + Arguments: + entity: A File entity object + + Returns: + dict[str, str]: a dictionary with a subset of the provenance metadata for the entity. + An empty dictionary is returned if the metadata does not have a provenance record. 
+ """ + if not entity.activity: + return {} + used = [a.format_for_manifest() for a in entity.activity.used] + executed = [a.format_for_manifest() for a in entity.activity.executed] + return { + "used": ";".join(used), + "executed": ";".join(executed), + "activityName": entity.activity.name or "", + "activityDescription": entity.activity.description or "", + } + + +def _convert_manifest_data_items_to_string_list( + items: list[Union[str, datetime.datetime, bool, int, float]], +) -> Union[str, list[str]]: + """ + Handle converting an individual key that contains a possible list of data into a + list of strings or objects that can be written to the manifest file. + + This has specific logic around how to handle datetime fields. + + When working with datetime fields we are printing the ISO 8601 UTC representation of + the datetime. + + When working with non-strings we are printing the non-quoted version of the object. + + Example: Single-element lists are unwrapped + A list with one item returns the item directly, not wrapped in brackets. + ```python + _convert_manifest_data_items_to_string_list(["string,with,commas"]) # 'string,with,commas' + _convert_manifest_data_items_to_string_list([True]) # 'True' + _convert_manifest_data_items_to_string_list([1]) # '1' + _convert_manifest_data_items_to_string_list([1.0]) # '1.0' + _convert_manifest_data_items_to_string_list( + [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)] + ) # '2020-01-01T00:00:00Z' + ``` + + Example: Multi-element lists are bracket-wrapped + Multiple items are joined with commas inside `[...]`. String items containing + commas are individually quoted. 
+ ```python + _convert_manifest_data_items_to_string_list(["a", "b", "c"]) + # '[a,b,c]' + _convert_manifest_data_items_to_string_list([True, False]) + # '[True,False]' + _convert_manifest_data_items_to_string_list(["string,with,commas", "string without commas"]) + # '["string,with,commas",string without commas]' + _convert_manifest_data_items_to_string_list( + [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc), + datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc)] + ) + # '[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]' + ``` + + Args: + items: The list of items to convert. + + Returns: + The list of items converted to strings. + """ + items_to_write = [] + for item in items: + if isinstance(item, datetime.datetime): + items_to_write.append( + utils.datetime_to_iso(dt=item, include_milliseconds_if_zero=False) + ) + else: + # If a string based annotation has a comma in it + # this will wrap the string in quotes so it won't be parsed + # as multiple values. For example this is an annotation with 2 values: + # [my first annotation, "my, second, annotation"] + # This is an annotation with 4 values: + # [my first annotation, my, second, annotation] + if isinstance(item, str): + if len(items) > 1 and "," in item: + items_to_write.append(f'"{item}"') + else: + items_to_write.append(item) + else: + items_to_write.append(repr(item)) + + if len(items_to_write) > 1: + return f'[{",".join(items_to_write)}]' + elif if len(items_to_write) == 1: + return items_to_write[0] + else: + return "" + + +def _extract_entity_metadata_for_manifest_csv( + all_files: list[File], +) -> tuple[list[str], list[dict[str, Any]]]: + """Extracts metadata from a list of File entities into a form usable by csv.DictWriter. + + Builds the column header list starting from DEFAULT_GENERATED_MANIFEST_CSV_KEYS, then + appends any annotation keys discovered across all files. 
Each row dict contains the + standard fields plus annotation values (serialized via + _convert_manifest_data_items_to_string_list) and provenance fields from + _get_entity_provenance_dict_for_manifest. + + Arguments: + all_files: A list of File model objects to extract metadata from. + + Returns: + A tuple of (keys, data) where keys is the ordered list of column headers and + data is a list of row dicts, one per file. + """ + keys = list(DEFAULT_GENERATED_MANIFEST_CSV_KEYS) + annotation_keys: set = set() + data = [] + for entity in all_files: + row: dict = { + "path": entity.path, + "parentId": entity.parent_id, + "name": entity.name, + "ID": entity.id, + "synapseStore": entity.synapse_store, + "contentType": entity.content_type, + } + if entity.annotations: + for key, val in entity.annotations.items(): + annotation_keys.add(key) + row[key] = ( + _convert_manifest_data_items_to_string_list(val) + if isinstance(val, list) + else val + ) + row.update(_get_entity_provenance_dict_for_manifest(entity=entity)) + data.append(row) + keys.extend(annotation_keys) + return keys, data + + +def _convert_manifest_data_row_to_dict(row: dict, keys: list[str]) -> dict: + """ + Convert a row of data to a dict that can be written to a manifest file. + + Args: + row: The row of data to convert. + keys: The keys of the manifest. Used to select the rows of data. + + Returns: + The dict representation of the row. + """ + data_to_write = {} + for key in keys: + data_for_key = row.get(key, "") + if isinstance(data_for_key, list): + items_to_write = _convert_manifest_data_items_to_string_list(data_for_key) + data_to_write[key] = items_to_write + else: + data_to_write[key] = data_for_key + return data_to_write + + +def _write_manifest_data_csv(path: str, keys: list[str], data: list[dict]) -> None: + """Writes manifest data to a CSV file using csv.DictWriter with QUOTE_MINIMAL that automatically quotes any cell containing a comma, newline, or the quote character. 
+ + Each row dict is normalized via _convert_manifest_data_row_to_dict so that + list-valued annotation fields are serialized to strings before writing. Missing + fields default to an empty string; extra keys not in fieldnames are silently ignored. + + Arguments: + path: Absolute path of the CSV file to create or overwrite. + keys: Ordered list of column headers used as DictWriter fieldnames. + data: List of row dicts, one per file. Keys absent from a row are written as + empty strings; keys not in fieldnames are ignored. + """ + with io.open(path, "w", encoding="utf8", newline="") as fp: + writer = csv.DictWriter( + fp, + fieldnames=keys, + restval="", + extrasaction="ignore", + quoting=csv.QUOTE_MINIMAL, + ) + writer.writeheader() + for row in data: + writer.writerow(_convert_manifest_data_row_to_dict(row, keys)) + + # add a log message to the console + print(f"Manifest file {path} has been generated.") + + +def generate_manifest_csv(all_files: list[File], path: str) -> None: + """Generates a manifest.csv file based on a list of File entities. + + The generated file uses CSV format with comma delimiter and is interoperable + with the Synapse UI download cart. Column names follow the new convention: + `parentId` (instead of `parent`) and `ID` (instead of `id`). + If all_files is empty, a manifest.csv with only the header row will be generated. + If path is None, a ValueError will be raised. + + Args: + all_files: A list of File model objects. + path: The directory path where manifest.csv will be written. + + raises: + ValueError: If path is None. + """ + if not path: + raise ValueError( + "The path argument is required to generate a manifest.csv file." + ) + if path: + filename = _manifest_csv_filename(path=path) + keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files) + _write_manifest_data_csv(filename, keys, data) + + class UploadSyncFile(NamedTuple): """Represents a single file being uploaded. 
diff --git a/synapseutils/sync.py b/synapseutils/sync.py index b7fd34023..47f4a22a1 100644 --- a/synapseutils/sync.py +++ b/synapseutils/sync.py @@ -78,6 +78,13 @@ COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)") +@deprecated( + version="4.12.0", + reason=( + "To be removed in 5.0.0. Use StorableContainer.sync_from_synapse instead, " + "which generates a manifest.csv file interoperable with the Synapse UI download cart." + ), +) def syncFromSynapse( syn: Synapse, entity: Union[str, SynapseFile, SynapseProject, SynapseFolder], diff --git a/tests/integration/synapseclient/models/async/test_folder_async.py b/tests/integration/synapseclient/models/async/test_folder_async.py index fc6075594..cc04238fb 100644 --- a/tests/integration/synapseclient/models/async/test_folder_async.py +++ b/tests/integration/synapseclient/models/async/test_folder_async.py @@ -1,6 +1,9 @@ """Integration tests for the synapseclient.models.Folder class.""" +import csv +import datetime import os +import tempfile import uuid from typing import Callable, List @@ -10,6 +13,7 @@ from synapseclient.core import utils from synapseclient.core.exceptions import SynapseHTTPError from synapseclient.models import ( + Activity, Column, ColumnType, Dataset, @@ -25,6 +29,7 @@ ViewTypeMask, VirtualTable, ) +from synapseclient.models.activity import UsedURL DESCRIPTION_FOLDER = "This is an example folder." DESCRIPTION_FILE = "This is an example file." 
@@ -811,3 +816,237 @@ async def test_walk_async_recursive_false(self, project_model: Project) -> None: assert hasattr(nondirs[0], "name") assert hasattr(nondirs[0], "id") assert hasattr(nondirs[0], "type") + + +class TestFolderManifestCSV: + """Integration tests for manifest CSV generation during sync_from_synapse_async.""" + + BOGUS_URL = "https://example.com" + + @pytest.fixture(autouse=True, scope="function") + def init(self, syn: Synapse, schedule_for_cleanup: Callable[..., None]) -> None: + self.syn = syn + self.schedule_for_cleanup = schedule_for_cleanup + + def create_file_instance(self) -> File: + filename = utils.make_bogus_uuid_file() + self.schedule_for_cleanup(filename) + return File( + path=filename, + content_type="text/plain", + ) + + async def test_manifest_all_creates_csv_per_directory( + self, project_model: Project + ) -> None: + # GIVEN a root folder with a file and a nested subfolder with its own file + root_folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + root_folder = await root_folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(root_folder.id) + + root_file = self.create_file_instance() + root_file.parent_id = root_folder.id + root_file = await root_file.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(root_file.id) + + sub_folder = Folder(name=str(uuid.uuid4()), parent_id=root_folder.id) + sub_folder = await sub_folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(sub_folder.id) + + sub_file = self.create_file_instance() + sub_file.parent_id = sub_folder.id + sub_file = await sub_file.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(sub_file.id) + + # WHEN I sync the root folder with manifest="all" + with tempfile.TemporaryDirectory() as tmpdir: + await root_folder.sync_from_synapse_async( + path=tmpdir, + manifest="all", + synapse_client=self.syn, + ) + + root_manifest = os.path.join(tmpdir, "manifest.csv") + sub_manifest = os.path.join(tmpdir, 
sub_folder.name, "manifest.csv") + + assert os.path.isfile(root_manifest) + assert os.path.isfile(sub_manifest) + + with open(root_manifest, newline="", encoding="utf8") as f: + reader = csv.DictReader(f) + rows = list(reader) + assert len(rows) == 2 + rows_by_id = {row["ID"]: row for row in rows} + root_row = rows_by_id[root_file.id] + assert root_row["name"] == root_file.name + assert root_row["parentId"] == root_folder.id + sub_row = rows_by_id[sub_file.id] + assert sub_row["name"] == sub_file.name + assert sub_row["parentId"] == sub_folder.id + + with open(sub_manifest, newline="", encoding="utf8") as f: + reader = csv.DictReader(f) + rows = list(reader) + assert len(rows) == 1 + sub_row = rows[0] + assert sub_row["name"] == sub_file.name + assert sub_row["parentId"] == sub_folder.id + + async def test_manifest_root_creates_csv_only_at_root( + self, project_model: Project + ) -> None: + # GIVEN a root folder with a file and a nested subfolder with its own file + root_folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + root_folder = await root_folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(root_folder.id) + + root_file = self.create_file_instance() + root_file.parent_id = root_folder.id + root_file = await root_file.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(root_file.id) + + sub_folder = Folder(name=str(uuid.uuid4()), parent_id=root_folder.id) + sub_folder = await sub_folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(sub_folder.id) + + sub_file = self.create_file_instance() + sub_file.parent_id = sub_folder.id + sub_file = await sub_file.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(sub_file.id) + + # WHEN I sync with manifest="root" + with tempfile.TemporaryDirectory() as tmpdir: + await root_folder.sync_from_synapse_async( + path=tmpdir, + manifest="root", + synapse_client=self.syn, + ) + + root_manifest = os.path.join(tmpdir, "manifest.csv") + 
sub_manifest = os.path.join(tmpdir, sub_folder.name, "manifest.csv") + + # THEN manifest.csv exists only at the root + assert os.path.isfile(root_manifest) + assert not os.path.isfile(sub_manifest) + with open(root_manifest, newline="", encoding="utf8") as f: + reader = csv.DictReader(f) + rows = list(reader) + assert len(rows) == 2 + rows_by_id = {row["ID"]: row for row in rows} + root_row = rows_by_id[root_file.id] + assert root_row["name"] == root_file.name + assert root_row["parentId"] == root_folder.id + sub_row = rows_by_id[sub_file.id] + assert sub_row["name"] == sub_file.name + assert sub_row["parentId"] == sub_folder.id + + async def test_manifest_suppress_creates_no_csv( + self, project_model: Project + ) -> None: + # GIVEN a folder with a file + folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + folder = await folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(folder.id) + + f = self.create_file_instance() + f.parent_id = folder.id + f = await f.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(f.id) + + # WHEN I sync with manifest="suppress" + with tempfile.TemporaryDirectory() as tmpdir: + await folder.sync_from_synapse_async( + path=tmpdir, + manifest="suppress", + synapse_client=self.syn, + ) + + # THEN no manifest.csv is created + assert not os.path.isfile(os.path.join(tmpdir, "manifest.csv")) + + async def test_manifest_includes_annotations(self, project_model: Project) -> None: + # GIVEN a file with mixed-type annotations + folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + folder = await folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(folder.id) + + f = self.create_file_instance() + f.parent_id = folder.id + f.annotations = { + "single_str": ["hello"], + "multi_str": ["a", "b", "c"], + "str_with_comma": ["hello,world", "plain text"], + "booleans": [True, False], + "integers": [1, 2, 3], + "floats": [1.0], + "datetimes": [ + datetime.datetime(2020, 1, 1, 0, 0, 0, 0, 
tzinfo=datetime.timezone.utc) + ], + } + f = await f.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(f.id) + + # WHEN I sync with manifest generation + with tempfile.TemporaryDirectory() as tmpdir: + await folder.sync_from_synapse_async( + path=tmpdir, + manifest="root", + synapse_client=self.syn, + ) + + manifest_path = os.path.join(tmpdir, "manifest.csv") + assert os.path.isfile(manifest_path) + + with open(manifest_path, newline="", encoding="utf8") as mf: + reader = csv.DictReader(mf) + rows = list(reader) + + # THEN annotation columns are present and correctly serialized + assert len(rows) == 1 + row = rows[0] + assert row["single_str"] == "hello" + assert row["multi_str"] == "[a,b,c]" + assert row["str_with_comma"] == '["hello,world",plain text]' + assert row["booleans"] == "[True,False]" + assert row["integers"] == "[1,2,3]" + assert row["floats"] == "1.0" + assert row["datetimes"] == "2020-01-01T00:00:00Z" + + async def test_manifest_includes_provenance(self, project_model: Project) -> None: + # GIVEN a file with activity (provenance) + folder = Folder(name=str(uuid.uuid4()), parent_id=project_model.id) + folder = await folder.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(folder.id) + + f = self.create_file_instance() + f.parent_id = folder.id + f.activity = Activity( + name="my_activity", + description="my_description", + used=[UsedURL(name="my_source", url=self.BOGUS_URL)], + ) + f = await f.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(f.id) + + # WHEN I sync with manifest generation and include_activity=True + with tempfile.TemporaryDirectory() as tmpdir: + await folder.sync_from_synapse_async( + path=tmpdir, + manifest="root", + include_activity=True, + synapse_client=self.syn, + ) + + manifest_path = os.path.join(tmpdir, "manifest.csv") + assert os.path.isfile(manifest_path) + + with open(manifest_path, newline="", encoding="utf8") as mf: + reader = csv.DictReader(mf) + rows = list(reader) + + # 
THEN provenance columns are populated + assert len(rows) == 1 + row = rows[0] + assert row["activityName"] == "my_activity" + assert row["activityDescription"] == "my_description" + assert row["used"] == "my_source" diff --git a/tests/unit/synapseclient/models/async/unit_test_folder_async.py b/tests/unit/synapseclient/models/async/unit_test_folder_async.py index f465206a2..c296496f9 100644 --- a/tests/unit/synapseclient/models/async/unit_test_folder_async.py +++ b/tests/unit/synapseclient/models/async/unit_test_folder_async.py @@ -1,5 +1,6 @@ """Tests for the Folder class.""" +import os import uuid from typing import Dict from unittest.mock import AsyncMock, MagicMock, patch @@ -9,7 +10,7 @@ from synapseclient import Folder as Synapse_Folder from synapseclient import Synapse from synapseclient.core.constants import concrete_types -from synapseclient.core.constants.concrete_types import FILE_ENTITY +from synapseclient.core.constants.concrete_types import FILE_ENTITY, FOLDER_ENTITY from synapseclient.core.exceptions import SynapseNotFoundError from synapseclient.models import FailureStrategy, File, Folder from synapseclient.models.project_setting import ProjectSetting @@ -824,6 +825,219 @@ async def mock_get_children(*args, **kwargs): assert result.files[0].id == SYN_456 assert result.files[0].name == "example_file_1" + async def test_sync_from_synapse_manifest_all_generates_per_directory( + self, + ) -> None: + SUB_FOLDER_ID = "syn789" + SUB_FOLDER_NAME = "sub_folder" + FILE_2_ID = "syn012" + FILE_2_NAME = "example_file_2" + + # GIVEN a root folder with one file and one subfolder containing one file + folder = Folder(id=SYN_123) + + root_children = [ + {"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}, + {"id": SUB_FOLDER_ID, "type": FOLDER_ENTITY, "name": SUB_FOLDER_NAME}, + ] + sub_children = [ + {"id": FILE_2_ID, "type": FILE_ENTITY, "name": FILE_2_NAME}, + ] + get_children_call_count = 0 + + async def mock_get_children(*args, **kwargs): + nonlocal 
get_children_call_count + children = root_children if get_children_call_count == 0 else sub_children + get_children_call_count += 1 + for child in children: + yield child + + downloaded_file_1 = File( + id=SYN_456, + name="example_file_1", + parent_id=SYN_123, + ) + downloaded_file_2 = File( + id=FILE_2_ID, + name=FILE_2_NAME, + parent_id=SUB_FOLDER_ID, + ) + file_map = {SYN_456: downloaded_file_1, FILE_2_ID: downloaded_file_2} + + async def mock_file_get(self_file, **kwargs): + return file_map[self_file.id] + + async def mock_get_entity_bundle(entity_id, *args, **kwargs): + if entity_id == SUB_FOLDER_ID: + return { + "entity": { + "concreteType": concrete_types.FOLDER_ENTITY, + "id": SUB_FOLDER_ID, + "name": SUB_FOLDER_NAME, + "parentId": SYN_123, + "etag": ETAG, + "createdOn": CREATED_ON, + "modifiedOn": MODIFIED_ON, + "createdBy": CREATED_BY, + "modifiedBy": MODIFIED_BY, + } + } + return self.get_example_rest_api_folder_output() + + # WHEN I call sync_from_synapse with manifest="all" and a path + with ( + patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), + patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + side_effect=mock_get_entity_bundle, + ), + patch( + "synapseclient.models.file.File.get_async", + side_effect=mock_file_get, + ), + patch( + "synapseclient.models.mixins.storable_container.os.path.exists", + return_value=True, + ), + patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate, + ): + await folder.sync_from_synapse_async( + path="/tmp/mydir", manifest="all", synapse_client=self.syn + ) + + # THEN generate_manifest_csv is called once per directory (root + subfolder) + assert mock_generate.call_count == 2 + calls_by_path = { + c.kwargs["path"]: c.kwargs["all_files"] + for c in mock_generate.call_args_list + } + assert any(f.id == SYN_456 for f in calls_by_path["/tmp/mydir"]) + assert any( + f.id == FILE_2_ID + for f in 
calls_by_path[os.path.join("/tmp/mydir", SUB_FOLDER_NAME)] + ) + + async def test_sync_from_synapse_manifest_root_generates_only_at_root( + self, + ) -> None: + # GIVEN a Folder object with a path + folder = Folder(id=SYN_123) + children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}] + + async def mock_get_children(*args, **kwargs): + for child in children: + yield child + + downloaded_file = File( + id=SYN_456, + name="example_file_1", + path="/tmp/mydir/example_file_1.txt", + parent_id=SYN_123, + ) + + # WHEN I call sync_from_synapse with manifest="root" and a path + with ( + patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), + patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value=self.get_example_rest_api_folder_output(), + ), + patch( + "synapseclient.models.file.File.get_async", + return_value=downloaded_file, + ), + patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate, + ): + await folder.sync_from_synapse_async( + path="/tmp/mydir", manifest="root", synapse_client=self.syn + ) + + # THEN generate_manifest_csv should be called exactly once with the root path + mock_generate.assert_called_once() + assert mock_generate.call_args.kwargs["path"] == "/tmp/mydir" + assert mock_generate.call_args.kwargs["all_files"][0].id == SYN_456 + + async def test_sync_from_synapse_manifest_suppress_skips_generation( + self, + ) -> None: + # GIVEN a Folder object with a path + folder = Folder(id=SYN_123) + children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}] + + async def mock_get_children(*args, **kwargs): + for child in children: + yield child + + # WHEN I call sync_from_synapse with manifest="suppress" + with ( + patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), + patch( + 
"synapseclient.api.entity_factory.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value=self.get_example_rest_api_folder_output(), + ), + patch( + "synapseclient.models.file.File.get_async", + return_value=(File(id=SYN_456, name="example_file_1")), + ), + patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate, + ): + await folder.sync_from_synapse_async( + path="/tmp/mydir", manifest="suppress", synapse_client=self.syn + ) + + # THEN generate_manifest_csv should never be called + mock_generate.assert_not_called() + + async def test_sync_from_synapse_no_manifest_without_path(self) -> None: + # GIVEN a Folder with no path specified + folder = Folder(id=SYN_123) + children = [{"id": SYN_456, "type": FILE_ENTITY, "name": "example_file_1"}] + + async def mock_get_children(*args, **kwargs): + for child in children: + yield child + + # WHEN I call sync_from_synapse with no path (default manifest="all") + with ( + patch( + "synapseclient.models.mixins.storable_container.get_children", + side_effect=mock_get_children, + ), + patch( + "synapseclient.api.entity_factory.get_entity_id_bundle2", + new_callable=AsyncMock, + return_value=self.get_example_rest_api_folder_output(), + ), + patch( + "synapseclient.models.file.File.get_async", + return_value=(File(id=SYN_456, name="example_file_1")), + ), + patch( + "synapseclient.models.mixins.storable_container.generate_manifest_csv", + ) as mock_generate, + ): + await folder.sync_from_synapse_async(synapse_client=self.syn) + + # THEN generate_manifest_csv should not be called (no path to write to) + mock_generate.assert_not_called() + class TestStorageLocationMixin: """Tests for ProjectSettingsMixin methods on Folder.""" diff --git a/tests/unit/synapseclient/services/unit_test_manifest.py b/tests/unit/synapseclient/services/unit_test_manifest.py new file mode 100644 index 000000000..e13f8f574 --- /dev/null +++ b/tests/unit/synapseclient/services/unit_test_manifest.py @@ -0,0 
+1,474 @@ +import csv +import datetime +import os +import tempfile + +import pytest + +from synapseclient.models import Activity, File +from synapseclient.models.activity import UsedEntity, UsedURL +from synapseclient.models.services.manifest import ( + MANIFEST_CSV_FILENAME, + _convert_manifest_data_items_to_string_list, + _convert_manifest_data_row_to_dict, + _extract_entity_metadata_for_manifest_csv, + _get_entity_provenance_dict_for_manifest, + _manifest_csv_filename, + _write_manifest_data_csv, + generate_manifest_csv, +) + + +class TestManifestCsvFilename: + """Tests for the _manifest_csv_filename helper.""" + + def test_plain_directory(self) -> None: + # GIVEN a plain absolute path + # WHEN _manifest_csv_filename is called + result = _manifest_csv_filename("/tmp/mydir") + + # THEN it joins the path with the manifest filename + assert result == os.path.join("/tmp/mydir", MANIFEST_CSV_FILENAME) + + def test_tilde_is_expanded(self) -> None: + # GIVEN a path starting with ~ + # WHEN _manifest_csv_filename is called + result = _manifest_csv_filename("~/mydir") + + # THEN ~ is expanded to the user's home directory + assert result == os.path.join( + os.path.expanduser("~/mydir"), MANIFEST_CSV_FILENAME + ) + assert "~" not in result + + def test_filename_is_manifest_csv(self) -> None: + # GIVEN any directory path + # WHEN _manifest_csv_filename is called + result = _manifest_csv_filename("/some/path") + + # THEN the basename of the result is MANIFEST_CSV_FILENAME + assert os.path.basename(result) == MANIFEST_CSV_FILENAME + + +class TestGenerateManifestCsv: + """Tests for the generate_manifest_csv and related helper functions.""" + + def _make_file( + self, + syn_id: str = "syn123", + name: str = "file.txt", + path: str = "/data/file.txt", + parent_id: str = "syn456", + content_type: str = "text/plain", + synapse_store: bool = True, + annotations: dict = None, + activity: Activity = None, + ) -> File: + f = File( + id=syn_id, + name=name, + path=path, + 
parent_id=parent_id, + content_type=content_type, + synapse_store=synapse_store, + ) + if annotations: + f.annotations = annotations + if activity: + f.activity = activity + return f + + def test_extract_entity_metadata_includes_annotations_and_activity(self) -> None: + # GIVEN a File entity with provenance + activity = Activity( + name="My Pipeline", + description="Run analysis", + used=[UsedEntity(target_id="syn111", target_version_number=1)], + executed=[UsedURL(url="https://github.com/example/pipeline")], + ) + f = self._make_file( + activity=activity, annotations={"tissue": ["brain"], "count": [42]} + ) + + # WHEN metadata is extracted + keys, data = _extract_entity_metadata_for_manifest_csv([f]) + + # THEN provenance keys are present in the column list + assert { + "used", + "executed", + "activityName", + "activityDescription", + "tissue", + "count", + }.issubset(keys) + + assert data[0]["parentId"] == "syn456" + assert data[0]["ID"] == "syn123" + assert data[0]["path"] == "/data/file.txt" + assert data[0]["name"] == "file.txt" + assert data[0]["activityName"] == "My Pipeline" + assert data[0]["activityDescription"] == "Run analysis" + assert data[0]["used"] == "syn111.1" + assert data[0]["executed"] == "https://github.com/example/pipeline" + assert data[0]["tissue"] == "brain" + assert data[0]["count"] == "42" + + def test_generate_manifest_csv_data_items_are_converted_to_strings(self) -> None: + # GIVEN a File with a name containing a comma and mixed-type annotations + f = self._make_file( + name="a, b, c", + path="/data/file.txt", + annotations={ + "single_str": "hello", + "multi_str": ["a", "b", "c"], + "str_with_comma": ["hello,world", "plain text"], + "booleans": [True, False], + "integers": [1], + "floats": [1.0], + "single_dt": [ + datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) + ], + "multi_dt": [ + datetime.datetime( + 2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + datetime.datetime( + 2021, 6, 15, 12, 30, 0, 
tzinfo=datetime.timezone.utc + ), + ], + }, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + # WHEN generate_manifest_csv is called + generate_manifest_csv(all_files=[f], path=tmpdir) + manifest_path = os.path.join(tmpdir, "manifest.csv") + content = open(manifest_path, encoding="utf8").read() + with open(manifest_path, newline="", encoding="utf8") as fp: + row = next(csv.DictReader(fp)) + + assert '"a, b, c"' in content + assert row["single_str"] == "hello" + assert row["multi_str"] == "[a,b,c]" + assert row["str_with_comma"] == '["hello,world",plain text]' + assert row["booleans"] == "[True,False]" + assert row["integers"] == "1" + assert row["floats"] == "1.0" + assert row["single_dt"] == "2020-01-01T00:00:00Z" + assert row["multi_dt"] == "[2020-01-01T00:00:00Z,2021-06-15T12:30:00Z]" + + def test_generate_manifest_csv_with_only_header_row(self) -> None: + # GIVEN an empty file list + with tempfile.TemporaryDirectory() as tmpdir: + # WHEN generate_manifest_csv is called with no files + generate_manifest_csv(all_files=[], path=tmpdir) + + # THEN the manifest.csv file is created with only the header row and no data rows + manifest_path = os.path.join(tmpdir, "manifest.csv") + with open(manifest_path, newline="", encoding="utf8") as fp: + reader = csv.DictReader(fp) + rows = list(reader) + assert reader.fieldnames == [ + "path", + "parentId", + "name", + "ID", + "synapseStore", + "contentType", + "used", + "executed", + "activityName", + "activityDescription", + ] + assert rows == [] + + def test_generate_manifest_csv_with_path_None_raises_ValueError(self) -> None: + # GIVEN an empty file list + with tempfile.TemporaryDirectory() as tmpdir: + # WHEN generate_manifest_csv is called with path=None + with pytest.raises( + ValueError, + match="The path argument is required to generate a manifest.csv file.", + ): + generate_manifest_csv(all_files=[], path=None) + + def test_generate_manifest_csv_quotes_values_with_commas(self) -> None: + # GIVEN a File whose name 
contains a comma + f = self._make_file(name="file, extra.txt", path="/tmp/file, extra.txt") + + with tempfile.TemporaryDirectory() as tmpdir: + generate_manifest_csv(all_files=[f], path=tmpdir) + manifest_path = os.path.join(tmpdir, "manifest.csv") + content = open(manifest_path, encoding="utf8").read() + # THEN the comma-containing value is quoted in the CSV + assert '"file, extra.txt"' in content + + +class TestWriteManifestDataCsv: + """Tests for the _write_manifest_data_csv helper.""" + + def test_writes_header_and_rows(self) -> None: + # GIVEN keys and one row of data + keys = ["path", "parentId", "name"] + data = [{"path": "/data/f.txt", "parentId": "syn1", "name": "f.txt"}] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + # WHEN _write_manifest_data_csv is called + _write_manifest_data_csv(filename, keys, data) + + with open(filename, newline="", encoding="utf8") as fp: + rows = list(csv.DictReader(fp)) + + # THEN header and row values are written correctly + assert len(rows) == 1 + assert rows[0]["path"] == "/data/f.txt" + assert rows[0]["parentId"] == "syn1" + assert rows[0]["name"] == "f.txt" + + def test_missing_keys_use_empty_string(self) -> None: + # GIVEN a row missing the "name" key + keys = ["path", "parentId", "name"] + data = [{"path": "/data/f.txt", "parentId": "syn1"}] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + _write_manifest_data_csv(filename, keys, data) + + with open(filename, newline="", encoding="utf8") as fp: + rows = list(csv.DictReader(fp)) + + # THEN the missing field is written as an empty string + assert rows[0]["name"] == "" + + def test_extra_keys_in_row_are_ignored(self) -> None: + # GIVEN a row with a key not in the fieldnames list + keys = ["path", "name"] + data = [{"path": "/data/f.txt", "name": "f.txt", "extra": "ignored"}] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, 
"manifest.csv") + # WHEN _write_manifest_data_csv is called + # THEN no exception is raised and only declared keys appear + _write_manifest_data_csv(filename, keys, data) + + with open(filename, newline="", encoding="utf8") as fp: + reader = csv.DictReader(fp) + rows = list(reader) + assert "extra" not in reader.fieldnames + + assert rows[0]["path"] == "/data/f.txt" + + def test_values_with_commas_are_quoted(self) -> None: + # GIVEN a value that contains a comma + keys = ["name", "parentId"] + data = [{"name": "file, with comma.txt", "parentId": "syn1"}] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + _write_manifest_data_csv(filename, keys, data) + content = open(filename, encoding="utf8").read() + + with open(filename, newline="", encoding="utf8") as fp: + rows = list(csv.DictReader(fp)) + + # THEN the comma-containing value is quoted in the raw CSV + assert '"file, with comma.txt"' in content + # AND reads back correctly + assert rows[0]["name"] == "file, with comma.txt" + + def test_empty_data_writes_header_only(self) -> None: + # GIVEN no data rows + keys = ["path", "parentId", "name"] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + _write_manifest_data_csv(filename, keys, []) + + with open(filename, newline="", encoding="utf8") as fp: + reader = csv.DictReader(fp) + rows = list(reader) + header = reader.fieldnames + + # THEN the file exists with only the header + assert rows == [] + assert header == keys + + def test_unicode_values_are_written_correctly(self) -> None: + # GIVEN a value with non-ASCII characters + keys = ["name", "parentId"] + data = [{"name": "données_été.txt", "parentId": "syn1"}] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + _write_manifest_data_csv(filename, keys, data) + + with open(filename, newline="", encoding="utf8") as fp: + rows = list(csv.DictReader(fp)) + + # THEN 
unicode characters round-trip correctly + assert rows[0]["name"] == "données_été.txt" + + def test_multiple_rows_written_in_order(self) -> None: + # GIVEN multiple rows + keys = ["name", "parentId"] + data = [ + {"name": "a.txt", "parentId": "syn1"}, + {"name": "b.txt", "parentId": "syn2"}, + {"name": "c.txt", "parentId": "syn3"}, + ] + + with tempfile.TemporaryDirectory() as tmpdir: + filename = os.path.join(tmpdir, "manifest.csv") + _write_manifest_data_csv(filename, keys, data) + + with open(filename, newline="", encoding="utf8") as fp: + rows = list(csv.DictReader(fp)) + + # THEN all rows are present and in order + assert len(rows) == 3 + assert [r["name"] for r in rows] == ["a.txt", "b.txt", "c.txt"] + + +class TestGetEntityProvenanceDictForManifest: + """Tests for _get_entity_provenance_dict_for_manifest.""" + + def _make_file_with_activity(self, activity: Activity = None) -> File: + f = File(id="syn1", name="f.txt", path="/f.txt", parent_id="syn2") + if activity: + f.activity = activity + return f + + def test_returns_empty_dict_when_no_activity(self) -> None: + f = self._make_file_with_activity() + result = _get_entity_provenance_dict_for_manifest(f) + assert result == {} + + def test_returns_all_provenance_keys_with_activity(self) -> None: + activity = Activity( + name="Pipeline", + description="Runs analysis", + used=[UsedEntity(target_id="syn10", target_version_number=2)], + executed=[UsedURL(url="https://github.com/example/run")], + ) + f = self._make_file_with_activity(activity) + + result = _get_entity_provenance_dict_for_manifest(f) + assert result["used"] == "syn10.2" + assert result["executed"] == "https://github.com/example/run" + assert result["activityName"] == "Pipeline" + assert result["activityDescription"] == "Runs analysis" + + def test_activity_name_and_description_default_to_empty_string(self) -> None: + activity = Activity(name=None, description=None) + f = self._make_file_with_activity(activity) + + result = 
_get_entity_provenance_dict_for_manifest(f) + assert result["activityName"] == "" + assert result["activityDescription"] == "" + + def test_empty_used_and_executed_lists(self) -> None: + activity = Activity(name="minimal", used=[], executed=[]) + f = self._make_file_with_activity(activity) + + result = _get_entity_provenance_dict_for_manifest(f) + + assert result["activityName"] == "minimal" + assert result["used"] == "" + assert result["executed"] == "" + + def test_multiple_used_and_executed_are_semicolon_joined(self) -> None: + # GIVEN an activity with multiple used and executed entries + activity = Activity( + name="multi", + used=[ + UsedEntity(target_id="syn1", target_version_number=1), + UsedEntity(target_id="syn2", target_version_number=3), + ], + executed=[ + UsedURL(url="https://github.com/a"), + UsedURL(url="https://github.com/b"), + ], + ) + f = self._make_file_with_activity(activity) + + result = _get_entity_provenance_dict_for_manifest(f) + + assert result["activityName"] == "multi" + assert result["used"] == "syn1.1;syn2.3" + assert result["executed"] == "https://github.com/a;https://github.com/b" + + +_UTC = datetime.timezone.utc + + +class TestConvertManifestDataItemsToStringList: + """Tests for _convert_manifest_data_items_to_string_list.""" + + @pytest.mark.parametrize( + "items,expected", + [ + ([], ""), + (["hello"], "hello"), + # single item with comma is NOT quoted — quoting only applies in multi-item lists + (["hello,world"], "hello,world"), + (["a", "b", "c"], "[a,b,c]"), + (["hello,world", "plain"], '["hello,world",plain]'), + ([True], "True"), + ([True, False], "[True,False]"), + ([42], "42"), + ([1, 2, 3], "[1,2,3]"), + ([1.5], "1.5"), + ( + [datetime.datetime(2020, 1, 1, tzinfo=_UTC)], + "2020-01-01T00:00:00Z", + ), + ( + [ + datetime.datetime(2020, 1, 1, tzinfo=_UTC), + datetime.datetime(2021, 6, 15, 12, 30, tzinfo=_UTC), + ], + "[2020-01-01T00:00:00Z,2021-06-15T12:30:00Z]", + ), + ], + ) + def test_converts_items(self, items: list, 
expected: str) -> None: + assert _convert_manifest_data_items_to_string_list(items) == expected + + +class TestConvertManifestDataRowToDict: + """Tests for _convert_manifest_data_row_to_dict.""" + + def test_all_keys_present_passes_through(self) -> None: + row = {"path": "/f.txt", "parentId": "syn1", "name": "f.txt"} + keys = ["path", "parentId", "name"] + + result = _convert_manifest_data_row_to_dict(row, keys) + + assert result == {"path": "/f.txt", "parentId": "syn1", "name": "f.txt"} + + def test_missing_key_defaults_to_empty_string(self) -> None: + row = {"path": "/f.txt", "parentId": "syn1"} + keys = ["path", "parentId", "name"] + + result = _convert_manifest_data_row_to_dict(row, keys) + + assert result["name"] == "" + + def test_list_value_converted_to_string(self) -> None: + row = {"tags": ["a", "b", "c"]} + keys = ["tags"] + + result = _convert_manifest_data_row_to_dict(row, keys) + + assert result["tags"] == "[a,b,c]" + + def test_extra_keys_in_row_are_not_included_in_output(self) -> None: + row = {"path": "/f.txt", "extra": "ignored"} + keys = ["path"] + + result = _convert_manifest_data_row_to_dict(row, keys) + + assert "extra" not in result + assert result == {"path": "/f.txt"} diff --git a/tests/unit/synapseutils/unit_test_synapseutils_sync.py b/tests/unit/synapseutils/unit_test_synapseutils_sync.py index a2fc2580b..8187c6802 100644 --- a/tests/unit/synapseutils/unit_test_synapseutils_sync.py +++ b/tests/unit/synapseutils/unit_test_synapseutils_sync.py @@ -1916,3 +1916,21 @@ def test_multiple_item(self) -> None: "baz", '"foo, bar, baz"', ] + + +class TestSyncFromSynapseDeprecation: + """Tests for the deprecation of syncFromSynapse.""" + + def test_syncFromSynapse_emits_deprecation_warning(self, syn: Synapse) -> None: + # GIVEN the legacy syncFromSynapse function + # WHEN it is called + # THEN a DeprecationWarning is raised pointing to StorableContainer + with pytest.warns( + DeprecationWarning, match="StorableContainer.sync_from_synapse" + ): + with 
patch.object( + sync, + "syncFromSynapse_async", + return_value=AsyncMock(return_value=[])(), + ): + sync.syncFromSynapse(syn=syn, entity="syn123")