Skip to content

Commit df4e111

Browse files
committed
reorg manifest.py
1 parent f759bf3 commit df4e111

1 file changed

Lines changed: 76 additions & 81 deletions

File tree

synapseclient/models/services/manifest.py

Lines changed: 76 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,46 @@
1-
"""Functions for generating manifest CSV files from File entities."""
1+
"""Services for reading and writing Synapse manifest CSV files.
22
3+
This includes reading a manifest CSV file and preparing it for upload, as well
4+
as writing a manifest CSV file from a list of File entities.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import ast
10+
import asyncio
311
import csv
412
import datetime
513
import io
614
import os
7-
from typing import TYPE_CHECKING, Any, Union
15+
import re
16+
from dataclasses import dataclass
17+
from typing import TYPE_CHECKING, Any, Iterable, NamedTuple, Union
818

19+
from synapseclient import Synapse
920
from synapseclient.core import utils
21+
from synapseclient.core.exceptions import (
22+
SynapseFileNotFoundError,
23+
SynapseHTTPError,
24+
SynapseProvenanceError,
25+
)
26+
from synapseclient.core.utils import (
27+
bool_or_none,
28+
datetime_or_none,
29+
get_synid_and_version,
30+
is_synapse_id_str,
31+
is_url,
32+
test_import_pandas,
33+
topolgical_sort,
34+
)
35+
from synapseclient.operations.factory_operations import FileOptions
36+
from synapseclient.operations.factory_operations import get_async as factory_get_async
1037

1138
if TYPE_CHECKING:
12-
from synapseclient.models import File
39+
from pandas import DataFrame, Series
40+
41+
from synapseclient.models import UsedEntity, UsedURL
42+
from synapseclient.models.file import File
43+
1344

1445
MANIFEST_CSV_FILENAME = "manifest.csv"
1546
DEFAULT_GENERATED_MANIFEST_CSV_KEYS = [
@@ -24,6 +55,47 @@
2455
"activityName",
2556
"activityDescription",
2657
]
58+
#: Scalar types that Synapse supports as annotation values.
59+
SynapseAnnotationType = datetime.datetime | float | int | bool | str
60+
61+
# Columns that are NOT annotations — stripped before building File.annotations.
62+
# Covers the standard manifest columns plus the extra metadata columns produced
63+
# by the Synapse UI download cart and the `synapse get-download-list` CLI command.
64+
NON_ANNOTATION_COLUMNS = frozenset(
65+
[
66+
# Standard manifest columns used directly during upload
67+
"path",
68+
"parentId",
69+
"ID",
70+
"name",
71+
"synapseStore",
72+
"contentType",
73+
"activityName",
74+
"activityDescription",
75+
"forceVersion",
76+
"used",
77+
"executed",
78+
# Download-list / Synapse UI informational columns — ignore for upload
79+
"error",
80+
"versionNumber",
81+
"dataFileSizeBytes",
82+
"createdBy",
83+
"createdOn",
84+
"modifiedBy",
85+
"modifiedOn",
86+
"synapseURL",
87+
"dataFileMD5Hex",
88+
]
89+
)
90+
91+
# Regex patterns used when parsing annotation cell values.
92+
# Matches a cell that is a bracket-delimited list, e.g. "[a, b, c]".
93+
# Disallows ']' inside to avoid matching adjacent lists like "[a][b]".
94+
_ARRAY_BRACKET_PATTERN = re.compile(r"^\[[^\]]*\]$")
95+
# https://stackoverflow.com/questions/18893390/splitting-on-comma-outside-quotes
96+
_COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)")
97+
# Valid Synapse file name characters (1–256 chars).
98+
_FILE_NAME_PATTERN = re.compile(r"^[`\w \-\+\.\(\)]{1,256}$")
2799

28100

29101
def _manifest_csv_filename(path: str) -> str:
@@ -218,7 +290,7 @@ def _write_manifest_data_csv(path: str, keys: list[str], data: list[dict]) -> No
218290
writer.writerow(_convert_manifest_data_row_to_dict(row, keys))
219291

220292

221-
def generate_manifest_csv(all_files: list["File"], path: str) -> None:
293+
def generate_manifest_csv(all_files: list[File], path: str) -> None:
222294
"""Generates a manifest.csv file based on a list of File entities.
223295
224296
The generated file uses CSV format with comma delimiter and is interoperable
@@ -233,83 +305,6 @@ def generate_manifest_csv(all_files: list["File"], path: str) -> None:
233305
filename = _manifest_csv_filename(path=path)
234306
keys, data = _extract_entity_metadata_for_manifest_csv(all_files=all_files)
235307
_write_manifest_data_csv(filename, keys, data)
236-
"""Services for reading a Synapse manifest CSV file and preparing it for upload."""
237-
238-
from __future__ import annotations
239-
240-
import ast
241-
import asyncio
242-
import datetime
243-
import os
244-
import re
245-
from dataclasses import dataclass
246-
from typing import TYPE_CHECKING, Iterable, NamedTuple
247-
248-
from synapseclient import Synapse
249-
from synapseclient.core.exceptions import (
250-
SynapseFileNotFoundError,
251-
SynapseHTTPError,
252-
SynapseProvenanceError,
253-
)
254-
from synapseclient.core.utils import (
255-
bool_or_none,
256-
datetime_or_none,
257-
get_synid_and_version,
258-
is_synapse_id_str,
259-
is_url,
260-
test_import_pandas,
261-
topolgical_sort,
262-
)
263-
from synapseclient.operations.factory_operations import FileOptions
264-
from synapseclient.operations.factory_operations import get_async as factory_get_async
265-
266-
if TYPE_CHECKING:
267-
from pandas import DataFrame, Series
268-
269-
from synapseclient.models import UsedEntity, UsedURL
270-
from synapseclient.models.file import File
271-
272-
#: Scalar types that Synapse supports as annotation values.
273-
SynapseAnnotationType = datetime.datetime | float | int | bool | str
274-
275-
# Columns that are NOT annotations — stripped before building File.annotations.
276-
# Covers the standard manifest columns plus the extra metadata columns produced
277-
# by the Synapse UI download cart and synapse get-download-list CLI.
278-
NON_ANNOTATION_COLUMNS = frozenset(
279-
[
280-
# Standard manifest columns used directly during upload
281-
"path",
282-
"parentId",
283-
"ID",
284-
"name",
285-
"synapseStore",
286-
"contentType",
287-
"activityName",
288-
"activityDescription",
289-
"forceVersion",
290-
"used",
291-
"executed",
292-
# Download-list / Synapse UI informational columns — ignore for upload
293-
"error",
294-
"versionNumber",
295-
"dataFileSizeBytes",
296-
"createdBy",
297-
"createdOn",
298-
"modifiedBy",
299-
"modifiedOn",
300-
"synapseURL",
301-
"dataFileMD5Hex",
302-
]
303-
)
304-
305-
# Regex patterns used when parsing annotation cell values.
306-
# Matches a cell that is a bracket-delimited list, e.g. "[a, b, c]".
307-
# Disallows ']' inside to avoid matching adjacent lists like "[a][b]".
308-
_ARRAY_BRACKET_PATTERN = re.compile(r"^\[[^\]]*\]$")
309-
# https://stackoverflow.com/questions/18893390/splitting-on-comma-outside-quotes
310-
_COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)")
311-
# Valid Synapse file name characters (1–256 chars).
312-
_FILE_NAME_PATTERN = re.compile(r"^[`\w \-\+\.\(\)]{1,256}$")
313308

314309

315310
class UploadSyncFile(NamedTuple):

0 commit comments

Comments
 (0)