Merged
9 changes: 5 additions & 4 deletions schematic/store/synapse.py
@@ -19,12 +19,10 @@
import numpy as np
import pandas as pd
import synapseclient
import synapseutils
Member Author

If you're using VS Code, you may see unused imports "grayed out". It's OK to remove them in PRs; the intent is that each PR we contribute should leave the codebase in a better place.

Contributor

Pylint would also catch this, but this module hasn't yet been cleaned up so that it passes, so it isn't included in the pylint check.

from opentelemetry import trace
from synapseclient import Annotations as OldAnnotations
from synapseclient import (
Column,
Entity,
EntityViewSchema,
EntityViewType,
File,
@@ -705,13 +703,16 @@ def getFilesInStorageDataset(
ValueError: Dataset ID not found.
"""
file_list = []
Member Author

Work with Bryan to compare the difference in speed between the dev branch and the prod branch within SigNoz.

Member

#1552 is necessary to filter out the difference between branches in GitHub runs. It's possible to get the data now, it's just a bit more difficult to filter. As of now, the average duration of this function plotted over time:

[screenshot: average duration of getFilesInStorageDataset over time]

Member

Feel free to pull develop into this feature branch, and we can compare the develop branch's performance against this feature branch's.

Member Author

Thanks @BryanFauble. Done

Member

@BryanFauble Nov 26, 2024

@thomasyu888 These are the results over the past 5 days. We can see your branch has a much better average execution time for this function, although for some reason it is similar to bwmac/SCHEMATIC-163/error-message-update.

[screenshot: average execution time by branch over the past 5 days]

We can select on a few fields to filter for what we want, average the duration of the function, then group by a few fields to get these results.

Some other tests:
[screenshots omitted]
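
For context, a minimal sketch of how such durations can end up in SigNoz, assuming spans are created with the OpenTelemetry tracer that synapse.py already imports; the span and attribute names below are illustrative, not the project's actual instrumentation:

from opentelemetry import trace

tracer = trace.get_tracer(__name__)

def traced_get_files(store, dataset_id):
    # Wrap the call in a span so a backend such as SigNoz can average its
    # duration and group the results by attributes like branch or CI run.
    with tracer.start_as_current_span("getFilesInStorageDataset") as span:
        span.set_attribute("git.branch", "develop")  # hypothetical attribute
        return store.getFilesInStorageDataset(datasetId=dataset_id, fileNames=None)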


# HACK: must requery the fileview to get new files, since SynapseStorage will query the last state
# of the fileview which may not contain any new folders in the fileview.
# This is a workaround to fileviews not always containing the latest information
# self.query_fileview(force_requery=True)
# Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view
if self.storageFileviewTable.empty:
raise ValueError(
f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
)

# Get path to dataset folder by using children to avoid cases where the dataset is the scope of the view
child_path = self.storageFileviewTable.loc[
self.storageFileviewTable["parentId"] == datasetId, "path"
]
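
As a rough illustration of the children-based lookup above (all values invented): even if the dataset folder has no row of its own because it is the scope of the view, its children's parentId still points back to it, so the folder path can be recovered from a child's path.

import pandas as pd

# Toy stand-in for the Synapse fileview table
fileview = pd.DataFrame(
    {
        "id": ["syn126", "syn125"],
        "parentId": ["syn_mock", "syn_mock"],
        "path": [
            "project/parent_folder/test_file",
            "project/parent_folder/test_folder/test_file_2",
        ],
    }
)

# Find the paths of the dataset's children, then derive the folder path from one
# of them; this mirrors the idea of the lookup, not the exact production logic.
child_path = fileview.loc[fileview["parentId"] == "syn_mock", "path"]
dataset_folder = child_path.iloc[0].rsplit("/", 1)[0]
print(dataset_folder)  # project/parent_folder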
60 changes: 32 additions & 28 deletions tests/integration/test_commands.py
@@ -95,14 +95,14 @@ def test_validate_valid_manifest(self, runner: CliRunner) -> None:
# command has no (python) errors, has exit code 0
assert result.exit_code == 0
# command output has success message
assert result.output.split("\n")[4] == (
result_list = result.output.split("\n")
assert (
"Your manifest has been validated successfully. "
"There are no errors in your manifest, "
"and it can be submitted without any modifications."
)
) in result_list
# command output has no validation errors
for line in result.output.split("\n")[4]:
assert not line.startswith("error")
errors = [line for line in result_list if line.startswith("error")]
assert not errors

def test_validate_invalid_manifest(self, runner: CliRunner) -> None:
"""
@@ -504,9 +504,10 @@ def test_generate_empty_excel_manifest(
os.remove("tests/data/example.Biospecimen.schema.json")

# command output has excel file creation message
result_list = result.output.split("\n")
assert (
result.output.split("\n")[7]
== "Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx"
"Find the manifest template using this Excel file path: ./CLI_empty_excel.xlsx"
in result_list
)

sheet1 = workbook["Sheet1"]
@@ -665,18 +666,19 @@ def test_generate_bulk_rna_google_sheet_manifest(
# Reset config to its default values
CONFIG.load_config("config_example.yml")

assert result.output.split("\n")[8] == (
"Find the manifest template using this Google Sheet URL:"
)
assert result.output.split("\n")[9].startswith(
"https://docs.google.com/spreadsheets/d/"
)
assert result.output.split("\n")[10] == (
result_list = result.output.split("\n")
assert "Find the manifest template using this Google Sheet URL:" in result_list
assert (
"Find the manifest template using this CSV file path: "
"./CLI_gs_bulk_rna.csv"
)

google_sheet_url = result.output.split("\n")[9]
) in result_list
google_sheet_result = [
result
for result in result_list
if result.startswith("https://docs.google.com/spreadsheets/d/")
]
assert len(google_sheet_result) == 1
google_sheet_url = google_sheet_result[0]

# Download the Google Sheets content as an Excel file and load into openpyxl
export_url = f"{google_sheet_url}/export?format=xlsx"
@@ -908,18 +910,19 @@ def test_generate_bulk_rna_google_sheet_manifest_with_annotations(
os.remove("tests/data/example.BulkRNA-seqAssay.schema.json")
os.remove("./CLI_gs_bulk_rna_annos.csv")

assert result.output.split("\n")[11] == (
"Find the manifest template using this Google Sheet URL:"
)
assert result.output.split("\n")[12].startswith(
"https://docs.google.com/spreadsheets/d/"
)
assert result.output.split("\n")[13] == (
result_list = result.output.split("\n")
assert "Find the manifest template using this Google Sheet URL:" in result_list
assert (
"Find the manifest template using this CSV file path: "
"./CLI_gs_bulk_rna_annos.csv"
)

google_sheet_url = result.output.split("\n")[12]
) in result_list
google_sheet_result = [
result
for result in result_list
if result.startswith("https://docs.google.com/spreadsheets/d/")
]
assert len(google_sheet_result) == 1
google_sheet_url = google_sheet_result[0]

# Download the Google Sheets content as an Excel file and load into openpyxl
export_url = f"{google_sheet_url}/export?format=xlsx"
@@ -1177,10 +1180,11 @@ def test_generate_mock_component_excel_manifest(self, runner: CliRunner) -> None
# TODO: remove with https://sagebionetworks.jira.com/browse/SCHEMATIC-202
# Reset config to its default values
CONFIG.load_config("config_example.yml")

# Command output has excel file message
assert result.output.split("\n")[8] == (
result_list = result.output.split("\n")
assert (
"Find the manifest template using this Excel file path: ./CLI_mock_comp.xlsx"
in result_list
)

sheet1 = workbook["Sheet1"]
7 changes: 5 additions & 2 deletions tests/integration/test_metadata_model.py
@@ -139,7 +139,6 @@ async def test_submit_filebased_manifest_file_and_entities_valid_manifest_submit
dir=create_temp_folder(path=tempfile.gettempdir()),
) as tmp_file:
df.to_csv(tmp_file.name, index=False)

# WHEN the manifest is submitted (Assertions are handled in the helper method)
self._submit_and_verify_manifest(
helpers=helpers,
@@ -229,7 +228,6 @@ async def test_submit_filebased_manifest_file_and_entities_mock_filename(
dir=create_temp_folder(path=tempfile.gettempdir()),
) as tmp_file:
df.to_csv(tmp_file.name, index=False)

# WHEN the manifest is submitted (Assertions are handled in the helper method)
self._submit_and_verify_manifest(
helpers=helpers,
@@ -450,6 +448,11 @@ def _submit_and_verify_manifest(
raise ValueError(
"expected_manifest_id or expected_manifest_name must be provided"
)
# HACK: must requery the fileview to get new files, since SynapseStorage will query the last state
# of the fileview, which may not contain newly created folders.
# This is a workaround for fileviews not always containing the latest information,
# since the tests don't always follow the same process as testing resources are created and destroyed.
synapse_store.query_fileview(force_requery=True)

# Spies
if already_spied:
57 changes: 25 additions & 32 deletions tests/integration/test_store_synapse.py
@@ -400,38 +400,31 @@ def test_mock_get_files_in_storage_dataset(
with patch(
"schematic.store.synapse.CONFIG", return_value=TEST_CONFIG
) as mock_config:
with patch.object(synapse_store, "syn") as mock_synapse_client:
Member Author

This is me misleading Gianna, but there are times when we want a connection with Synapse and there are times we don't.

This is probably a case where it is OK for there to be a direct connection with Synapse to simplify the mocking, since you've included the mocked version in your other test, tests/test_store.py.
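
A loose sketch of the two routes, assuming a synapse_store fixture like the one used in these tests; the data and assertions are illustrative rather than the exact tests in this PR:

import pandas as pd
from unittest.mock import patch

def test_files_with_mocked_fileview(synapse_store):
    # Mocked route: patch the cached fileview and the query method so nothing
    # reaches Synapse; the rows below are invented for illustration.
    mock_df = pd.DataFrame(
        {
            "id": ["syn126"],
            "parentId": ["syn_mock"],
            "path": ["schematic - main/parent_folder/test_file"],
        }
    )
    with patch.object(synapse_store, "storageFileviewTable", mock_df), patch.object(
        synapse_store, "query_fileview", return_value=mock_df
    ):
        files = synapse_store.getFilesInStorageDataset(datasetId="syn_mock", fileNames=None)
        assert files  # exact tuple shape follows the cases in tests/test_store.py

def test_files_with_live_connection(synapse_store):
    # Direct route: the fixture's real Synapse client answers the query; simpler
    # to set up, at the cost of needing credentials and a real dataset ID.
    real_dataset_id = "syn12345678"  # hypothetical ID; substitute a real dataset
    files = synapse_store.getFilesInStorageDataset(datasetId=real_dataset_id, fileNames=None)
    assert isinstance(files, list)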

# AND the appropriate asset view
mock_config.synapse_master_fileview_id = asset_view
# AND the appropriately filtered filenames
if filenames:
files_to_remove = []
for f in expected_files:
retain = False
for name in filenames:
if name in f[1]:
retain = True
if not retain:
files_to_remove.append(f)
# AND the appropriate asset view
mock_config.synapse_master_fileview_id = asset_view
# AND the appropriately filtered filenames
if filenames:
files_to_remove = []
for f in expected_files:
retain = False
for name in filenames:
if name in f[1]:
retain = True
if not retain:
files_to_remove.append(f)

for file in files_to_remove:
expected_files.remove(file)
for file in files_to_remove:
expected_files.remove(file)

mock_table_dataFrame = pd.DataFrame(
expected_files, columns=["id", "path"]
)
mock_table = build_table("Mock Table", "syn123", mock_table_dataFrame)
mock_synapse_client.tableQuery.return_value = mock_table

# WHEN getFilesInStorageDataset is called for the given dataset
dataset_files = synapse_store.getFilesInStorageDataset(
datasetId=dataset_id, fileNames=filenames
)
# WHEN getFilesInStorageDataset is called for the given dataset
dataset_files = synapse_store.getFilesInStorageDataset(
datasetId=dataset_id, fileNames=filenames
)

# THEN the expected files are returned
# AND there are no unexpected files
assert dataset_files == expected_files
# AND the (synId, path) order is correct
synapse_id_regex = re_compile(SYN_ID_REGEX)
if dataset_files:
assert synapse_id_regex.fullmatch(dataset_files[0][0])
# THEN the expected files are returned
# AND there are no unexpected files
assert dataset_files == expected_files
# AND the (synId, path) order is correct
synapse_id_regex = re_compile(SYN_ID_REGEX)
if dataset_files:
assert synapse_id_regex.fullmatch(dataset_files[0][0])
72 changes: 33 additions & 39 deletions tests/test_store.py
@@ -11,7 +11,7 @@
import uuid
from contextlib import nullcontext as does_not_raise
from typing import Any, Callable, Generator
from unittest.mock import AsyncMock, patch
from unittest.mock import AsyncMock, MagicMock, patch

import pandas as pd
import pytest
@@ -464,49 +464,45 @@ def test_getDatasetProject(self, dataset_id, synapse_store):
(
True,
[
("syn126", "schematic - main/parent_folder/test_file"),
("syn126", "syn_mock", "schematic - main/parent_folder/test_file"),
(
"syn125",
"syn_mock",
"schematic - main/parent_folder/test_folder/test_file_2",
),
],
),
(False, [("syn126", "test_file"), ("syn125", "test_file_2")]),
(
False,
[
("syn126", "syn_mock", "test_file"),
("syn125", "syn_mock", "test_file_2"),
],
),
],
)
def test_getFilesInStorageDataset(self, synapse_store, full_path, expected):
mock_table_dataFrame_initial = pd.DataFrame(
{
"id": ["syn_mock"],
"path": ["schematic - main/parent_folder"],
}
)

mock_table_dataFrame_return = pd.DataFrame(
mock_table_dataframe_return = pd.DataFrame(
{
"id": ["syn126", "syn125"],
"parentId": ["syn_mock", "syn_mock"],
"path": [
"schematic - main/parent_folder/test_file",
"schematic - main/parent_folder/test_folder/test_file_2",
],
}
)
mock_table_return = build_table(
"Mock Table", "syn123", mock_table_dataFrame_return
)

with patch.object(synapse_store, "syn") as mocked_synapse_client:
with patch.object(
synapse_store, "storageFileviewTable"
) as mocked_fileview_table:
mocked_fileview_table.storageFileviewTable.return_value = (
mock_table_dataFrame_initial
)
mocked_synapse_client.tableQuery.return_value = mock_table_return
file_list = synapse_store.getFilesInStorageDataset(
datasetId="syn_mock", fileNames=None, fullpath=full_path
)
assert file_list == expected
with patch.object(
synapse_store, "storageFileviewTable", mock_table_dataframe_return
), patch.object(synapse_store, "query_fileview") as mocked_query:
# query_fileview is the function called to get the fileview
mocked_query.return_value = mock_table_dataframe_return

file_list = synapse_store.getFilesInStorageDataset(
datasetId="syn_mock", fileNames=None, fullpath=full_path
)
assert file_list == expected

@pytest.mark.parametrize(
"full_path",
@@ -516,27 +512,25 @@ def test_getFilesInStorageDataset(self, synapse_store, full_path, expected):
],
)
def test_get_files_in_storage_dataset_exception(self, synapse_store, full_path):
mock_table_dataFrame_initial = pd.DataFrame(
mock_table_dataframe_return = pd.DataFrame(
{
"id": ["child_syn_mock"],
"path": ["schematic - main/parent_folder/child_entity"],
"parentId": ["wrong_syn_mock"],
}
)
with patch.object(
synapse_store, "storageFileviewTable", mock_table_dataframe_return
), patch.object(synapse_store, "query_fileview") as mocked_query:
# query_fileview is the function called to get the fileview
mocked_query.return_value = mock_table_dataframe_return

with patch.object(synapse_store, "syn") as mocked_synapse_client:
with patch.object(
synapse_store, "storageFileviewTable"
) as mocked_fileview_table:
mocked_fileview_table.storageFileviewTable.return_value = (
mock_table_dataFrame_initial
with pytest.raises(
LookupError, match="Dataset syn_mock could not be found"
):
synapse_store.getFilesInStorageDataset(
datasetId="syn_mock", fileNames=None, fullpath=full_path
)
with pytest.raises(
LookupError, match="Dataset syn_mock could not be found"
):
file_list = synapse_store.getFilesInStorageDataset(
datasetId="syn_mock", fileNames=None, fullpath=full_path
)

@pytest.mark.parametrize("downloadFile", [True, False])
def test_getDatasetManifest(self, synapse_store, downloadFile):