Skip to content

Commit a1fda61

Browse files
authored
[GEN-2375] Replace deprecated DataFrame.append with pandas.concat (#620)
* replace append with concat * lint
1 parent da18b5f commit a1fda61

File tree

2 files changed

+166
-13
lines changed

2 files changed

+166
-13
lines changed

genie/dashboard_table_updater.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,26 @@ def get_center_data_completion(center, df):
5151

5252

5353
def update_samples_in_release_table(
54-
syn, file_mapping, release, samples_in_release_synid
55-
):
54+
syn: synapseclient.Synapse,
55+
file_mapping: dict,
56+
release: str,
57+
samples_in_release_synid: str,
58+
) -> None:
5659
"""
5760
Updates the sample in release table
5861
This tracks the samples of each release. 1 means it exists, and 0
59-
means it doesn't
62+
means it doesn't. For releases without a column in the sample
63+
in release table, a new release column will be created for
64+
that release.
65+
66+
New samples will be displayed on top of old samples (pre-existing samples
67+
already in the sample in release table) in the release column.
6068
6169
Args:
62-
syn: synapse object
63-
file_mapping: file mapping generated from file mapping function
64-
release: GENIE release number (ie. 5.3-consortium)
65-
samples_in_release_synid: Synapse Id of 'samples in release' Table
70+
syn (synapseclient.Synapse): synapse object
71+
file_mapping (dict): file mapping generated from file mapping function
72+
release (str): GENIE release number (ie. 5.3-consortium)
73+
samples_in_release_synid (str): Synapse Id of 'samples in release' Table
6674
"""
6775
clinical_ent = syn.get(file_mapping["sample"], followLink=True)
6876
clinicaldf = pd.read_csv(clinical_ent.path, sep="\t", comment="#")
@@ -92,7 +100,7 @@ def update_samples_in_release_table(
92100
]
93101

94102
old_samples[release] = 1
95-
samples_in_releasedf = new_samples.append(old_samples)
103+
samples_in_releasedf = pd.concat([new_samples, old_samples])
96104
load._update_table(
97105
syn,
98106
samples_per_releasedf,
@@ -796,15 +804,22 @@ def print_clinical_values_difference_table(syn, database_mappingdf):
796804
)
797805

798806

799-
def run_dashboard(syn, database_mappingdf, release, staging=False, public=False):
807+
def run_dashboard(
808+
syn: synapseclient.Synapse,
809+
database_mappingdf: pd.DataFrame,
810+
release: str,
811+
staging: bool = False,
812+
public: bool = False,
813+
) -> None:
800814
"""
801815
Runs the dashboard scripts
802816
803817
Args:
804-
syn: synapse object
805-
database_mappingdf: mapping between synapse ids and database
806-
release: GENIE release (ie. 5.3-consortium)
807-
818+
syn (synapseclient.Synapse): synapse object
819+
database_mappingdf (pd.DataFrame): mapping between synapse ids and database
820+
release (str): GENIE release (ie. 5.3-consortium)
821+
public (bool): whether to run for public release or not
822+
staging (bool): whether to run in staging mode or not
808823
"""
809824
update_release_numbers(syn, database_mappingdf, release=release)
810825

tests/test_dashboard_table_updater.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,33 @@
1+
import pandas as pd
2+
from pandas.testing import assert_frame_equal
13
import pytest
4+
from unittest import mock
25

36
from genie import dashboard_table_updater as dash_update
47

58

9+
@pytest.fixture
10+
def mock_syn(tmp_path):
11+
"""Fixture for a mock synapse client and test file."""
12+
syn = mock.MagicMock()
13+
14+
# create a fake clinical file
15+
clinical_path = tmp_path / "clinical.txt"
16+
clinical_path.write_text("SAMPLE_ID\nS1\nS2\nS3\n")
17+
18+
# entity returned by syn.get for the clinical file
19+
clinical_entity = mock.MagicMock()
20+
clinical_entity.path = str(clinical_path)
21+
22+
# syn.get() will return this entity
23+
syn.get.return_value = clinical_entity
24+
25+
# Default columns of existing table
26+
syn.getTableColumns.return_value = [{"name": "SAMPLE_ID"}]
27+
28+
return syn
29+
30+
631
@pytest.mark.parametrize(
732
"input_string_time, expected_output_time",
833
[
@@ -17,3 +42,116 @@ def test_that_string_to_unix_epoch_time_milliseconds_gives_expected_time(
1742
):
1843
output = dash_update.string_to_unix_epoch_time_milliseconds(input_string_time)
1944
assert output == expected_output_time
45+
46+
47+
def test_that_update_samples_in_release_table_adds_column_and_calls_update(
48+
tmp_path, mock_syn
49+
):
50+
"""Test that a new release column is added and load._update_table gets correct data."""
51+
file_mapping = {"sample": "syn123"}
52+
release = "5.3-consortium"
53+
samples_in_release_synid = "syn999"
54+
55+
clinical_df = pd.DataFrame({"SAMPLE_ID": ["S1", "S2", "S3"]})
56+
existing_df = pd.DataFrame({"SAMPLE_ID": ["S1"], release: [1]})
57+
58+
with (
59+
mock.patch.object(pd, "read_csv", return_value=clinical_df) as mock_read,
60+
mock.patch.object(
61+
dash_update.extract, "get_syntabledf", return_value=existing_df
62+
) as mock_extract,
63+
mock.patch.object(dash_update.load, "_update_table") as mock_update,
64+
):
65+
dash_update.update_samples_in_release_table(
66+
syn=mock_syn,
67+
file_mapping=file_mapping,
68+
release=release,
69+
samples_in_release_synid=samples_in_release_synid,
70+
)
71+
72+
# assertions on Synapse calls
73+
mock_syn.get.assert_has_calls(
74+
[
75+
mock.call(file_mapping["sample"], followLink=True),
76+
mock.call(samples_in_release_synid),
77+
]
78+
)
79+
mock_syn.getTableColumns.assert_called_once_with(samples_in_release_synid)
80+
81+
# assertions on pd.read_csv
82+
mock_read.assert_called_once()
83+
assert list(mock_read.return_value.columns) == ["SAMPLE_ID"]
84+
85+
# assertions on extract.get_syntabledf
86+
mock_extract.assert_called_once_with(
87+
syn=mock_syn,
88+
query_string=f'SELECT SAMPLE_ID, "{release}" FROM {samples_in_release_synid}',
89+
)
90+
91+
# assertions on load._update_table inputs
92+
mock_update.assert_called_once()
93+
args, kwargs = mock_update.call_args
94+
95+
# extract arguments
96+
_, _, samples_in_releasedf, synid_arg, key_cols = args
97+
98+
# Ensure correct Synapse ID and key columns
99+
assert synid_arg == samples_in_release_synid
100+
assert key_cols == ["SAMPLE_ID"]
101+
102+
# check that new samples and old samples are in expected order
103+
assert_frame_equal(
104+
samples_in_releasedf.reset_index(drop=True),
105+
pd.DataFrame(
106+
{"SAMPLE_ID": ["S2", "S3", "S1"], "5.3-consortium": [1, 1, 1]}
107+
),
108+
)
109+
110+
111+
def test_that_update_samples_in_release_table_existing_column_calls_update_directly(
112+
mock_syn,
113+
):
114+
"""If the release column already exists, we skip creating new column but still call _update_table."""
115+
release = "5.3-consortium"
116+
file_mapping = {"sample": "syn123"}
117+
samples_in_release_synid = "syn999"
118+
119+
# pre-xisting release column
120+
mock_syn.getTableColumns.return_value = [{"name": "SAMPLE_ID"}, {"name": release}]
121+
122+
clinical_df = pd.DataFrame({"SAMPLE_ID": ["S1", "S2"]})
123+
existing_df = pd.DataFrame({"SAMPLE_ID": ["S2", "S1"], release: [1, 1]})
124+
125+
with (
126+
mock.patch.object(pd, "read_csv", return_value=clinical_df) as mock_read,
127+
mock.patch.object(
128+
dash_update.extract, "get_syntabledf", return_value=existing_df
129+
) as mock_extract,
130+
mock.patch.object(dash_update.load, "_update_table") as mock_update,
131+
):
132+
dash_update.update_samples_in_release_table(
133+
syn=mock_syn,
134+
file_mapping=file_mapping,
135+
release=release,
136+
samples_in_release_synid=samples_in_release_synid,
137+
)
138+
# assert that samples_in_release schema is not retrieved by syn.get
139+
mock_syn.get.assert_has_calls(
140+
[
141+
mock.call(file_mapping["sample"], followLink=True),
142+
]
143+
)
144+
args, kwargs = mock_update.call_args
145+
146+
# extract arguments
147+
_, _, samples_in_releasedf, synid_arg, key_cols = args
148+
149+
# ensure correct Synapse ID and key columns
150+
assert synid_arg == samples_in_release_synid
151+
assert key_cols == ["SAMPLE_ID"]
152+
153+
# check that new samples and old samples are in expected order
154+
assert_frame_equal(
155+
samples_in_releasedf.reset_index(drop=True),
156+
pd.DataFrame({"SAMPLE_ID": ["S1", "S2"], "5.3-consortium": [1, 1]}),
157+
)

0 commit comments

Comments
 (0)