Skip to content

Commit 1a852c0

Browse files
authored
Require unique values for join column in secondary file to be joined (#471)
1 parent 6a79f20 commit 1a852c0

File tree

3 files changed

+22
-2
lines changed

3 files changed

+22
-2
lines changed

src/python/dropseq_metadata/src/dropseq_metadata/join_and_filter_tsv.py

+12 -2
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,11 @@
4040
import argparse
4141
import sys
4242
import pandas as pd
43+
from pandas.errors import MergeError
44+
try:
45+
from . import cli
46+
except ImportError:
47+
import cli
4348

4449
DELETEME_COLUMN_SUFFIX = '_deleteme'
4550

@@ -100,8 +105,13 @@ def main(options):
100105
for join_file, input_col, join_col in options.join:
101106
join_col_in_left = join_col in primary.columns
102107
secondary = pd.read_csv(join_file, sep='\t')
103-
primary = primary.merge(secondary, how='left', left_on=input_col, right_on=join_col,
104-
suffixes=(None, DELETEME_COLUMN_SUFFIX))
108+
try:
109+
# many_to_one: require that the join column in the secondary file is unique
110+
primary = primary.merge(secondary, how='left', left_on=input_col, right_on=join_col, validate="many_to_one",
111+
suffixes=(None, DELETEME_COLUMN_SUFFIX))
112+
except MergeError as e:
113+
cli.logger.error(f"Error joining {join_file} on {input_col} and {join_col}: {e}")
114+
return 1
105115
if not join_col_in_left:
106116
# drop the join column from the merged data frame
107117
primary.drop(join_col, axis=1, inplace=True)

src/python/dropseq_metadata/tests/test_join_and_filter_tsv.py

+7
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,13 @@ def test_include_exclude(self):
157157
self.assertTrue((outputDf["DONOR"].isin(donorsToInclude)).all())
158158
self.assertFalse((outputDf["predClass"].isin(predClassesToExclude)).any())
159159

160+
def test_negative_non_unique_join(self):
161+
primary = os.path.join(self.testDataDir, "sample1.100.cell_metadata.txt")
162+
secondary = os.path.join(self.testDataDir, "sample1.nonunique.scPred.txt")
163+
options = self.options._replace(input=open(primary),
164+
join=[(secondary, "CELL_BARCODE", "CELL_BARCODE"),
165+
(secondary, "CELL_BARCODE", "CELL_BARCODE")])
166+
self.assertEqual(dropseq_metadata.join_and_filter_tsv.main(options), 1)
160167

161168
def assertSharedColumnsEqual(self, wideFile, narrowFile, wideRows = None, narrowRows = None, dropColumns = None):
162169
wideDf = pd.read_csv(wideFile, sep='\t', index_col=False)
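(new test data file, apparently sample1.nonunique.scPred.txt; its directory is not shown in this capture)

Original file line number | Diff line number | Diff line change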
@@ -0,0 +1,3 @@
1+
CELL_BARCODE predClass max.prob predClass_allen_gluta_subclass max.prob_allen_gluta_subclass predClass_allen_gluta_subtype max.prob_allen_gluta_subtype predClass_allen_gaba_subclass max.prob_allen_gaba_subclass predClass_allen_gaba_subtype max.prob_allen_gaba_subtype doublet
2+
TGACTTTGTACGCTGC glutamatergic 0.7 L23IT 1 Exc L2-3 LINC00507 FREM3 0.78 NA NA NA NA FALSE
3+
TGACTTTGTACGCTGC glutamatergic 1 L23IT 1 Exc L2-3 LINC00507 FREM3 0.667 NA NA NA NA FALSE

0 commit comments

Comments
 (0)