Skip to content

Commit 6a79f20

Browse files
authored
Script to join and filter metadata TSVs (#470)
* Script to join and filter metadata TSVs * Fix location of testdata directory
1 parent 0a6096a commit 6a79f20

16 files changed

+878
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
name: Python Package using Conda

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]
    # NOTE(review): indentation was lost in extraction; this paths filter is
    # placed under pull_request — confirm whether push should carry it too.
    paths:
      - 'src/python/dropseq_metadata/**'

jobs:
  build-linux:
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 5

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: '3.12'
      - name: Add conda to system path
        run: |
          # $CONDA is an environment variable pointing to the root of the miniconda directory
          # https://github.com/actions/runner-images/blob/ff9acc6/images/ubuntu/Ubuntu2204-Readme.md#environment-variables
          echo $CONDA/bin >> $GITHUB_PATH
      - name: Modify the conda configuration to use conda-forge
        run: |
          conda config --add channels conda-forge
          conda config --remove channels defaults || true

          # There are many issues filed regarding:
          # > warning libmamba Problem type not implemented SOLVER_RULE_STRICT_REPO_PRIORITY
          # when using the recommended strict channel priority and creating environments from YAML files,
          # but the FOSS conda/mamba community has not been able to fix the issue.
          # https://github.com/mamba-org/mamba/issues/2810#issuecomment-1910011988
          conda config --set channel_priority flexible

          # Disable lock files to avoid bugs in mamba and libmamba
          # https://mamba.readthedocs.io/en/latest/user_guide/troubleshooting.html#hangs-during-package-installation-on-nfs-network-file-systems
          # https://github.com/mamba-org/mamba/issues/1993#issuecomment-1268397084
          echo "use_lockfiles: false" >> ~/.mambarc
      - name: Install dependencies
        run: |
          cd src/python/dropseq_metadata
          conda env update --file environment.yml --name base
      - name: Lint with flake8
        run: |
          cd src/python/dropseq_metadata

          # Explicitly using the classic solver to avoid:
          # a) "libarchive.so.20: cannot open shared object", and
          # b) switching from the miniconda installed in the "ubuntu-latest" image
          # https://github.com/actions/runner-images/blob/ff9acc6/images/ubuntu/Ubuntu2204-Readme.md#package-management
          # c) upgrading to a working version using the below takes minutes while the tests themselves take seconds
          # https://stackoverflow.com/questions/77617946/solve-conda-libmamba-solver-libarchive-so-19-error-after-updating-conda-to-23#answer-78293971
          conda install --solver=classic --override-channels --channel conda-forge flake8

          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics

          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with unittest
        run: |
          cd src/python/dropseq_metadata
          PYTHONPATH=src python -m unittest discover -s tests

src/python/dropseq_hdf5/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ dependencies = [
1313
authors = [
1414
{ name="Alec Wysoker", email="[email protected]" },
1515
]
16-
description = "A small example package"
16+
description = "Command-line tools for converting from various HDF5 formats to Drop-seq format."
1717
readme = "README.md"
1818
requires-python = ">=3.8"
1919
classifiers = [

src/python/dropseq_metadata/LICENSE

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
Copyright 2024 Broad Institute
2+
3+
Permission is hereby granted, free of charge, to any person obtaining a copy
4+
of this software and associated documentation files (the "Software"), to deal
5+
in the Software without restriction, including without limitation the rights
6+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7+
copies of the Software, and to permit persons to whom the Software is
8+
furnished to do so, subject to the following conditions:
9+
10+
The above copyright notice and this permission notice shall be included in all
11+
copies or substantial portions of the Software.
12+
13+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19+
SOFTWARE.

src/python/dropseq_metadata/README.md

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Simple Python tools for managing tabular cell metadata text files.
2+
3+
## Installation
4+
5+
Requires python >= 3.8
6+
```
7+
pip install 'git+https://github.com/broadinstitute/Drop-seq.git#egg=dropseq_metadata&subdirectory=src/python/dropseq_metadata'
8+
```
9+
10+
## Usage
11+
12+
Run `dropseq_metadata -h` for usage information.
13+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Conda environment for the dropseq_metadata package.
name: dropseq_metadata
channels:
  - conda-forge
  # "nodefaults" excludes the Anaconda "defaults" channel.
  - nodefaults
dependencies:
  - pandas
  - python=3.12
+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "dropseq_metadata"
version = "3.0.2"
dependencies = [
    "pandas==2.2.3"
]
authors = [
    { name="Alec Wysoker", email="[email protected]" },
]
# Fixed: the previous description was copy-pasted from the dropseq_hdf5
# package; this package manages tabular cell metadata TSVs (see README.md).
description = "Command-line tools for joining and filtering tabular cell metadata TSV files."
readme = "README.md"
requires-python = ">=3.8"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]

[project.urls]
Homepage = "https://github.com/broadinstitute/Drop-seq/"
Issues = "https://github.com/broadinstitute/Drop-seq/issues"

[project.scripts]
# Console entry point: dispatches to dropseq_metadata/cli.py:main().
dropseq_metadata = "dropseq_metadata.cli:main"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env python3
2+
# MIT License
3+
#
4+
# Copyright 2024 Broad Institute
5+
#
6+
# Permission is hereby granted, free of charge, to any person obtaining a copy
7+
# of this software and associated documentation files (the "Software"), to deal
8+
# in the Software without restriction, including without limitation the rights
9+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
# copies of the Software, and to permit persons to whom the Software is
11+
# furnished to do so, subject to the following conditions:
12+
#
13+
# The above copyright notice and this permission notice shall be included in all
14+
# copies or substantial portions of the Software.
15+
#
16+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
# SOFTWARE.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python3
2+
# MIT License
3+
#
4+
# Copyright 2024 Broad Institute
5+
#
6+
# Permission is hereby granted, free of charge, to any person obtaining a copy
7+
# of this software and associated documentation files (the "Software"), to deal
8+
# in the Software without restriction, including without limitation the rights
9+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
# copies of the Software, and to permit persons to whom the Software is
11+
# furnished to do so, subject to the following conditions:
12+
#
13+
# The above copyright notice and this permission notice shall be included in all
14+
# copies or substantial portions of the Software.
15+
#
16+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
# SOFTWARE.
23+
import argparse
24+
import logging
25+
import sys
26+
27+
try:
28+
from . import join_and_filter_tsv
29+
except ImportError:
30+
import join_and_filter_tsv
31+
32+
# Configure the root logger explicitly so log output goes to stderr
# (StreamHandler defaults to sys.stderr) instead of being swallowed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

# Mapping from command-line log-level names to logging module constants.
dctLogLevel = {
    name: getattr(logging, name)
    for name in ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL")
}
def main(args=None):
    """Entry point for the dropseq_metadata command line.

    Builds the argument parser, dispatches to the selected sub-command,
    and returns its exit status. Called with no arguments (or an empty
    argument list), prints help and returns 1.
    """
    parser = argparse.ArgumentParser(prog="dropseq_metadata", description=__doc__)
    parser.add_argument("--log-level", "-l", default="INFO", choices=dctLogLevel.keys(),
                        help="Set the logging level. (default: %(default)s)")
    subparsers = parser.add_subparsers(
        title="sub-commands",
        description="valid commands",
        dest="tool")
    join_and_filter_tsv.add_subparser(subparsers)

    if args is None:
        args = sys.argv[1:]
    if not args:
        parser.print_help()
        return 1
    options = parser.parse_args(args)
    logger.setLevel(dctLogLevel[options.log_level])
    if options.tool != "join_and_filter_tsv":
        # Should be impossible: parse_args rejects unknown sub-commands.
        raise ValueError(f"Unrecognized tool: {options.tool}")
    return join_and_filter_tsv.main(options)


if __name__ == "__main__":
    sys.exit(main())
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/usr/bin/env python3
2+
# MIT License
3+
#
4+
# Copyright 2024 Broad Institute
5+
#
6+
# Permission is hereby granted, free of charge, to any person obtaining a copy
7+
# of this software and associated documentation files (the "Software"), to deal
8+
# in the Software without restriction, including without limitation the rights
9+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
# copies of the Software, and to permit persons to whom the Software is
11+
# furnished to do so, subject to the following conditions:
12+
#
13+
# The above copyright notice and this permission notice shall be included in all
14+
# copies or substantial portions of the Software.
15+
#
16+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
# SOFTWARE.
23+
"""
24+
Load a primary tab-separated file, and zero or more secondary tab-separated files, each with join columns
25+
specified. Each secondary file is left outer joined to the primary file on the join column.
26+
27+
The values in the join column of a join file must be unique.
28+
29+
If there is a collision between the column names in the primary and secondary files, the column in the primary
30+
file is used. If there is a collision between the column names in the secondary files, the column in the earlier
31+
file is used.
32+
33+
In addition, there may be zero or more (column name, column value) pairs that are set on each line unconditionally.
34+
35+
Each resulting row is passed through zero or more filters, which can apply a min or a max threshold to a column,
36+
have a list of values to match against, or have a list of values to exclude, which may be specified on the command
37+
line or in a file.
38+
"""
39+
40+
import argparse
41+
import sys
42+
import pandas as pd
43+
44+
# Suffix pandas.merge appends to right-hand columns that collide with
# primary-file columns; such marked columns are dropped after each join.
DELETEME_COLUMN_SUFFIX = '_deleteme'


def try_convert_string_to_number(s):
    """Convert *s* to int or float when possible; otherwise return it unchanged.

    Prefers int when the value parses equally well as both (e.g. "3");
    returns float when only the float parse succeeds or the two parses
    differ (e.g. "3.5", "1e3").
    """
    try:
        as_int = int(s)
    except ValueError:
        as_int = None
    try:
        as_float = float(s)
    except ValueError:
        as_float = None
    if as_int is None:
        # Neither parse worked -> leave the string alone.
        return s if as_float is None else as_float
    if as_float is None or as_int == as_float:
        return as_int
    return as_float
66+
67+
def load_values_file(file):
    """Parse a values file and return its values as a pandas Series.

    NOTE(review): ``.iloc[0]`` selects the first *row* of the parsed frame,
    so this assumes all values sit on a single tab-separated line; a
    one-value-per-line file would yield only its first value — confirm
    against the expected file format.
    """
    frame = pd.read_csv(file, sep='\t', header=None)
    return frame.iloc[0]
69+
70+
def add_subparser(subparsers):
    """Register the ``join_and_filter_tsv`` sub-command on *subparsers*.

    Defines the I/O options plus the repeatable join, set-column,
    threshold, and membership-filter options.
    """
    sub = subparsers.add_parser("join_and_filter_tsv", description=__doc__)
    sub.add_argument("--input", "-i", default=sys.stdin, type=argparse.FileType('r'),
                     help="Primary tab-separated file to join. Default: %(default)s")
    sub.add_argument("--output", "-o", default=sys.stdout, type=argparse.FileType('w'),
                     help="Output file. Default: %(default)s")
    sub.add_argument("--join", "-j", nargs=3, action='append', default=[],
                     metavar=('SECONDARY_FILE', 'INPUT_COL', 'JOIN_COL'),
                     help="Secondary tab-separated file to join, and the columns in the primary input "
                          "and join file to join on. May be specified multiple times.")
    sub.add_argument("--set", "-s", nargs=2, action='append', default=[], metavar=('COLUMN', 'VALUE'),
                     help="Set a column to a constant value. May be specified multiple times.")
    sub.add_argument("--min", nargs=2, action='append', default=[], metavar=('COLUMN', 'VALUE'),
                     help="Filter out rows where COLUMN is less than VALUE. May be specified multiple times.")
    sub.add_argument("--max", nargs=2, action='append', default=[], metavar=('COLUMN', 'VALUE'),
                     help="Filter out rows where COLUMN is greater than VALUE. May be specified multiple times.")
    sub.add_argument("--include-file", nargs=2, action='append', default=[], metavar=('COLUMN', 'FILE'),
                     help="Filter out rows where COLUMN is not in FILE. May be specified multiple times.")
    sub.add_argument("--exclude-file", nargs=2, action='append', default=[], metavar=('COLUMN', 'FILE'),
                     help="Filter out rows where COLUMN is in FILE. May be specified multiple times.")
    sub.add_argument("--include", nargs='+', action='append', default=[], metavar=('COLUMN', 'VALUE'),
                     help="Filter out rows where COLUMN is not one of the given VALUEs. May be specified multiple times.")
    sub.add_argument("--exclude", nargs='+', action='append', default=[], metavar=('COLUMN', 'VALUE'),
                     help="Filter out rows where COLUMN is one of the given VALUEs. May be specified multiple times.")
94+
95+
def main(options):
    """Run the join_and_filter_tsv tool with parsed *options*; return 0.

    Reads the primary TSV, left-outer-joins each secondary file, applies
    constant-column settings and all filters in order, then writes the
    result as TSV to the output.
    """
    table = pd.read_csv(options.input, sep='\t')
    options.input.close()

    # pandas.merge marks colliding right-hand columns with this suffix so the
    # primary file's columns win; marked columns are dropped after each join.
    deleteme_suffix = '_deleteme'

    # Left-outer join each secondary file onto the primary table.
    for secondary_file, input_col, join_col in options.join:
        had_join_col = join_col in table.columns
        secondary = pd.read_csv(secondary_file, sep='\t')
        table = table.merge(secondary, how='left', left_on=input_col, right_on=join_col,
                            suffixes=(None, deleteme_suffix))
        if not had_join_col:
            # The join column came only from the secondary file; drop it.
            table = table.drop(columns=[join_col])
        marked = [c for c in table.columns if c.endswith(deleteme_suffix)]
        if marked:
            table = table.drop(columns=marked)

    # Unconditionally set constant-valued columns.
    for column, value in options.set:
        table[column] = try_convert_string_to_number(value)

    # Threshold filters.
    for column, value in options.min:
        table = table[table[column] >= try_convert_string_to_number(value)]
    for column, value in options.max:
        table = table[table[column] <= try_convert_string_to_number(value)]

    # File-based membership filters.
    for column, path in options.include_file:
        table = table[table[column].isin(load_values_file(path))]
    for column, path in options.exclude_file:
        table = table[~table[column].isin(load_values_file(path))]

    # Command-line membership filters: first element is the column name.
    for column, *raw_values in options.include:
        values = [try_convert_string_to_number(v) for v in raw_values]
        table = table[table[column].isin(values)]
    for column, *raw_values in options.exclude:
        values = [try_convert_string_to_number(v) for v in raw_values]
        table = table[~table[column].isin(values)]

    table.to_csv(options.output, sep='\t', index=False)
    options.output.close()
    return 0
140+
141+

0 commit comments

Comments
 (0)