broadinstitute
diff --git a/‎.github/workflows/python-dropseq_metadata-conda.yml
Lines changed: 69 additions & 0 deletions b/‎.github/workflows/python-dropseq_metadata-conda.yml
Lines changed: 69 additions & 0 deletions
diff --git a/‎src/python/dropseq_hdf5/pyproject.toml
Lines changed: 1 addition & 1 deletion b/‎src/python/dropseq_hdf5/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/python/dropseq_metadata/LICENSE
Lines changed: 19 additions & 0 deletions b/‎src/python/dropseq_metadata/LICENSE
Lines changed: 19 additions & 0 deletions
diff --git a/‎src/python/dropseq_metadata/README.md
Lines changed: 13 additions & 0 deletions b/‎src/python/dropseq_metadata/README.md
Lines changed: 13 additions & 0 deletions
diff --git a/‎src/python/dropseq_metadata/environment.yml
Lines changed: 7 additions & 0 deletions b/‎src/python/dropseq_metadata/environment.yml
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/python/dropseq_metadata/pyproject.toml
Lines changed: 27 additions & 0 deletions b/‎src/python/dropseq_metadata/pyproject.toml
Lines changed: 27 additions & 0 deletions
diff --git a/‎src/python/dropseq_metadata/src/dropseq_metadata/__init__.py
Lines changed: 22 additions & 0 deletions b/‎src/python/dropseq_metadata/src/dropseq_metadata/__init__.py
Lines changed: 22 additions & 0 deletions
diff --git a/‎src/python/dropseq_metadata/src/dropseq_metadata/cli.py
Lines changed: 75 additions & 0 deletions b/‎src/python/dropseq_metadata/src/dropseq_metadata/cli.py
Lines changed: 75 additions & 0 deletions
diff --git a/‎src/python/dropseq_metadata/src/dropseq_metadata/join_and_filter_tsv.py
Lines changed: 141 additions & 0 deletions b/‎src/python/dropseq_metadata/src/dropseq_metadata/join_and_filter_tsv.py
Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,69 @@
+name: Python Package using Conda
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+    paths:
+      - 'src/python/dropseq_metadata/**'
+
+
+jobs:
+  build-linux:
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 5
+
+    steps:
+    - uses: actions/checkout@v4
+    # Provides the interpreter used by the later steps. v5 is the maintained major
+    # version of the action; v3 targets a Node runtime that GitHub has retired.
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+    - name: Add conda to system path
+      run: |
+        # $CONDA is an environment variable pointing to the root of the miniconda directory
+        # https://github.com/actions/runner-images/blob/ff9acc6/images/ubuntu/Ubuntu2204-Readme.md#environment-variables
+        echo $CONDA/bin >> $GITHUB_PATH
+    - name: Modify the conda configuration to use conda-forge
+      run: |
+        conda config --add channels conda-forge
+        conda config --remove channels defaults || true
+
+        # There are many issues filed regarding:
+        #   > warning  libmamba Problem type not implemented SOLVER_RULE_STRICT_REPO_PRIORITY
+        # when using the recommended strict channel priority and creating environments from YAML files,
+        # but the FOSS conda/mamba community has not been able to fix the issue.
+        # https://github.com/mamba-org/mamba/issues/2810#issuecomment-1910011988
+        conda config --set channel_priority flexible
+
+        # Disable lock files to avoid bugs in mamba and libmamba
+        # https://mamba.readthedocs.io/en/latest/user_guide/troubleshooting.html#hangs-during-package-installation-on-nfs-network-file-systems
+        # https://github.com/mamba-org/mamba/issues/1993#issuecomment-1268397084
+        echo "use_lockfiles: false" >> ~/.mambarc
+    - name: Install dependencies
+      run: |
+        cd src/python/dropseq_metadata
+        conda env update --file environment.yml --name base
+    - name: Lint with flake8
+      run: |
+        cd src/python/dropseq_metadata
+
+        # Explicitly using the classic solver to avoid:
+        #   a) "libarchive.so.20: cannot open shared object", and
+        #   b) switching from the miniconda installed in the "ubuntu-latest" image
+        #      https://github.com/actions/runner-images/blob/ff9acc6/images/ubuntu/Ubuntu2204-Readme.md#package-management
+        #   c) upgrading to a working version using the below takes minutes while the tests themselves take seconds
+        #      https://stackoverflow.com/questions/77617946/solve-conda-libmamba-solver-libarchive-so-19-error-after-updating-conda-to-23#answer-78293971
+        conda install --solver=classic --override-channels --channel conda-forge flake8
+
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with unittest
+      run: |
+        cd src/python/dropseq_metadata
+        PYTHONPATH=src python -m unittest discover -s tests
@@ -13,7 +13,7 @@ dependencies = [
 authors = [
   { name="Alec Wysoker", email="[email protected]" },
 ]
-description = "A small example package"
+description = "Command-line tools for converting from various HDF5 formats to Drop-seq format."
 readme = "README.md"
 requires-python = ">=3.8"
 classifiers = [
 
@@ -0,0 +1,19 @@
+Copyright 2024 Broad Institute
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,13 @@
+# Simple Python tools for managing tabular cell metadata text files
+
+## Installation
+
+Requires Python >= 3.9 (the pinned pandas 2.2.3 dependency does not support older versions).
+```
+pip install 'git+https://github.com/broadinstitute/Drop-seq.git#egg=dropseq_metadata&subdirectory=src/python/dropseq_metadata'
+```
+
+## Usage
+
+Run `dropseq_metadata -h` for usage information.
+
@@ -0,0 +1,7 @@
+name: dropseq_metadata
+channels:
+  - conda-forge
+  - nodefaults
+dependencies:
+  - pandas
+  - python=3.12
@@ -0,0 +1,27 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "dropseq_metadata"
+version = "3.0.2"
+dependencies = [
+    "pandas==2.2.3"
+]
+authors = [
+  { name="Alec Wysoker", email="[email protected]" },
+]
+# Describes what this package does: joining and filtering tabular cell metadata files.
+description = "Command-line tools for managing tabular cell metadata text files."
+readme = "README.md"
+# pandas 2.2.x supports Python 3.9 and newer, so 3.8 cannot satisfy the pinned dependency.
+requires-python = ">=3.9"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+
+[project.urls]
+Homepage = "https://github.com/broadinstitute/Drop-seq/"
+Issues = "https://github.com/broadinstitute/Drop-seq/issues"
+
+[project.scripts]
+dropseq_metadata = "dropseq_metadata.cli:main"
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+# MIT License
+# 
+# Copyright 2024 Broad Institute
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# MIT License
+# 
+# Copyright 2024 Broad Institute
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import argparse
+import logging
+import sys
+
+try:
+    from . import join_and_filter_tsv
+except ImportError:
+    import join_and_filter_tsv
+
+# I cannot believe I need to do this to cause logger to write to stderr.
+logging.basicConfig(
+    level=logging.INFO,               # Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[logging.StreamHandler()] # StreamHandler writes to sys.stderr by default
+)
+logger = logging.getLogger(__name__)
+
+dctLogLevel = {
+    "DEBUG": logging.DEBUG,
+    "INFO": logging.INFO,
+    "WARNING": logging.WARNING,
+    "ERROR": logging.ERROR,
+    "CRITICAL": logging.CRITICAL
+}
+
+
+def main(args=None):
+    parser = argparse.ArgumentParser(prog="dropseq_metadata", description=__doc__)
+    parser.add_argument("--log-level", "-l", default="INFO", choices=dctLogLevel.keys(),
+                        help="Set the logging level.  (default: %(default)s)")
+    subparsers = parser.add_subparsers(
+        title="sub-commands",
+        description="valid commands",
+        dest="tool")
+    join_and_filter_tsv.add_subparser(subparsers)
+
+    if args is None:
+        args = sys.argv[1:]
+    if len(args) == 0:
+        parser.print_help()
+        return 1
+    else:
+        options = parser.parse_args(args)
+        logger.setLevel(dctLogLevel[options.log_level])
+        if options.tool == "join_and_filter_tsv":
+            return join_and_filter_tsv.main(options)
+        else:
+            # should be unpossible because parse_args will complain
+            raise ValueError(f"Unrecognized tool: {options.tool}")
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# MIT License
+# 
+# Copyright 2024 Broad Institute
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Load a primary tab-separated file, and zero or more secondary tab-separated files, each with join columns
+specified.  Each secondary file is left outer joined to the primary file on the join column.
+
+The values in the join column of a join file must be unique.
+
+If there is a collision between the column names in the primary and secondary files, the column in the primary
+file is used.  If there is a collision between the column names in the secondary files, the column in the earlier
+file is used.
+
+In addition, there may be zero or more (column name, column value) pairs that are set on each line unconditionally.
+
+Each resulting row is passed through zero or more filters, which can apply a min or a max threshold to a column,
+have a list of values to match against, or have a list of values to exclude, which may be specified on the command
+line or in a file.
+"""
+
+import argparse
+import sys
+import pandas as pd
+
+DELETEME_COLUMN_SUFFIX = '_deleteme'
+
+def try_convert_string_to_number(s):
+    int_val = None
+    float_val = None
+    try:
+        int_val = int(s)
+    except ValueError:
+        pass
+    try:
+        float_val = float(s)
+    except ValueError:
+        pass
+    if int_val is None and float_val is None:
+        return s
+    if int_val is None and float_val is not None:
+        return float_val
+    if float_val is None and int_val is not None:
+        return int_val
+    if int_val == float_val:
+        return int_val
+    return float_val
+
+def load_values_file(file):
+    return pd.read_csv(file, sep='\t', header=None).iloc[0]
+
+def add_subparser(subparsers):
+    parser = subparsers.add_parser("join_and_filter_tsv", description=__doc__)
+    parser.add_argument("--input", "-i", type=argparse.FileType('r'),
+                        help="Primary tab-separated file to join.  Default: %(default)s", default=sys.stdin)
+    parser.add_argument("--output", "-o", type=argparse.FileType('w'),
+                        help="Output file.  Default: %(default)s", default=sys.stdout)
+    parser.add_argument("--join", "-j", nargs=3, action='append', default=[],
+                        metavar=('SECONDARY_FILE', 'INPUT_COL', 'JOIN_COL'),
+                        help="Secondary tab-separated file to join, and the columns in the primary input "
+                             "and join file to join on. May be specified multiple times.")
+    parser.add_argument("--set", "-s", nargs=2, action='append', default=[], metavar=('COLUMN', 'VALUE'),
+                        help="Set a column to a constant value.  May be specified multiple times.")
+    parser.add_argument("--min", nargs=2, action='append', default=[], metavar=('COLUMN', 'VALUE'),
+                        help="Filter out rows where COLUMN is less than VALUE.  May be specified multiple times.")
+    parser.add_argument("--max", nargs=2, action='append', default=[], metavar=('COLUMN', 'VALUE'),
+                        help="Filter out rows where COLUMN is greater than VALUE.  May be specified multiple times.")
+    parser.add_argument("--include-file", nargs=2, action='append', default=[], metavar=('COLUMN', 'FILE'),
+                        help="Filter out rows where COLUMN is not in FILE.  May be specified multiple times.")
+    parser.add_argument("--exclude-file", nargs=2, action='append', default=[], metavar=('COLUMN', 'FILE'),
+                        help="Filter out rows where COLUMN is in FILE.  May be specified multiple times.")
+    parser.add_argument("--include", nargs='+', action='append', default=[], metavar=('COLUMN', 'VALUE'),
+                        help="Filter out rows where COLUMN is not one of the given VALUEs.  May be specified multiple times.")
+    parser.add_argument("--exclude", nargs='+', action='append', default=[], metavar=('COLUMN', 'VALUE'),
+                        help="Filter out rows where COLUMN is one of the given VALUEs.  May be specified multiple times.")
+
+def main(options):
+    # load the primary file
+    primary = pd.read_csv(options.input, sep='\t')
+    options.input.close()
+    # load each secondary file, and join it to the primary file, dropping secondary columns that are already in the primary
+    for join_file, input_col, join_col in options.join:
+        join_col_in_left = join_col in primary.columns
+        secondary = pd.read_csv(join_file, sep='\t')
+        primary = primary.merge(secondary, how='left', left_on=input_col, right_on=join_col,
+                                suffixes=(None, DELETEME_COLUMN_SUFFIX))
+        if not join_col_in_left:
+            # drop the join column from the merged data frame
+            primary.drop(join_col, axis=1, inplace=True)
+        # drop the secondary columns that are already in the primary
+        for col in primary.columns:
+            if col.endswith(DELETEME_COLUMN_SUFFIX):
+                primary.drop(col, axis=1, inplace=True)
+    # set columns to constant values
+    for column, value in options.set:
+        primary[column] = try_convert_string_to_number(value)
+    # filter out rows based on column values
+    for column, value in options.min:
+        primary = primary[primary[column] >= try_convert_string_to_number(value)]
+    for column, value in options.max:
+        primary = primary[primary[column] <= try_convert_string_to_number(value)]
+    for column, file in options.include_file:
+        include_values = load_values_file(file)
+        primary = primary[primary[column].isin(include_values)]
+    for column, file in options.exclude_file:
+        exclude_values = load_values_file(file)
+        primary = primary[~primary[column].isin(exclude_values)]
+    for includes in options.include:
+        column = includes[0]
+        values = includes[1:]
+        values = [try_convert_string_to_number(value) for value in values]
+        primary = primary[primary[column].isin(values)]
+    for excludes in options.exclude:
+        column = excludes[0]
+        values = excludes[1:]
+        values = [try_convert_string_to_number(value) for value in values]
+        primary = primary[~primary[column].isin(values)]
+    # write the output
+    primary.to_csv(options.output, sep='\t', index=False)
+    options.output.close()
+    return 0
+
+
Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@ dependencies = [`
`13`	`13`	`authors = [`
`14`	`14`	`{ name="Alec Wysoker", email="[email protected]" },`
`15`	`15`	`]`
`16`		`-description = "A small example package"`
	`16`	`+description = "Command-line tools for converting from various HDF5 formats to Drop-seq format."`
`17`	`17`	`readme = "README.md"`
`18`	`18`	`requires-python = ">=3.8"`
`19`	`19`	`classifiers = [`