clemente-lab · adamcantor22 · Jul 24, 2024 · Jul 24, 2024 · Jul 25, 2024 · Jul 25, 2024
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -20,7 +20,7 @@ jobs:
         run: echo "PYTHONPATH=." >> $GITHUB_ENV
 
       - name: Install packages
-        run: pip install -U pip; pip install cryptography; pip install jupyter_client==6.1.12; pip install ipython_genutils==0.2.0; pip install nbconvert==5.6.1; pip install rpy2; pip install ipykernel; pip install pandas==1.2.3; pip install pillow; pip install -U Jinja2==3.0; pip install coverage;
+        run: python -m pip install pip==24.0; pip install cryptography; pip install jupyter_client==6.1.12; pip install ipython_genutils==0.2.0; pip install nbconvert==5.6.1; pip install rpy2; pip install ipykernel; pip install pandas==1.2.3; pip install pillow; pip install -U Jinja2==3.0; pip install coverage;
 
       - name: install pandoc
         run: sudo apt-get install pandoc;
@@ -89,7 +89,7 @@ jobs:
         with:
           python-version: 3.9
       - name: Install packages
-        run: pip install -U pip; sudo apt-get install tidy environment-modules -y; pip install cryptography; pip install coverage;
+        run: python -m pip install pip==24.0; sudo apt-get install tidy environment-modules -y; pip install cryptography; pip install coverage;
 
       - name: Set PYTHONPATH
         run: echo "PYTHONPATH=." >> $GITHUB_ENV

diff --git a/mmeds/config.py b/mmeds/config.py
@@ -17,8 +17,8 @@
 # Check where this code is being run
 TESTING = not ('chimera' in getfqdn().split('.'))
 
-# If not running on web01, can't connect to databases
-IS_PRODUCTION = 'web01' in getfqdn().split('.')
+# If not running on web03, can't connect to databases
+IS_PRODUCTION = 'web03' in getfqdn().split('.')
 
 # While this is false, users cannot be added, cannot upload, and cannot query from webpage
 LIVE_PROD_ACCESS = True
@@ -49,7 +49,7 @@
     IMAGE_PATH = str(CSS_DIR) + '/'
 
 else:
-    # We're on web01 and using MMEDs out of if it's project diredctory
+    # We're on web03 and using MMEDS out of its project directory
     # OR, we're in the folder /sc/arion/projects/MMEDS
     DATABASE_DIR = Path('/sc/arion/projects/MMEDS/mmeds_server_data')
 
@@ -277,14 +277,27 @@
             'taxonomic_database',
             'sequencing_runs',
             'taxa_levels'
-        ]
+        ],
+        "optional_parameters": []
     },
     "lefse": {
         "parameters": [
             "tables",
-            "classes",
-            "subclasses"
+            "classes"
+        ],
+        "optional_parameters": [
+            "subclasses",
+            "clean_strings",
+            "plot_max_rows",
+            "include_string",
+            "exclude_string"
         ]
+    },
+    "picrust2": {
+        "parameters": [
+            "tables"  # this is going to always be 'asv_table.qza' and 'rep_seqs_table.qza'  TODO: default parameters?
+        ],
+        "optional_parameters": []
     }
 }
 
@@ -429,6 +442,12 @@
 TEST_CODE_MIXED = 'mixedstudy'
 TEST_CODE_OTU = 'otutable'
 TEST_CODE_LEFSE = 'lefsetable'
+TEST_FORMAT_HUMANN_MAPPING = str(TEST_PATH / 'test_qiime_mapping_file_format_to_humann.tsv')
+TEST_FORMAT_HUMANN_TABLE = str(TEST_PATH / 'test_BRITE_pathways_stratified_format_to_humann.tsv')
+TEST_FORMAT_HUMANN_RESULT = str(TEST_PATH / 'test_formatted_humann_table.tsv')
+TEST_FORMAT_LEFSE_MAPPING = str(TEST_PATH / 'test_qiime_mapping_file_format_to_lefse.tsv')
+TEST_FORMAT_LEFSE_TABLE = str(TEST_PATH / 'test_taxa_table_L7_format_to_lefse.tsv')
+TEST_FORMAT_LEFSE_RESULT = str(TEST_PATH / 'test_formatted_lefse_table.tsv')
 TEST_MIXS = str(TEST_PATH / 'test_MIxS.tsv')
 TEST_MIXS_MMEDS = str(TEST_PATH / 'MIxS_metadata.tsv')
 TEST_OTU = str(TEST_PATH / 'test_otu_table.txt')

diff --git a/mmeds/database/database.py b/mmeds/database/database.py
@@ -1032,10 +1032,8 @@ def get_sequencing_run_locations(self, metadata, user, column=("RawDataProtocol"
         df = pd.read_csv(metadata, sep='\t', header=[0, 1], skiprows=[2, 3, 4])
 
         # Store run names from metadata
-        runs = []
-        for run in df[column]:
-            if run not in runs:
-                runs.append(run)
+        runs = list(df[column].unique())
+
         # Get paths, these should exist due to already checking during validation
         run_paths = {}
         for run in runs:

diff --git a/mmeds/server.py b/mmeds/server.py
@@ -717,6 +717,7 @@ def upload_specimen_metadata(self, uploadType, studyName):
             cp.session['metadata_type'] = 'specimen'
             cp.session['study_name'] = studyName
             cp.session['upload_type'] = uploadType
+            cp.log(cp.session["upload_type"])
 
             with Database(path='.', testing=self.testing, owner=self.get_user()) as db:
                 db.check_study_name(studyName)

diff --git a/mmeds/snakemake/rules/common.smk b/mmeds/snakemake/rules/common.smk
@@ -2,85 +2,159 @@ import pandas as pd
 from copy import deepcopy
 from pathlib import Path
 from mmeds.config import TOOLS_DIR
+from subprocess import run
 
 """
-This common.smk file, following snakemake conventions, contains all the python logic necessary for generating the snakemake rule DAG
+This common.smk file, following snakemake conventions, contains all the python logic necessary
+    for generating the snakemake rule DAG
 """
 
-metadata = pd.read_csv("tables/qiime_mapping_file.tsv", sep='\t', header=[0], skiprows=[1])
+metadata = pd.read_csv("tables/qiime_mapping_file.tsv", sep='\t', header=[0], skiprows=[1], dtype='str')
 
-def lefse_splits(wildcards):
-    """ Calculates all the pairwise splits that should be compared by LEfSe. Will not include groups with an insufficient number of comparisons """
-    splits = []
-    for lefse_class in config["classes"]:
-        # 'classes' in this case refer to metadata columns, whereas categories refer to the possible values of those columns
-        categories = list(metadata[lefse_class].unique())
 
-        # Discard samples with a 'nan' for the selected class. This will only work while the input has been run through MMEDS already
-        categories = [c for c in categories if str(c) != "nan"]
-        value_counts = metadata[lefse_class].value_counts()
+def pairwise_splits(wildcards, tool, vars):
+    """
+    When running differential analysis on any number of variables and tables, create all the possible pairwise splits
+        per-table and per-variable that have sufficient data to form a comparison
+    """
+    if "tables" in config:
+        tables = config["tables"]
+    else:
+        tables = [f"taxa_table_L{x}" for x in config["taxa_levels"]]
 
-        subclasses = []
-        if "subclasses" in config and config["subclasses"]:
-            subclasses = deepcopy(config["subclasses"])
+    subclasses = False
+    if tool == "lefse" and "subclasses" in config and config["subclasses"]:
+        subclasses = deepcopy(config["subclasses"])
 
-        if len(categories) < 2:
-            # Only one value in the class, nothing to compare
-            continue
+    splits = []
+    for table in tables:
+        if not Path(f"tables/{table}.tsv").exists():
+            extract_feature_table_subprocess(table)
+        table_df = pd.read_csv(f"tables/{table}.tsv", sep='\t', header=[0], index_col=0)
+        filtered_metadata = metadata.loc[metadata["#SampleID"].isin(table_df.columns)]
+        for var in vars:
+            categories = list(filtered_metadata[var].unique())
+            categories = [c for c in categories if str(c) != "nan"]
+            value_counts = filtered_metadata[var].value_counts()
 
-        if len(categories) < 3:
-            # Exactly two values in the class, no pairwise checks needed
-            if not sufficient_values(value_counts, categories[0], categories[1]):
+            if len(categories) < 2:  # Only one value in the class, nothing to compare
                 continue
-            splits += expand("results/{lefse_class}/lefse_plot.{feature_table}.{lefse_class}.NA.pdf",
-                             feature_table=config["tables"], lefse_class=lefse_class)
-            if subclasses:
-                splits += expand("results/{lefse_class}/lefse_plot.{feature_table}.{lefse_class}.{subclass}.pdf",
-                                feature_table=config["tables"], lefse_class=lefse_class, subclass=subclasses)
-            continue
-
-
-        splits += expand("results/{lefse_class}/lefse_plot_strict.{feature_table}.{lefse_class}.{subclass}.pdf",
-                         feature_table=config["tables"], lefse_class=lefse_class, subclass=subclasses)
-        for i in range(len(categories)-1):
-            for j in range(i+1, len(categories)):
-                # Perform pairwise checks
-                if not sufficient_values(value_counts, categories[i], categories[j]):
+
+            if len(categories) < 3:  # Exactly two values in the class, no pairwise checks needed
+                if not sufficient_values(value_counts, categories[0], categories[1]):
                     continue
-                splits += expand("results/{lefse_class}/lefse_plot.{feature_table}_{lefse_class}_{cat1}_or_{cat2}.{lefse_class}.NA.pdf",
-                                 feature_table=config["tables"], lefse_class=lefse_class, cat1=categories[i], cat2=categories[j])
-                if subclasses:
-                    splits += expand("results/{lefse_class}/lefse_plot.{feature_table}_{lefse_class}_{cat1}_or_{cat2}.{lefse_class}.{subclass}.pdf",
-                                     feature_table=config["tables"], lefse_class=lefse_class, cat1=categories[i], cat2=categories[j], subclass=subclasses)
+                if tool == "lefse":
+                    splits += expand("results/{var}/lefse_plot.{feature_table}.{var}.NA.pdf",
+                                     feature_table=table, var=var)
+                    if subclasses:
+                        splits += expand("results/{var}/lefse_plot.{feature_table}.{var}.{subclass}.pdf",
+                                         feature_table=table, var=var, subclass=subclasses)
+                elif tool == "ancombc":
+                    splits += expand("differential_abundance/{var}/ancom-bc_barplot.{feature_table}.{var}::{cat}.qzv",
+                                     feature_table=table, var=var, cat=categories[0])
+                continue
+
+            for i in range(len(categories)-1):
+                if tool == "ancombc":  # Do not need a separate comparison for each pairwise split with ANCOM-BC
+                    splits += expand("differential_abundance/{var}/ancom-bc_barplot.{feature_table}.{var}::{cat}.qzv",
+                                     feature_table=table, var=var, cat=categories[i])
+
+                else:  # Perform LEfSe strict analyses using all variable classes
+                    splits += expand("results/{var}/lefse_plot_strict.{feature_table}.{var}.NA.pdf",
+                                     feature_table=table, var=var)
+                    if subclasses:
+                        splits += expand("results/{var}/lefse_plot_strict.{feature_table}.{var}.{subclass}.pdf",
+                                         feature_table=table, var=var, subclass=subclasses)
+
+                for j in range(i+1, len(categories)):  # Perform pairwise checks
+                    if not sufficient_values(value_counts, categories[i], categories[j]):
+                        continue
+                    if tool == "lefse":
+                        splits += expand("results/{var}/lefse_plot.{feature_table}.{var}-{cat1}-or-{cat2}.{var}.NA.pdf",
+                                         feature_table=table, var=var, cat1=categories[i], cat2=categories[j])
+                        if subclasses:
+                            splits += expand(
+                                "results/{var}/lefse_plot.{feature_table}.{var}-{cat1}-or-{cat2}.{var}.{subclass}.pdf",
+                                feature_table=table, var=var, cat1=categories[i], cat2=categories[j],
+                                subclass=subclasses)
+    return splits
+
+
+def ancombc_splits(wildcards):
+    """ Get pairwise splits prepared in ANCOM-BC format """
+    return pairwise_splits(wildcards, "ancombc", config["metadata"])
+
 
+def lefse_splits(wildcards):
+    """ Get pairwise splits prepared in LEfSe format """
+    splits = pairwise_splits(wildcards, "lefse", config["classes"])
     formatted_splits = []
     for s in splits:
-        # Replace occurrences where class==subclass with subclass="NA", which is the default behavior, this handles the issue at the DAG level
-        #   e.g. separated: ["results/class/lefse_plot", "feature_table_class_cat1_or_cat2", "class", "subclass", "pdf"]
         separated = s.split(".")
         if separated[-2] == separated[-3]:
             separated[-2] = "NA"
         formatted_splits += [".".join(separated)]
 
     return formatted_splits
 
+
 def lefse_get_subclass(wildcards):
-    """ Handle class==subclass behavior at the rule level """
+    """
+    Handle class==subclass behavior at the rule level: the DAG encodes that case as subclass="NA"
+        (see lefse_splits), so an "NA" subclass wildcard is mapped back to the class wildcard;
+        any other subclass wildcard is returned unchanged
+    """
     subclass = wildcards["class"] if wildcards["subclass"] == "NA" else wildcards["subclass"]
     return subclass
 
+
 def sufficient_values(value_counts, cat1, cat2, threshold=2):
     """ Check if two categories have enough samples for a comparison """
     if value_counts[cat1] < threshold or value_counts[cat2] < threshold:
         return False
     return True
 
+
 def demux_single_option(wildcards):
-    """ Studies from MSQ past their 90th run require no golay error correction, all others require rev comp mapping barcodes """
+    """
+    Studies from MSQ past id 90 require no Golay error correction; all other runs require
+        rev-comp mapping barcodes. This is a poor generalization and will need to be improved in the future.
+    """
     components = wildcards.sequencing_run.split("_")
     if "MSQ" in components and int(components[-1]) > 90:
         return "--p-no-golay-error-correction"
     return "--p-rev-comp-mapping-barcodes"
 
+
+def get_lefse_plot_options():
+    """ Add various visualization options for LEfSe plot output """
+    opts = ""
+    if "clean_strings" in config and config["clean_strings"] is not None and not config["clean_strings"]:
+        opts += "--no-string-clean "
+    if "plot_max_rows" in config and type(config["plot_max_rows"]) is int and config["plot_max_rows"] > 0:
+        opts += f"--row-max {config['plot_max_rows']} "
+    if "include_string" in config and config["include_string"]:
+        opts += f"--include-string {config['include_string']} "
+    if "exclude_string" in config and config["exclude_string"]:
+        opts += f"--exclude-string {config['exclude_string']} "
+    return opts
+
+
 def get_tool_dir():
+    """ Get the location of needed scripts """
     return TOOLS_DIR
+
+
+def extract_feature_table_subprocess(table):
+    """ Equal to the 'extract_feature_table.sh' script but done without the external call """
+    qza_file = Path(f"tables/{table}.qza")
+    tsv_file = Path(f"tables/{table}.tsv")
+    tmp_dir = Path("tables/tmp_unzip")
+
+    if not qza_file.exists():
+        raise FileNotFoundError(f"{qza_file.name} not found in tables folder")
+
+    run(["unzip", "-qq", "-jo", str(qza_file), "-d", str(tmp_dir)])
+    run(["biom", "convert", "--to-tsv", "-i", str(tmp_dir / "feature-table.biom"), "-o", str(tsv_file)])
+    run(["rm", "-rf", str(tmp_dir)])
+    run(["sed", "-i", "1d;2s/^#//", str(tsv_file)])
diff --git a/mmeds/snakemake/rules/demux_denoise.smk b/mmeds/snakemake/rules/demux_denoise.smk
@@ -5,7 +5,8 @@ rule demux_single_barcodes:
         barcodes = "section_{sequencing_run}/qiime_mapping_file_{sequencing_run}.tsv"
     output:
         error_correction = "section_{sequencing_run}/error_correction.qza",
-        demux_file = "section_{sequencing_run}/demux_file.qza"
+        demux_file = "section_{sequencing_run}/demux_file.qza",
+        demux_viz = "section_{sequencing_run}/demux_viz.qza"
     conda:
         "qiime2-2020.8.0"
     params:
@@ -17,17 +18,21 @@ rule demux_single_barcodes:
         "--m-barcodes-column BarcodeSequence "
         "{params.option} "
         "--o-error-correction-details {output.error_correction} "
-        "--o-per-sample-sequences {output.demux_file}"
+        "--o-per-sample-sequences {output.demux_file}; "
+        "qiime demux summarize "
+        "--i-data {output.demux_file} "
+        "--o-visualization {output.demux_viz}"
 
 rule demux_dual_barcodes_pheniqs:
     """ Demultiplex a paired-end dual-barcoded sequencing run with Pheniqs """
     input:
         "section_{sequencing_run}/pheniqs_config.json"
     output:
-        "section_{sequencing_run}/pheniqs_output"
+        directory("section_{sequencing_run}/pheniqs_output")
     conda:
         "pheniqs"
     shell:
+        "mkdir {output}; "
         "pheniqs mux --config {input}"
 
 rule strip_error_barcodes:
@@ -36,10 +41,11 @@ rule strip_error_barcodes:
         dir = "section_{sequencing_run}/pheniqs_output",
         mapping_file = "section_{sequencing_run}/qiime_mapping_file_{sequencing_run}.tsv",
     output:
-        dir = "section_{sequencing_run}/stripped_output"
+        dir = directory("section_{sequencing_run}/stripped_output")
     conda:
-        "mmeds"
+        "mmeds_test"
     shell:
+        "mkdir {output}; "
         "strip_error_barcodes.py "
         "--num-allowed-errors 1 "
         "--m-mapping-file {input.mapping_file} "