-
Notifications
You must be signed in to change notification settings - Fork 1
Enhancement improved snakemake functionality #474
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
97b839c
778de4b
48fac27
1ce36d4
d1f868a
df720d5
4bca62d
2e85fc5
9bb6cfb
168ae81
9397f48
657acd0
8fb72b5
4c27630
36d47b1
b1a8ad2
562a12a
8805679
f2f2b03
11ce90c
096c26b
d3d001f
6ffca14
4eca2a4
2e0c7f4
979423b
93add91
8872446
b855e00
9a474a9
eab32cf
1e45a90
b6379c9
1bd98e6
23dbe02
e4070c6
6fa1859
5c4d9c9
61e8ffe
21174d5
c51eb0e
e33eaa8
8894169
976ccb3
527a2ea
faeb16e
cc19791
82b3919
8d80f94
cd5490c
44a3352
1ee83ec
e847c91
c35d3f0
fcacec6
a51ff5a
fbaa65a
45c0da7
70b7cab
2831e0c
50af8ab
33ce251
0149bf9
79ee4a6
2d1dba6
5125c48
c2a6317
3b9c707
ab6d27e
025c171
b9ab343
2dd427b
63e750c
5dbc401
7eb75fd
c3b8f14
2f38961
f25cfdf
a025d50
6ae99b5
1f47f11
224c19f
8036f95
8e37a94
ad807e9
fb40c41
4e05473
a61277e
16aee6c
b03a29d
9283bdf
5a2b13d
76a99af
f35756f
e87160d
06f04ae
4bd4912
9d05743
2f5cd1d
425de4c
50ec200
505811e
c387e01
e59d8e0
b01d635
d0b85cb
3c818b2
476180c
86b65d9
419c076
dd81041
53bba79
71286ad
c062c8b
eff819b
d1d6dc6
1541929
202f81d
e87bf76
4380c8e
52ccda2
177d99d
37f17df
17d455d
954c8c5
d1a4f07
af9f8c0
0137641
43dffb2
f014f7f
bf9678b
422fd21
b4ff44b
d3571d6
c12a32d
baf9f64
f3743db
d125016
bd065a0
369d0d3
1b02243
d19dec3
27cce6a
b8006d7
4cea632
f9ca99d
722a690
6349f7d
18c7c9c
a110c37
575efba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,85 +2,159 @@ import pandas as pd | |
| from copy import deepcopy | ||
| from pathlib import Path | ||
| from mmeds.config import TOOLS_DIR | ||
| from subprocess import run | ||
|
|
||
| """ | ||
| This common.smk file, following snakemake conventions, contains all the python logic necessary for generating the snakemake rule DAG | ||
| This common.smk file, following snakemake conventions, contains all the python logic necessary | ||
| for generating the snakemake rule DAG | ||
| """ | ||
|
|
||
| metadata = pd.read_csv("tables/qiime_mapping_file.tsv", sep='\t', header=[0], skiprows=[1]) | ||
| metadata = pd.read_csv("tables/qiime_mapping_file.tsv", sep='\t', header=[0], skiprows=[1], dtype='str') | ||
|
|
||
| def lefse_splits(wildcards): | ||
| """ Calculates all the pairwise splits that should be compared by LEfSe. Will not include groups with an insufficient number of comparisons """ | ||
| splits = [] | ||
| for lefse_class in config["classes"]: | ||
| # 'classes' in this case refer to metadata columns, whereas categories refer to the possible values of those columns | ||
| categories = list(metadata[lefse_class].unique()) | ||
|
|
||
| # Discard samples with a 'nan' for the selected class. This will only work while the input has been run through MMEDS already | ||
| categories = [c for c in categories if str(c) != "nan"] | ||
| value_counts = metadata[lefse_class].value_counts() | ||
| def pairwise_splits(wildcards, tool, vars): | ||
| """ | ||
| When running differential analysis on any number of variables and tables, create all the possible pairwise splits | ||
| per-table and per-variable that have sufficient data to form a comparison | ||
| """ | ||
| if "tables" in config: | ||
| tables = config["tables"] | ||
| else: | ||
| tables = [f"taxa_table_L{x}" for x in config["taxa_levels"]] | ||
|
|
||
| subclasses = [] | ||
| if "subclasses" in config and config["subclasses"]: | ||
| subclasses = deepcopy(config["subclasses"]) | ||
| subclasses = False | ||
| if tool == "lefse" and "subclasses" in config and config["subclasses"]: | ||
| subclasses = deepcopy(config["subclasses"]) | ||
|
|
||
| if len(categories) < 2: | ||
| # Only one value in the class, nothing to compare | ||
| continue | ||
| splits = [] | ||
| for table in tables: | ||
| if not Path(f"tables/{table}.tsv").exists(): | ||
| extract_feature_table_subprocess(table) | ||
| table_df = pd.read_csv(f"tables/{table}.tsv", sep='\t', header=[0], index_col=0) | ||
| filtered_metadata = metadata.loc[metadata["#SampleID"].isin(table_df.columns)] | ||
| for var in vars: | ||
| categories = list(filtered_metadata[var].unique()) | ||
| categories = [c for c in categories if str(c) != "nan"] | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure if tables with this case can reach this stage of processing but will this be ok with other non-"nan" representations of NA? |
||
| value_counts = filtered_metadata[var].value_counts() | ||
|
|
||
| if len(categories) < 3: | ||
| # Exactly two values in the class, no pairwise checks needed | ||
| if not sufficient_values(value_counts, categories[0], categories[1]): | ||
| if len(categories) < 2: # Only one value in the class, nothing to compare | ||
| continue | ||
| splits += expand("results/{lefse_class}/lefse_plot.{feature_table}.{lefse_class}.NA.pdf", | ||
| feature_table=config["tables"], lefse_class=lefse_class) | ||
| if subclasses: | ||
| splits += expand("results/{lefse_class}/lefse_plot.{feature_table}.{lefse_class}.{subclass}.pdf", | ||
| feature_table=config["tables"], lefse_class=lefse_class, subclass=subclasses) | ||
| continue | ||
|
|
||
|
|
||
| splits += expand("results/{lefse_class}/lefse_plot_strict.{feature_table}.{lefse_class}.{subclass}.pdf", | ||
| feature_table=config["tables"], lefse_class=lefse_class, subclass=subclasses) | ||
| for i in range(len(categories)-1): | ||
| for j in range(i+1, len(categories)): | ||
| # Perform pairwise checks | ||
| if not sufficient_values(value_counts, categories[i], categories[j]): | ||
|
|
||
| if len(categories) < 3: # Exactly two values in the class, no pairwise checks needed | ||
| if not sufficient_values(value_counts, categories[0], categories[1]): | ||
| continue | ||
| splits += expand("results/{lefse_class}/lefse_plot.{feature_table}_{lefse_class}_{cat1}_or_{cat2}.{lefse_class}.NA.pdf", | ||
| feature_table=config["tables"], lefse_class=lefse_class, cat1=categories[i], cat2=categories[j]) | ||
| if subclasses: | ||
| splits += expand("results/{lefse_class}/lefse_plot.{feature_table}_{lefse_class}_{cat1}_or_{cat2}.{lefse_class}.{subclass}.pdf", | ||
| feature_table=config["tables"], lefse_class=lefse_class, cat1=categories[i], cat2=categories[j], subclass=subclasses) | ||
| if tool == "lefse": | ||
| splits += expand("results/{var}/lefse_plot.{feature_table}.{var}.NA.pdf", | ||
| feature_table=table, var=var) | ||
| if subclasses: | ||
| splits += expand("results/{var}/lefse_plot.{feature_table}.{var}.{subclass}.pdf", | ||
| feature_table=table, var=var, subclass=subclasses) | ||
| elif tool == "ancombc": | ||
| splits += expand("differential_abundance/{var}/ancom-bc_barplot.{feature_table}.{var}::{cat}.qzv", | ||
| feature_table=table, var=var, cat=categories[0]) | ||
| continue | ||
|
|
||
| for i in range(len(categories)-1): | ||
| if tool == "ancombc": # Do not need a separate comparison for each pairwise split with ANCOM-BD | ||
| splits += expand("differential_abundance/{var}/ancom-bc_barplot.{feature_table}.{var}::{cat}.qzv", | ||
| feature_table=table, var=var, cat=categories[i]) | ||
|
|
||
| else: # Perform LEfSe strict analyses using all variable classes | ||
| splits += expand("results/{var}/lefse_plot_strict.{feature_table}.{var}.NA.pdf", | ||
| feature_table=table, var=var) | ||
| if subclasses: | ||
| splits += expand("results/{var}/lefse_plot_strict.{feature_table}.{var}.{subclass}.pdf", | ||
| feature_table=table, var=var, subclass=subclasses) | ||
|
|
||
| for j in range(i+1, len(categories)): # Perform pairwise checks | ||
| if not sufficient_values(value_counts, categories[i], categories[j]): | ||
| continue | ||
| if tool == "lefse": | ||
| splits += expand("results/{var}/lefse_plot.{feature_table}.{var}-{cat1}-or-{cat2}.{var}.NA.pdf", | ||
| feature_table=table, var=var, cat1=categories[i], cat2=categories[j]) | ||
| if subclasses: | ||
| splits += expand( | ||
| "results/{var}/lefse_plot.{feature_table}.{var}-{cat1}-or-{cat2}.{var}.{subclass}.pdf", | ||
| feature_table=table, var=var, cat1=categories[i], cat2=categories[j], | ||
| subclass=subclasses) | ||
| return splits | ||
|
|
||
|
|
||
| def ancombc_splits(wildcards): | ||
| """ Get pairwise splits prepared in ANCOM-BC format """ | ||
| return pairwise_splits(wildcards, "ancombc", config["metadata"]) | ||
|
|
||
|
|
||
| def lefse_splits(wildcards): | ||
| """ Get pairwise splits prepared in LEfSe format """ | ||
| splits = pairwise_splits(wildcards, "lefse", config["classes"]) | ||
| formatted_splits = [] | ||
| for s in splits: | ||
| # Replace occurrences where class==subclass with subclass="NA", which is the default behavior, this handles the issue at the DAG level | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. perhaps replace this comment with the string of what the expected file name/string being parsed as |
||
| # e.g. separated: ["results/class/lefse_plot", "feature_table_class_cat1_or_cat2", "class", "subclass", "pdf"] | ||
| separated = s.split(".") | ||
| if separated[-2] == separated[-3]: | ||
| separated[-2] = "NA" | ||
| formatted_splits += [".".join(separated)] | ||
|
|
||
| return formatted_splits | ||
|
|
||
|
|
||
| def lefse_get_subclass(wildcards): | ||
| """ Handle class==subclass behavior at the rule level """ | ||
| """ | ||
| Replace occurrences where class==subclass with subclass="NA", which is the default behavior, | ||
| this handles the issue at the DAG level e.g. separated: | ||
| ["results/class/lefse_plot", "feature_table_class_cat1_or_cat2", "class", "subclass", "pdf"] | ||
| """ | ||
| subclass = wildcards["class"] if wildcards["subclass"] == "NA" else wildcards["subclass"] | ||
| return subclass | ||
|
|
||
|
|
||
| def sufficient_values(value_counts, cat1, cat2, threshold=2): | ||
| """ Check if two categories have enough samples for a comparison """ | ||
| if value_counts[cat1] < threshold or value_counts[cat2] < threshold: | ||
| return False | ||
| return True | ||
|
|
||
|
|
||
| def demux_single_option(wildcards): | ||
| """ Studies from MSQ past their 90th run require no golay error correction, all others require rev comp mapping barcodes """ | ||
| """ | ||
| Studies from MSQ past id 90 require no golay error correction, all others runs require | ||
| rev-comp mapping barcodes. This is a poor generalization and will need to be improved in the future. | ||
| """ | ||
|
Comment on lines
+119
to
+122
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Eventually will change into a user-inputted parameter prior to the analysis step in the user-defined config file |
||
| components = wildcards.sequencing_run.split("_") | ||
| if "MSQ" in components and int(components[-1]) > 90: | ||
| return "--p-no-golay-error-correction" | ||
| return "--p-rev-comp-mapping-barcodes" | ||
|
|
||
|
|
||
| def get_lefse_plot_options(): | ||
| """ Add various visualization options for LEfSe plot output """ | ||
| opts = "" | ||
| if "clean_strings" in config and config["clean_strings"] is not None and not config["clean_strings"]: | ||
| opts += "--no-string-clean " | ||
| if "plot_max_rows" in config and type(config["plot_max_rows"]) is int and config["plot_max_rows"] > 0: | ||
| opts += f"--row-max {config['plot_max_rows']} " | ||
| if "include_string" in config and config["include_string"]: | ||
| opts += f"--include-string {config['include_string']} " | ||
| if "exclude_string" in config and config["exclude_string"]: | ||
| opts += f"--exclude-string {config['exclude_string']} " | ||
| return opts | ||
|
|
||
|
|
||
| def get_tool_dir(): | ||
| """ Get the location of needed scripts """ | ||
| return TOOLS_DIR | ||
|
|
||
|
|
||
| def extract_feature_table_subprocess(table): | ||
| """ Equal to the 'extract_feature_table.sh' script but done without the external call """ | ||
| qza_file = Path(f"tables/{table}.qza") | ||
| tsv_file = Path(f"tables/{table}.tsv") | ||
| tmp_dir = Path("tables/tmp_unzip") | ||
|
|
||
| if not qza_file.exists(): | ||
| raise FileNotFoundError(f"{qza_file.name} not found in tables folder") | ||
|
|
||
| run(["unzip", "-qq", "-jo", str(qza_file), "-d", str(tmp_dir)]) | ||
| run(["biom", "convert", "--to-tsv", "-i", str(tmp_dir / "feature-table.biom"), "-o", str(tsv_file)]) | ||
| run(["rm", "-rf", str(tmp_dir)]) | ||
| run(["sed", "-i", "1d;2s/^#//", str(tsv_file)]) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,8 @@ rule demux_single_barcodes: | |
| barcodes = "section_{sequencing_run}/qiime_mapping_file_{sequencing_run}.tsv" | ||
| output: | ||
| error_correction = "section_{sequencing_run}/error_correction.qza", | ||
| demux_file = "section_{sequencing_run}/demux_file.qza" | ||
| demux_file = "section_{sequencing_run}/demux_file.qza", | ||
| demux_viz = "section_{sequencing_run}/demux_viz.qza" | ||
| conda: | ||
| "qiime2-2020.8.0" | ||
|
Comment on lines
10
to
11
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is qiime2-2025.4 or qiime2-2023.9 etc available now? |
||
| params: | ||
|
|
@@ -17,17 +18,21 @@ rule demux_single_barcodes: | |
| "--m-barcodes-column BarcodeSequence " | ||
| "{params.option} " | ||
| "--o-error-correction-details {output.error_correction} " | ||
| "--o-per-sample-sequences {output.demux_file}" | ||
| "--o-per-sample-sequences {output.demux_file}; " | ||
| "qiime demux summarize " | ||
| "--i-data {output.demux_file} " | ||
| "--o-visualization {output.demux_viz}" | ||
|
|
||
| rule demux_dual_barcodes_pheniqs: | ||
| """ Demultiplex a paired-end dual-barcoded sequencing run with Pheniqs """ | ||
| input: | ||
| "section_{sequencing_run}/pheniqs_config.json" | ||
| output: | ||
| "section_{sequencing_run}/pheniqs_output" | ||
| directory("section_{sequencing_run}/pheniqs_output") | ||
| conda: | ||
| "pheniqs" | ||
| shell: | ||
| "mkdir {output}; " | ||
| "pheniqs mux --config {input}" | ||
|
|
||
| rule strip_error_barcodes: | ||
|
|
@@ -36,10 +41,11 @@ rule strip_error_barcodes: | |
| dir = "section_{sequencing_run}/pheniqs_output", | ||
| mapping_file = "section_{sequencing_run}/qiime_mapping_file_{sequencing_run}.tsv", | ||
| output: | ||
| dir = "section_{sequencing_run}/stripped_output" | ||
| dir = directory("section_{sequencing_run}/stripped_output") | ||
| conda: | ||
| "mmeds" | ||
| "mmeds_test" | ||
| shell: | ||
| "mkdir {output}; " | ||
| "strip_error_barcodes.py " | ||
| "--num-allowed-errors 1 " | ||
| "--m-mapping-file {input.mapping_file} " | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you're ok to get rid of the brackets around
[0]and useheader=0, skiprows=1