Skip to content

Commit 24eef94

Browse files
authored
Merge pull request #15 from VIB-PSB/dev
Release 1.1 - e2e test
2 parents (3cb9481 + db632dc), commit 24eef94

32 files changed, with 101,140 additions and 112 deletions.

.github/workflows/test.yml

Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,48 @@
1+
name: MINI-AC test suite
2+
3+
on:
4+
push:
5+
branches: [ "main", "dev" ]
6+
pull_request:
7+
branches: [ "main", "dev" ]
8+
9+
jobs:
10+
nf-test:
11+
12+
runs-on: ubuntu-latest
13+
14+
steps:
15+
- uses: actions/checkout@v3
16+
17+
- name: Prepare nf-test config file
18+
run: sed -i -e "s@%TMP%@${RUNNER_TEMP}@g" tests/nextflow.config
19+
20+
- uses: actions/setup-java@v3
21+
with:
22+
distribution: oracle
23+
java-version: 17
24+
25+
- name: Check Java version
26+
run: java -version
27+
28+
- name: Setup Nextflow
29+
uses: nf-core/setup-nextflow@v1.3.0
30+
31+
- name: Setup singularity
32+
uses: eWaterCycle/setup-singularity@v7
33+
with:
34+
singularity-version: 3.8.3
35+
36+
- name: Setup nf-test
37+
run: wget -qO- https://code.askimed.com/install/nf-test | bash
38+
39+
- name: Fetch motif mapping files
40+
run: |
41+
curl -k -o tests/data/zma_v4_chr1/zma_v4_genome_wide_motif_mappings_chr1.bed https://floppy.psb.ugent.be/index.php/s/NekMYztyxEnsQiY/download/zma_v4_genome_wide_motif_mappings_chr1.bed
42+
curl -k -o tests/data/zma_v4_chr1/zma_v4_locus_based_motif_mappings_5kbup_1kbdown_chr1.bed https://floppy.psb.ugent.be/index.php/s/r2wQmFjPy79qSp7/download/zma_v4_locus_based_motif_mappings_5kbup_1kbdown_chr1.bed
43+
curl -k -o data/ath/ath_genome_wide_motif_mappings.bed https://floppy.psb.ugent.be/index.php/s/iaZPwdrRGe3YDdK/download/ath_genome_wide_motif_mappings.bed
44+
curl -k -o data/ath/ath_locus_based_motif_mappings_5kbup_1kbdown.bed https://floppy.psb.ugent.be/index.php/s/qcQ7KndzHaSpd9e/download/ath_locus_based_motif_mappings_5kbup_1kbdown.bed
45+
46+
- name: Run nf-test
47+
shell: bash
48+
run: ./nf-test test

.gitignore

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,29 @@
1+
# ignore Nextflow cache and logs
2+
.nextflow/
3+
.nextflow.log*
4+
5+
# ignore Singularity cache
6+
singularity_cache/
7+
8+
# ignore large motif mapping files
9+
*motif_mappings*.bed
10+
11+
# ignore nf-test executable
12+
nf-test
13+
14+
# ignore test cache
15+
.nf-test/
16+
17+
# ignore test outputs
18+
tests/outputs/
19+
20+
# ignore SLURM output and error files
21+
slurm.*.out
22+
slurm.*.err
23+
24+
# ignore jupyter notebook checkpoints
25+
.ipynb_checkpoints/
26+
27+
# python cache and compiled files
28+
__pycache__/
29+
*.pyc

bin/add_go_names.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,8 @@
44
import go_manipulations
55

66
gene_go_file = argv[1]
7+
ontology_file = argv[2]
78

8-
go_tree = go_manipulations.GOtree(path.join(path.dirname(path.dirname(argv[0])), "ontologies", "go.obo"))
9+
go_tree = go_manipulations.GOtree(ontology_file)
910

1011
go_tree.add_descriptions(gene_go_file)

bin/getGO_xlsx_gw.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ def parseArgs():
3434
parser.add_argument('-ex', '--expressed_genes_file', nargs = 1, type = str,
3535
default = None, help = '',
3636
metavar = 'List of genes expressed in biological context of experiment')
37-
37+
3838
args = parser.parse_args()
3939

4040
return args
@@ -94,8 +94,11 @@ def parseArgs():
9494

9595
if not GO_info:
9696
empty_table = pd.DataFrame(["### This dataset did not yield any GO enrichment"])
97-
with pd.ExcelWriter(output_file) as writer:
98-
empty_table.to_excel(writer, index = False, header = False)
97+
if(output_file.endswith('.csv')):
98+
empty_table.to_csv(output_file, index = False, header = False)
99+
else:
100+
with pd.ExcelWriter(output_file) as writer:
101+
empty_table.to_excel(writer, index = False, header = False)
99102
sys.exit()
100103

101104
### Integrating data ###
@@ -130,5 +133,8 @@ def parseArgs():
130133

131134
### Writing output file ###
132135

133-
with pd.ExcelWriter(output_file) as writer:
134-
go_df.to_excel(writer, index = False)
136+
if (output_file.endswith('.csv')):
137+
go_df.to_csv(output_file, index = False)
138+
else:
139+
with pd.ExcelWriter(output_file) as writer:
140+
go_df.to_excel(writer, index = False)

bin/getGO_xlsx_lb.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ def parseArgs():
3434
parser.add_argument('-ex', '--expressed_genes_file', nargs = 1, type = str,
3535
default = None, help = '',
3636
metavar = 'List of genes expressed in biological context of experiment')
37-
37+
3838
args = parser.parse_args()
3939

4040
return args
@@ -94,8 +94,11 @@ def parseArgs():
9494

9595
if not GO_info:
9696
empty_table = pd.DataFrame(["### This dataset did not yield any GO enrichment"])
97-
with pd.ExcelWriter(output_file) as writer:
98-
empty_table.to_excel(writer, index = False, header = False)
97+
if(output_file.endswith('.csv')):
98+
empty_table.to_csv(output_file, index = False, header = False)
99+
else:
100+
with pd.ExcelWriter(output_file) as writer:
101+
empty_table.to_excel(writer, index = False, header = False)
99102
sys.exit()
100103

101104
### Integrating data ###
@@ -130,5 +133,8 @@ def parseArgs():
130133

131134
### Writing output file ###
132135

133-
with pd.ExcelWriter(output_file) as writer:
134-
go_df.to_excel(writer, index = False)
136+
if (output_file.endswith('.csv')):
137+
go_df.to_csv(output_file, index = False)
138+
else:
139+
with pd.ExcelWriter(output_file) as writer:
140+
go_df.to_excel(writer, index = False)

bin/getMotifCentricOutput_gw.py

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -104,8 +104,11 @@ def parseArgs():
104104

105105
if enr_stats.empty:
106106
empty_table = pd.DataFrame(["### This dataset did not yield any motif enrichment"])
107-
with pd.ExcelWriter(output_file) as writer:
108-
empty_table.to_excel(writer, index = False, header = False)
107+
if(output_file.endswith('.csv')):
108+
empty_table.to_csv(output_file, index = False, header = False)
109+
else:
110+
with pd.ExcelWriter(output_file) as writer:
111+
empty_table.to_excel(writer, index = False, header = False)
109112
sys.exit()
110113

111114
for col in enr_stats.select_dtypes(include = ['float']).columns:
@@ -122,11 +125,11 @@ def parseArgs():
122125
if expressed_genes_file:
123126
enr_stats['Any expressed gene'] = enr_stats.gene_id.isin(exp_genes)
124127

125-
enr_stats = enr_stats.groupby(['dataset', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(list(set(x))), 'Any expressed gene': any}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
128+
enr_stats = enr_stats.groupby(['dataset', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(sorted(set(x))), 'Any expressed gene': any}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
126129

127130
if not expressed_genes_file:
128131

129-
enr_stats = enr_stats.groupby(['dataset', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(list(set(x)))}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
132+
enr_stats = enr_stats.groupby(['dataset', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(sorted(set(x)))}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
130133

131134
enr_stats = enr_stats.merge(mot_tf, how = 'right', left_on = 'motif', right_on = 'motif_id').drop('motif_id', axis = 1)
132135

@@ -146,5 +149,8 @@ def parseArgs():
146149

147150
### Writing output file ###
148151

149-
with pd.ExcelWriter(output_file) as writer:
150-
enr_stats.to_excel(writer, index = False)
152+
if (output_file.endswith('.csv')):
153+
enr_stats.to_csv(output_file, index = False)
154+
else:
155+
with pd.ExcelWriter(output_file) as writer:
156+
enr_stats.to_excel(writer, index = False)

bin/getMotifCentricOutput_lb.py

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -104,8 +104,11 @@ def parseArgs():
104104

105105
if enr_stats.empty:
106106
empty_table = pd.DataFrame(["### This dataset did not yield any motif enrichment"])
107-
with pd.ExcelWriter(output_file) as writer:
108-
empty_table.to_excel(writer, index = False, header = False)
107+
if(output_file.endswith('.csv')):
108+
empty_table.to_csv(output_file, index = False, header = False)
109+
else:
110+
with pd.ExcelWriter(output_file) as writer:
111+
empty_table.to_excel(writer, index = False, header = False)
109112
sys.exit()
110113

111114
for col in enr_stats.select_dtypes(include = ['float']).columns:
@@ -122,11 +125,11 @@ def parseArgs():
122125
if expressed_genes_file:
123126
enr_stats['Any expressed gene'] = enr_stats.gene_id.isin(exp_genes)
124127

125-
enr_stats = enr_stats.groupby(['dataset', 'input_total_peaks', 'peaks_in_promoter', 'motif','real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(list(set(x))), 'Any expressed gene': any}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
128+
enr_stats = enr_stats.groupby(['dataset', 'input_total_peaks', 'peaks_in_promoter', 'motif','real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(sorted(set(x))), 'Any expressed gene': any}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
126129

127130
if not expressed_genes_file:
128131

129-
enr_stats = enr_stats.groupby(['dataset', 'input_total_peaks', 'peaks_in_promoter', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(list(set(x)))}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
132+
enr_stats = enr_stats.groupby(['dataset', 'input_total_peaks', 'peaks_in_promoter', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(sorted(set(x)))}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
130133

131134
enr_stats = enr_stats.merge(mot_tf, how = 'right', left_on = 'motif', right_on = 'motif_id').drop('motif_id', axis = 1)
132135

@@ -146,5 +149,8 @@ def parseArgs():
146149

147150
### Writing output file ###
148151

149-
with pd.ExcelWriter(output_file) as writer:
150-
enr_stats.to_excel(writer, index = False)
152+
if (output_file.endswith('.csv')):
153+
enr_stats.to_csv(output_file, index = False)
154+
else:
155+
with pd.ExcelWriter(output_file) as writer:
156+
enr_stats.to_excel(writer, index = False)

bin/getTFCentricOutput_gw.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -133,8 +133,11 @@ def parseArgs():
133133

134134
if enr_stats.empty:
135135
empty_table = pd.DataFrame(["### This dataset did not yield any motif enrichment"])
136-
with pd.ExcelWriter(output_file) as writer:
137-
empty_table.to_excel(writer, index = False, header = False)
136+
if(output_file.endswith('.csv')):
137+
empty_table.to_csv(output_file, index = False, header = False)
138+
else:
139+
with pd.ExcelWriter(output_file) as writer:
140+
empty_table.to_excel(writer, index = False, header = False)
138141
sys.exit()
139142

140143
### Reading and processing GO enrichment data ###
@@ -261,5 +264,9 @@ def parseArgs():
261264

262265
### Writing output file ###
263266

264-
with pd.ExcelWriter(output_file) as writer:
265-
enr_stats.to_excel(writer, index = False)
267+
if (output_file.endswith('.csv')):
268+
enr_stats.to_csv(output_file, index = False)
269+
else:
270+
with pd.ExcelWriter(output_file) as writer:
271+
enr_stats.to_excel(writer, index = False)
272+

bin/getTFCentricOutput_lb.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ def parseArgs():
5252
parser.add_argument('-ex', '--expressed_genes_file', nargs = 1, type = str,
5353
default = None, help = '',
5454
metavar = 'List of genes expressed in biological context of experiment')
55-
55+
5656
args = parser.parse_args()
5757

5858
return args
@@ -133,8 +133,11 @@ def parseArgs():
133133

134134
if enr_stats.empty:
135135
empty_table = pd.DataFrame(["### This dataset did not yield any motif enrichment"])
136-
with pd.ExcelWriter(output_file) as writer:
137-
empty_table.to_excel(writer, index = False, header = False)
136+
if(output_file.endswith('.csv')):
137+
empty_table.to_csv(output_file, index = False, header = False)
138+
else:
139+
with pd.ExcelWriter(output_file) as writer:
140+
empty_table.to_excel(writer, index = False, header = False)
138141
sys.exit()
139142

140143
### Reading and processing GO enrichment data ###
@@ -261,5 +264,8 @@ def parseArgs():
261264

262265
### Writing output file ###
263266

264-
with pd.ExcelWriter(output_file) as writer:
265-
enr_stats.to_excel(writer, index = False)
267+
if (output_file.endswith('.csv')):
268+
enr_stats.to_csv(output_file, index = False)
269+
else:
270+
with pd.ExcelWriter(output_file) as writer:
271+
enr_stats.to_excel(writer, index = False)

bin/processStats_bps_gw.py

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,10 @@ def parseArgs():
2626
parser.add_argument('output', nargs = 1, type = str,
2727
help = '',
2828
metavar = 'output_file')
29+
30+
parser.add_argument('shuffle_count', nargs = 1, type = int,
31+
help = '',
32+
metavar = 'Number of ACR shuffles that were performed for background generation')
2933

3034
args = parser.parse_args()
3135

@@ -37,6 +41,7 @@ def parseArgs():
3741
out_file = args.output[0]
3842
cns_list_file = args.cns_sets_list[0]
3943
total_peaks = args.num_peaks[0]
44+
shuffle_count = args.shuffle_count[0]
4045

4146
file_name = "_".join(raw_file.split("/")[-1].split("_")[0:-5])
4247

@@ -77,26 +82,28 @@ def parseArgs():
7782
try:
7883
shuff_overlap = np.array(shuff_dict[cns_set])
7984
except KeyError:
80-
shuff_overlap = np.zeros(1000, dtype = int)
85+
shuff_overlap = np.zeros(shuffle_count, dtype = int)
8186
try:
8287
real_overlap = real_dict[cns_set]
8388
except KeyError:
8489
real_overlap = 0
85-
if len(shuff_overlap) < 1000:
86-
zero_pad = np.zeros(1000 - len(shuff_overlap), dtype = int)
90+
if len(shuff_overlap) < shuffle_count:
91+
zero_pad = np.zeros(shuffle_count - len(shuff_overlap), dtype = int)
8792
shuff_overlap = np.concatenate([shuff_overlap, zero_pad])
8893
else:
8994
pass
90-
p_val_1000 = len(shuff_overlap[shuff_overlap >= real_overlap])
91-
if p_val_1000 == 0:
92-
p_val_1000 = 0.9
95+
times_above_real_overlap = len(shuff_overlap[shuff_overlap >= real_overlap])
96+
if times_above_real_overlap == 0:
97+
times_above_real_overlap = 0.9
9398
median = np.median(shuff_overlap)
9499
if real_overlap == 0 or median == 0:
95100
enrichment_fold = 0
96101
else:
97102
enrichment_fold = real_overlap / median
103+
104+
p_val = times_above_real_overlap/shuffle_count
98105

99-
stats[(file_name, cns_set)] = [int(total_peaks), real_overlap, median, (p_val_1000/1000), enrichment_fold]
106+
stats[(file_name, cns_set)] = [int(total_peaks), real_overlap, median, p_val, enrichment_fold]
100107

101108
data_df = pd.DataFrame.from_dict(stats).T
102109
data_df = data_df.reset_index()
@@ -105,5 +112,5 @@ def parseArgs():
105112
FDR = multipletests(data_df['p_val'], method = 'fdr_bh', alpha = 0.05)
106113
data_df.insert(6, 'adj_pval', FDR[1])
107114

108-
data_df.to_csv(out_file, sep = "\t", index = None, na_rep = "nan")
115+
data_df.sort_values(by = 'motif').to_csv(out_file, sep = "\t", index = None, na_rep = "nan")
109116

0 commit comments

Comments
 (0)