enrichment_analysis/config/config.yaml at main · epigen/enrichment_analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159

##### RESOURCES #####
# memory requested for the workflow's jobs
# NOTE(review): unit is not stated in this file, presumably MB -- confirm against the workflow rules
mem: 32000
# number of threads requested for the workflow's jobs
# NOTE(review): confirm which rules actually consume this value
threads: 1

##### GENERAL #####
# general workflow settings: annotation file, result path and project name
annotation: "config/annotation.csv"
result_path: "test/results"
project_name: "Corces_CellTypes"

# genome assembly
#   human: 'hg19' or 'hg38'
#   mouse: 'mm9' or 'mm10'
genome: "hg38"

##### DATABASES #####
## local databases as GMT (*.gmt) or JSON (*.json) files with gene symbols!
# will be used by rGREAT for genomic regions and GSEApy (both ORA and preranked) for genes
# GMT databases can be downloaded from
#     MSigDB (http://www.gsea-msigdb.org/gsea/msigdb)
#     Enrichr (https://maayanlab.cloud/Enrichr/#libraries)
# JSON example content: { "MyDB_Term1": ["geneA","geneB","geneC"],"MyDB_Term2": ["geneX","geneY","geneZ"]}
# to skip you have to(!) leave one database entry with an empty path i.e., dummy_db: ""
# mapping of database name -> path to a local gene set file
local_databases:
    # JSON-formatted database (*.json)
    Azimuth_2023: "test/resources/enrichment_analysis/Azimuth_2023.json"
    # GMT-formatted database (*.gmt)
    Reactome: "test/resources/enrichment_analysis/ReactomePathways.gmt"
## LOLA compatible region set databases
# loaded using loadRegionDB() (https://code.databio.org/LOLA/reference/loadRegionDB.html)
# download from the docs or create your own: https://databio.org/regiondb
# pre-cached .RData files are supported by simpleCache
# provide the exact path to the folder containing the collections e.g., "resources/LOLACore/hg38"
# to skip you have to(!) leave one database entry with an empty path i.e., dummy_db: ""
# mapping of database name -> path to the folder containing the region set collections
lola_databases:
    LOLACore: "test/resources/LOLACore/hg38"

##### TOOLS #####

### GSEApy - ORA Enrichr (Fisher/hypergeometric test) and preranked GSEA based analysis

### LOLA - region overlap based analysis

### GREAT - region-gene association based analysis
# https://jokergoo.github.io/rGREAT/reference/great.html
great_parameters:
    # default: 5
    min_gene_set_size: 0
    # options: 'basalPlusExt', 'twoClosest', 'oneClosest'
    mode: "basalPlusExt"
    # the two basal_* values are used in 'basalPlusExt' mode
    basal_upstream: 5000
    basal_downstream: 1000
    extension: 1000000
    # number of top significant terms to annotate with regions/genes
    #   0: none, -1: all, n >= 1: n
    # This is slow, creates large files, and can exceed Excel's 32,767 character cell limit.
    map_associated_regions: 1

### pycisTarget - region based Transcription Factor Binding Site (TFBS) motif enrichment analysis
# https://pycistarget.readthedocs.io/en/latest/index.html
# resources: https://resources.aertslab.org/cistarget/
# instructions for custom cisTarget databases using your own regions (e.g., consensus regions or TF ChIP-seq data): https://github.com/aertslab/create_cisTarget_databases
# instructions for hg19 database here: https://github.com/aertslab/pycistarget/issues/37
# to skip you have to(!) leave one database entry with an empty path i.e., dummy_db: ""
pycistarget_parameters:
    # mapping of database name -> path to the rankings feather file
    databases:
        # TEST FILE: should be replaced by a real feather file for analysis
        hg38_screen_v10clust: "test/resources/enrichment_analysis/600regions_test.regions_vs_motifs.rankings.feather"
    # TEST FILE: should be replaced by a real motif annotation table for analysis
    path_to_motif_annotations: "test/resources/enrichment_analysis/89_motifs_test.tbl"
    # should have space available
    temp_dir: "/tmp"
    # default 0.4
    fraction_overlap_w_cistarget_database: 0.4
    # default 0.005
    auc_threshold: 0.005
    # default 3
    nes_threshold: 3
    # default 0.05
    rank_threshold: 0.05
    annotation_version: "v10nr_clust"
    # the first entry of the list is used downstream for annotation
    annotations_to_use:
        - "Direct_annot"
        - "Motif_similarity_annot"
        - "Orthology_annot"
        - "Motif_similarity_and_Orthology_annot"
    # default 0.001
    motif_similarity_fdr: 0.001
    # default 0
    orthologous_identity_threshold: 0

### RcisTarget - gene based Transcription Factor Binding Site (TFBS) motif enrichment analysis
# https://www.bioconductor.org/packages/release/bioc/html/RcisTarget.html
# resources: https://resources.aertslab.org/cistarget/
# to skip you have to(!) leave one database entry with an empty path i.e., dummy_db: ""
rcistarget_parameters:
    # mapping of database name -> path to the rankings feather file
    databases:
        # TEST FILE: should be replaced by a real feather file for analysis
        hg38_500bp_up_100bp_down_v10clust: "test/resources/enrichment_analysis/5000genes_test.genes_vs_motifs.rankings.feather"
    # TEST FILE: should be replaced by a real motif annotation table for analysis
    motifAnnot: "test/resources/enrichment_analysis/89_motifs_test.tbl"
    motifAnnot_highConfCat:
        - "directAnnotation"
        - "inferredBy_Orthology"
    motifAnnot_lowConfCat:
        - "inferredBy_MotifSimilarity"
        - "inferredBy_MotifSimilarity_n_Orthology"
    nesThreshold: 3
    # used for aucMaxRank = aucMaxRank_factor * ncol(motifRankings)
    aucMaxRank_factor: 0.05
    # alternatively, exact but more computationally intense: "icistarget"
    geneErnMethod: "aprox"
    geneErnMaxRank: 5000

### Enrichment plot

# tool specific column names for aggregation, plotting & summaries
# per tool: number of top terms (top_n) and the result column used for each role
column_names:
    ORA_GSEApy:
        top_n: 25
        p_value: "P_value"
        adj_pvalue: "Adjusted_P_value"
        effect_size: "Odds_Ratio"
        overlap: "Overlap"
        term: "Term"
    preranked_GSEApy:
        top_n: 25
        p_value: "NOM_p_val"
        adj_pvalue: "FDR_q_val"
        effect_size: "NES"
        overlap: "Tag"
        term: "Term"
    GREAT:
        top_n: 25
        # or binomial test result: p_value
        p_value: "p_value_hyper"
        # or binomial test result: p_adjust
        adj_pvalue: "p_adjust_hyper"
        # or binomial test result: fold_enrichment
        effect_size: "fold_enrichment_hyper"
        # or binomial test result: observed_gene_hits
        overlap: "observed_region_hits"
        term: "description"
    LOLA:
        top_n: 25
        p_value: "pValue"
        adj_pvalue: "qValue"
        effect_size: "oddsRatio"
        overlap: "support"
        term: "description"
    pycisTarget:
        top_n: 25
        p_value: "AUC"
        adj_pvalue: "NES"
        # NES combines statistical significance and effect size
        effect_size: "NES"
        overlap: "Motif_hits"
        # a combination of the motif name and the first entry of the "annotations_to_use" list above
        term: "description"
    RcisTarget:
        top_n: 25
        p_value: "AUC"
        adj_pvalue: "NES"
        # NES combines statistical significance and effect size
        effect_size: "NES"
        overlap: "nEnrGenes"
        # a combination of the motif name and the motifAnnot_highConfCat entries
        term: "description"

##### AGGREGATE & SUMMARIZE #####

# adjusted p-value threshold per tool to denote statistical significance
# mapping of tool name -> significance threshold
adjp_th:
    ORA_GSEApy: 0.05
    preranked_GSEApy: 0.05
    GREAT: 0.01
    LOLA: 0.01
    # for both cisTarget tools the adj_pvalue column is configured as "NES" in column_names,
    # so the threshold is a minimum score rather than a maximum adjusted p-value
    pycisTarget: 5 # keep results greater(!) than provided threshold
    RcisTarget: 5 # keep results greater(!) than provided threshold

# number of top terms per feature set within each group for all overview plots (adjusted p-value, effect-size and bubble-heatmap)
top_terms_n: 5

# cap for adjusted p-value plotting: -log10(adjusted p-value) > adjp_cap -> adjp_cap
adjp_cap: 4

# cap for log2 odds ratio plotting: abs(log2(odds ratio)) > or_cap -> sign(log2(odds ratio)) * or_cap
or_cap: 5

# cap for normalized enrichment scores (NES): abs(nes) > nes_cap -> sign(nes) * nes_cap
# applied only to preranked_GSEApy
nes_cap: 5

# hierarchical clustering flag for summary plots (0=no; 1=yes)
cluster_summary: 1