# config.ini for harvdev-gene-identifier (FlyBase/harvdev-gene-identifier).

[FLYBASE]
# Paths to the FlyBase input files.
# gene_synonyms: presumably the FlyBase synonym table (tab-separated, judging by the .tsv extension) - confirm.
gene_synonyms = /src/input/fb_synonym_latest.tsv
# current_genes: presumably the list of current Dmel/Hsap genes, judging by the file name - confirm.
current_genes = /src/input/currentDmelHsap.txt

[PUBMED]
# Path to the PMC-ids.csv file. A gzip-compressed copy can be downloaded from
# https://ftp.ncbi.nlm.nih.gov/pub/pmc/PMC-ids.csv.gz
# NOTE(review): the path below has no .gz suffix, so the download presumably has to be
# decompressed first - confirm.
PMC_ids = /src/input/PMC-ids.csv

[PICKLES]
# Paths to the pickle files. You can generate these files by running the script 'update_resources.py'.
# NOTE(review): these paths are relative, unlike the absolute /src/input paths in [FLYBASE] and
# [PUBMED]; what they are resolved against is not visible here - confirm.
PMC_ids_dict = pickles/PMC_ids_dict.pickle
gene_dict = pickles/gene_dict.pickle
fbid_to_symbol_dict = pickles/fbid_to_symbol_dict.pickle

[PATHS]
# The exceptions file contains a list of gene names that should not be tagged as genes.
exceptions = config/exceptions.txt
# Where downloaded papers are written.
corpus = corpus
# Where the XML paper files are written.
xml = xml
# File that will contain the output of the script.
output = /src/output/output.tsv
# Location of the deep learning model.
# NOTE(review): presumably only needed when use_deep_learning in [PARAMETERS] is true - confirm.
deep_learning_model = FlyBaseGeneAbstractClassifier/

[PARAMETERS]
# Politeness parameter: the time in seconds between two requests to the NCBI server.
# NCBI might block your IP if you make too many requests in a short time.
sleep_time_between_requests = 6
# Removes unnecessary downloaded files after processing them.
# NOTE(review): written as 'True' while the other flags in this section are lowercase; the value
# is left unchanged because the code that reads it is not visible here.
remove_files = True
# NOTE(review): presumably toggles reporting of gene occurrences in the output - confirm.
# The key name is spelled 'occurence' (single 'r'); keep it as is unless the reading code changes too.
output_gene_occurence = false
# Snippet type can be either 'short', 'long' or 'none'.
snippet_type = short
# Confidence is computed as a frequency. You can output the frequency of the gene in the paper
# (percentage of words in the paper that are the gene), or the frequency of the gene as a
# percentage of the total number of genes in the paper, or both. You can also output the raw
# number of occurrences of the gene in the paper. If you use deep learning, this setting will be
# ignored, and actual confidence scores will be output.
output_gene_frequency = false
output_word_frequency = false
output_raw_occurence = false
# If you want to use deep learning to predict the gene names, set this to true.
# It is slower but more accurate.
use_deep_learning = true