Skip to content

Commit 33c8364

Browse files
authored
Merge pull request #3 from fmfi-compbio/test_case
Added test case
2 parents 22b2549 + 0f38971 commit 33c8364

File tree

11 files changed

+61
-14
lines changed

11 files changed

+61
-14
lines changed

run_test_case.sh

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#~/bin/bash
2+
3+
echo Running WarpSTR test run case
4+
echo Please provide path to the GRCh38 reference, i.e. /path/GRCh38.fa
5+
read REF
6+
echo Please provide path to the guppy executable, i.e. /path/guppy_basecaller
7+
read GUPPY
8+
9+
cp 'test/config_template.yaml' 'test/config_test.yaml'
10+
sed -i "s|PLACEHOLDER_REFERENCE|$REF|" 'test/config_test.yaml'
11+
sed -i "s|PLACEHOLDER_GUPPY|$GUPPY|" 'test/config_test.yaml'
12+
13+
echo "Running WarpSTR with config: 'test/config_test.yaml'"
14+
echo "... This could take approx. 3-5 minutes ..."
15+
python WarpSTR.py test/config_test.yaml 2> test/log.err

src/dtw_automata/overview.py

-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ def store_collapsed(results, units, rep_units, reverse_lst, locus_path):
99
Stores results as given by repeat units
1010
"""
1111
preds = {}
12-
print(rep_units)
1312
for idx, i in enumerate(units):
1413
if len(results[0][idx]) > 1:
1514
main = "main_"+rep_units[idx][0]

src/dtw_automata/plotter.py

-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ def plot_collapsed(df_overview, locus_path):
2323
plt.close()
2424

2525
df_template = df_overview[df_overview["reverse"] == False]
26-
print(df_template)
2726
df_reverse = df_overview[df_overview["reverse"] == True]
2827
out_path = os.path.join(locus_path, tmpl.SUMMARY_SUBDIR,
2928
"collapsed_predictions_strand.svg")

src/extractor/read_extractor.py

+3-7
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,14 @@ def extract_from_multifast5s(self, out_path):
6262
dbg_msg = "There are {num} in {path}".format(
6363
num=len(fast5names_lst), path=self.path)
6464
handle_msg_dbg(dbg_msg)
65-
6665
# find all multi fast5 files and extract only single fast5s
6766
for dirpath, dirnames, filenames in os.walk(self.path):
6867
for filename in [f for f in filenames if f.endswith(".fast5")]:
6968
fast5path = os.path.join(dirpath, filename)
7069
fast5file = h5py.File(fast5path, 'r')
71-
found = False
7270

7371
# check if read exists in this multi fast5 file
72+
found_lst = []
7473
for readname in fast5names_lst:
7574
if readname in fast5file:
7675
new_path = os.path.join(
@@ -82,11 +81,8 @@ def extract_from_multifast5s(self, out_path):
8281
err=e, read=readname, path=fast5file)
8382
handle_msg_err(err_msg)
8483

85-
r = readname
86-
found = True
87-
break
88-
if found:
89-
fast5names_lst.remove(r)
84+
found_lst.append(readname)
85+
fast5names_lst = [name for name in fast5names_lst if name not in found_lst]
9086

9187
if len(fast5names_lst) == 0:
9288
dbg_msg = "All raw fast5 files have been successfully extracted"

src/guppy_annotate/guppy_annotate.sh

+4-4
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@ THREADS=$6;
1717
SAVEPATH=$SAMPLEPATH/"aux"
1818
mkdir -p $SAVEPATH
1919

20-
21-
echo "calling script $GUPPY --input_path $SAMPLEPATH --save_path $SAVEPATH --flowcell $FLOWCELL --kit $KIT --fast5_out --cpu_threads_per_caller $THREADS"
22-
( $GUPPY --input_path $SAMPLEPATH --save_path $SAVEPATH --flowcell $FLOWCELL --kit $KIT --fast5_out --cpu_threads_per_caller $THREADS)
20+
echo "Running Guppy..."
21+
>&2 echo "calling script $GUPPY --input_path $SAMPLEPATH --save_path $SAVEPATH --flowcell $FLOWCELL --kit $KIT --fast5_out --cpu_threads_per_caller $THREADS"
22+
( $GUPPY --input_path $SAMPLEPATH --save_path $SAVEPATH --flowcell $FLOWCELL --kit $KIT --fast5_out --cpu_threads_per_caller $THREADS 1>&2)
2323

2424
WORKSPACEPATH=$SAVEPATH/"workspace"
2525
ANNOTPATH=$SAMPLEPATH/$OUT_SUFFIX
2626
mkdir -p $ANNOTPATH
2727

28-
echo "find $WORKSPACEPATH -maxdepth 1 -name "*.fast5" -print0 | xargs -0 cp -t $ANNOTPATH;"
28+
>&2 echo "find $WORKSPACEPATH -maxdepth 1 -name "*.fast5" -print0 | xargs -0 cp -t $ANNOTPATH;"
2929
find $WORKSPACEPATH -maxdepth 1 -name "*.fast5" -print0 | xargs -0 cp -t $ANNOTPATH;
3030

3131
rm -r $SAVEPATH

src/report/genotyping.py

+2
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,8 @@ def run_genotyping_overview(overview, locus_path, config, muscle_path):
125125
f.write("{a1},{a1f},{a2},{a2f},".format(a1=alleles[0],a1f=alleles[1],a2=alleles[2],a2f=alleles[3]))
126126
f.write("{b1},{b1f},{b2},{b2f}".format(b1=alleles_bc[0],b1f=alleles_bc[1],\
127127
b2=alleles_bc[2],b2f=alleles_bc[3]))
128+
print(f"Allele lengths as given by WarpSTR: {alleles[0]},{alleles[2]}, frequency: {alleles[1]},{alleles[3]}")
129+
print(f"Allele lengths as given by basecall: {alleles_bc[0]},{alleles_bc[2]}, frequency: {alleles_bc[1]},{alleles_bc[3]}")
128130
if config['visualize']:
129131
img_path = os.path.join(locus_path,tmpl.SUMMARY_SUBDIR,"alleles.svg")
130132
vals = (gmm_out_dict["group1"],gmm_out_dict["group2"])

src/report/reporting.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,8 @@ def create_report(main_out_path, tr_results_seq,locus,locus_path,src_path):
330330

331331
f.write("<h2>Report for "+locus['name']+"</h2>")
332332
f.write(create_html_table('summaryTable',summary_df,'summarytable'))
333-
f.write("<h2>Locus reference: "+locus['noting']+"</h2>")
333+
if 'noting' in locus:
334+
f.write("<h2>Locus reference: "+locus['noting']+"</h2>")
334335
f.write("<h2>"+reference+"</h2>")
335336
f.write("<h2>TR length: "+str(len(reference))+"</h2>")
336337
f.write("<h2 id='input_sequence'>Our input sequence: "+locus['sequence']+"</h2>")

test/config_template.yaml

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
reference_path: PLACEHOLDER_REFERENCE # path to the reference, that was used to obtain SAM/BAM
2+
output: test/test_output # Where everything will be outputted
3+
4+
# List of input files. Each new entry starts with dash "-".
5+
# This signals where to find input data like .bam and .fast5
6+
inputs:
7+
- path: test/test_input
8+
runs: test_run1
9+
10+
# if you are re-running the analysis, here you can set which steps to skip by setting them to False
11+
single_read_extraction: True # Extracts reads mapped to the locus and stores them in single .fast5 format
12+
guppy_annotation: True # Annotates .fast5 files with mapping between basecalled sequence and the signal
13+
exp_signal_generation: True # Generates expected signals for flanks and repeats
14+
tr_region_extraction: True # Finds tandem repeat region in read using alignment of basecalled sequence and reference repeat sequence
15+
tr_region_calling: True # Uses state automata with DTW alignment to find the number of repeats for each signal
16+
genotyping: True # Predicts the final allele lengths from the predicted repeat numbers of each read
17+
18+
# in case of using GUPPY (i.e. guppy_annotation=True), path to Guppy must be provided and info about .fast5 files
19+
guppy_config:
20+
path: PLACEHOLDER_GUPPY
21+
flowcell: FLO-MIN106
22+
kit: SQK-LSK109
23+
24+
# here provide a configuration for desired loci
25+
# Each loci must be defined by name and genomic coordinates.
26+
# Input sequence for state automata is defined in 'sequence' element.
27+
# If not set, it will be automatically set using defined patterns in 'motif' element and the reference repeat region
28+
# Also, either 'motif' patterns must be defined or configured sequence
29+
loci:
30+
- name: Human_STR_1108232
31+
coord: chr4:183178378-183178421
32+
# noting: AAAT[11]
33+
# motif: AAAT
34+
sequence: (AAAT)
35+
# flank_length: 110
1.02 MB
Binary file not shown.
154 KB
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)