Merge pull request #3 from fmfi-compbio/test_case

xsitarcik · web-flow · commit 33c83647a7c0 · 2022-11-25T16:01:11.000+01:00
Added test case
diff --git a/run_test_case.sh b/run_test_case.sh
@@ -0,0 +1,15 @@
+#~/bin/bash
+
+echo Running WarpSTR test run case
+echo Please provide path to the GRCh38 reference, i.e. /path/GRCh38.fa
+read REF
+echo Please provide path to the guppy executable, i.e. /path/guppy_basecaller
+read GUPPY
+
+cp 'test/config_template.yaml' 'test/config_test.yaml'
+sed -i "s|PLACEHOLDER_REFERENCE|$REF|" 'test/config_test.yaml'
+sed -i "s|PLACEHOLDER_GUPPY|$GUPPY|" 'test/config_test.yaml'
+
+echo "Running WarpSTR with config: 'test/config_test.yaml'"
+echo "... This could take approx. 3-5 minutes ..."
+python WarpSTR.py test/config_test.yaml 2> test/log.err
diff --git a/src/dtw_automata/overview.py b/src/dtw_automata/overview.py
@@ -9,7 +9,6 @@ def store_collapsed(results, units, rep_units, reverse_lst, locus_path):
     Stores results as given by repeat units
     """
     preds = {}
-    print(rep_units)
     for idx, i in enumerate(units):
         if len(results[0][idx]) > 1:
             main = "main_"+rep_units[idx][0]
diff --git a/src/dtw_automata/plotter.py b/src/dtw_automata/plotter.py
@@ -23,7 +23,6 @@ def plot_collapsed(df_overview, locus_path):
     plt.close()
 
     df_template = df_overview[df_overview["reverse"] == False]
-    print(df_template)
     df_reverse = df_overview[df_overview["reverse"] == True]
     out_path = os.path.join(locus_path, tmpl.SUMMARY_SUBDIR,
                             "collapsed_predictions_strand.svg")
diff --git a/src/extractor/read_extractor.py b/src/extractor/read_extractor.py
@@ -62,15 +62,14 @@ def extract_from_multifast5s(self, out_path):
         dbg_msg = "There are {num} in {path}".format(
             num=len(fast5names_lst), path=self.path)
         handle_msg_dbg(dbg_msg)
-
         # find all multi fast5 files and extract only single fast5s
         for dirpath, dirnames, filenames in os.walk(self.path):
             for filename in [f for f in filenames if f.endswith(".fast5")]:
                 fast5path = os.path.join(dirpath, filename)
                 fast5file = h5py.File(fast5path, 'r')
-                found = False
 
                 # check if read exists in this multi fast5 file
+                found_lst = [] 
                 for readname in fast5names_lst:
                     if readname in fast5file:
                         new_path = os.path.join(
@@ -82,11 +81,8 @@ def extract_from_multifast5s(self, out_path):
                                 err=e, read=readname, path=fast5file)
                             handle_msg_err(err_msg)
 
-                        r = readname
-                        found = True
-                        break
-                if found:
-                    fast5names_lst.remove(r)
+                        found_lst.append(readname)
+                fast5names_lst = [name for name in fast5names_lst if name not in found_lst]
 
             if len(fast5names_lst) == 0:
                 dbg_msg = "All raw fast5 files have been successfully extracted"
diff --git a/src/guppy_annotate/guppy_annotate.sh b/src/guppy_annotate/guppy_annotate.sh
@@ -17,15 +17,15 @@ THREADS=$6;
 SAVEPATH=$SAMPLEPATH/"aux"
 mkdir -p $SAVEPATH
 
-
-echo "calling script $GUPPY --input_path $SAMPLEPATH --save_path $SAVEPATH --flowcell $FLOWCELL --kit $KIT --fast5_out --cpu_threads_per_caller $THREADS"
-( $GUPPY --input_path $SAMPLEPATH --save_path $SAVEPATH --flowcell $FLOWCELL --kit $KIT --fast5_out --cpu_threads_per_caller $THREADS)
+echo "Running Guppy..."
+>&2 echo "calling script $GUPPY --input_path $SAMPLEPATH --save_path $SAVEPATH --flowcell $FLOWCELL --kit $KIT --fast5_out --cpu_threads_per_caller $THREADS"
+( $GUPPY --input_path $SAMPLEPATH --save_path $SAVEPATH --flowcell $FLOWCELL --kit $KIT --fast5_out --cpu_threads_per_caller $THREADS 1>&2)
 
 WORKSPACEPATH=$SAVEPATH/"workspace"
 ANNOTPATH=$SAMPLEPATH/$OUT_SUFFIX
 mkdir -p $ANNOTPATH
 
-echo "find $WORKSPACEPATH -maxdepth 1 -name "*.fast5" -print0 | xargs  -0 cp -t $ANNOTPATH;"
+>&2 echo "find $WORKSPACEPATH -maxdepth 1 -name "*.fast5" -print0 | xargs  -0 cp -t $ANNOTPATH;"
 find $WORKSPACEPATH -maxdepth 1 -name "*.fast5" -print0 | xargs  -0 cp -t $ANNOTPATH;
 
 rm -r $SAVEPATH
diff --git a/src/report/genotyping.py b/src/report/genotyping.py
@@ -125,6 +125,8 @@ def run_genotyping_overview(overview, locus_path, config, muscle_path):
         f.write("{a1},{a1f},{a2},{a2f},".format(a1=alleles[0],a1f=alleles[1],a2=alleles[2],a2f=alleles[3]))
         f.write("{b1},{b1f},{b2},{b2f}".format(b1=alleles_bc[0],b1f=alleles_bc[1],\
                                                          b2=alleles_bc[2],b2f=alleles_bc[3]))
+    print(f"Allele lengths as given by WarpSTR: {alleles[0]},{alleles[2]}, frequency: {alleles[1]},{alleles[3]}")
+    print(f"Allele lengths as given by basecall: {alleles_bc[0]},{alleles_bc[2]}, frequency: {alleles_bc[1]},{alleles_bc[3]}")
     if config['visualize']:
         img_path = os.path.join(locus_path,tmpl.SUMMARY_SUBDIR,"alleles.svg")
         vals = (gmm_out_dict["group1"],gmm_out_dict["group2"])
diff --git a/src/report/reporting.py b/src/report/reporting.py
@@ -330,7 +330,8 @@ def create_report(main_out_path, tr_results_seq,locus,locus_path,src_path):
 
             f.write("<h2>Report for "+locus['name']+"</h2>")
             f.write(create_html_table('summaryTable',summary_df,'summarytable'))      
-            f.write("<h2>Locus reference: "+locus['noting']+"</h2>")
+            if 'noting' in locus:
+                f.write("<h2>Locus reference: "+locus['noting']+"</h2>")
             f.write("<h2>"+reference+"</h2>")
             f.write("<h2>TR length: "+str(len(reference))+"</h2>")
             f.write("<h2 id='input_sequence'>Our input sequence: "+locus['sequence']+"</h2>")
diff --git a/test/config_template.yaml b/test/config_template.yaml
@@ -0,0 +1,35 @@
+reference_path: PLACEHOLDER_REFERENCE # path to the reference, that was used to obtain SAM/BAM
+output: test/test_output   # Where everything will be outputted
+
+# List of input files. Each new entry starts with dash "-".
+# This signals where to find input data like .bam and .fast5
+inputs:                       
+  - path: test/test_input
+    runs: test_run1
+
+# if you are re-running the analysis, here you can set which steps to skip by setting them to False
+single_read_extraction: True   # Extracts reads mapped to the locus and stores them in single .fast5 format
+guppy_annotation:       True   # Annotates .fast5 files with mapping between basecalled sequence and the signal
+exp_signal_generation:  True   # Generates expected signals for flanks and repeats
+tr_region_extraction:   True   # Finds tandem repeat region in read using alignment of basecalled sequence and reference repeat sequence
+tr_region_calling:      True   # Uses state automata with DTW alignment to find the number of repeats for each signal
+genotyping:             True   # Predicts the final allele lengths from the predicted repeat numbers of each read 
+
+# When Guppy is used (i.e. guppy_annotation is True), the path to the Guppy executable and the flowcell and kit of the .fast5 files must be provided
+guppy_config:
+    path: PLACEHOLDER_GUPPY
+    flowcell: FLO-MIN106
+    kit: SQK-LSK109
+
+# Provide a configuration for the desired loci here.
+# Each locus must be defined by its name and genomic coordinates.
+# The input sequence for the state automata is defined in the 'sequence' element.
+# If it is not set, it is derived automatically from the patterns in the 'motif' element and the reference repeat region.
+# Therefore, either the 'motif' patterns or the 'sequence' must be defined.
+loci:
+  - name: Human_STR_1108232
+    coord: chr4:183178378-183178421
+    # noting: AAAT[11]
+    # motif: AAAT
+    sequence: (AAAT)
+    # flank_length: 110
diff --git a/test/test_input/test_run1/fast5s/batch_0.fast5 b/test/test_input/test_run1/fast5s/batch_0.fast5
diff --git a/test/test_input/test_run1/mapping/mapping.bam b/test/test_input/test_run1/mapping/mapping.bam
diff --git a/test/test_input/test_run1/mapping/mapping.bam.bai b/test/test_input/test_run1/mapping/mapping.bam.bai