minor

dmnfarrell · dmnfarrell · commit 346330cbb3ea · 2023-12-15T15:58:02.000Z
diff --git a/CHANGES b/CHANGES
@@ -8,10 +8,11 @@ CHANGES
 * fix to pair order in sample names when pivoted
 * fix to keeping unmapped reads when not needed
 * check samples match a previous raw.bcf file when running
-* re-added wgmlst code
 * plugin system for gui
 * gui - contam plugin added
 * gui - testing plugin
+* added manifest file option to cmd line
+* removed get samples from bams during workflow, redundant
 
 0.5.0
 -----
diff --git a/README.md b/README.md
@@ -189,11 +189,11 @@ data/
     └── ERR1588785_2.fastq.gz
 ```
 
-Filenames are parsed and a sample name is extracted for each pair (if paired end). This is simply done by splitting on the _ symbol. So a file called /path/13-11594_S85_L001-4_R1_001.fastq.gz will be given a sample name 13-11594. As long as the sample names are unique this is ok. If you had a file names like A_2_L001-4_R1_001, A_3_L001-4_R1_001 you should split on '-' instead. You can specify this in the labelsep option. The workflow won't run unless sample names are unique.
+Filenames are parsed and a sample name is extracted for each pair (if paired end). By default this is done by splitting on the `_` symbol. So a file called /path/13-11594_S85_L001-4_R1_001.fastq.gz will be given the sample name 13-11594. As long as the sample names are unique this is ok. If you had file names like A_2_L001-4_R1_001, A_3_L001-4_R1_001 you should split on '-' instead. You can specify this in the `labelsep` option. The workflow won't run unless sample names are unique.
 
 ### Manifest file
 
-You can use a manifest file (-M) with the sample names and files in a table if parsing folders won't work for you. This could be useful if your files have non-unique names but are in different subfolders. This overrides the `input` option. The format of the file is below. You should give the full path of each file. Sample names have to be unique.
+You can use a manifest file (-M) with the sample names and files in a table if parsing folders won't work for you. This could be useful if your files have non-unique names but are in different subfolders. This overrides the `input` option. The format of the file is below. You should give the full path of each file. Sample names have to be unique, and you should provide an entry for every sample you want to run.
 
 ```
 sample,filename1,filename2
@@ -232,7 +232,7 @@ import snipgenie
 args = {'threads':8, 'outdir': 'results', 'labelsep':'-',
         'input':['/my/folder/',
                  '/my/other/folder'],
-        'reference': None, 'overwrite':False}
+        'reference': 'sequence.fa', 'overwrite':False}
 W = snipgenie.app.WorkFlow(**args)
 W.setup()
 W.run()
@@ -250,7 +250,7 @@ You can view a short video on using the GUI [here](https://www.youtube.com/watch
 
 _The run was stopped during execution, can it be resumed?_
 
-Yes, by default the program won't overwrite intermediate files when re-run. So just run it again. Make sure there are no old tmp.****.bam files in the mapped folder if an alignment got interrupted.
+Yes, by default the program won't overwrite intermediate files when re-run. So just run it again. Make sure there are no old tmp.****.bam files in the mapped folder if an alignment got interrupted. Variant calling can't be resumed though, so it will have to start again if interrupted.
 
 _My sample files are not being parsed properly._
 
diff --git a/notebooks/testing.ipynb b/notebooks/testing.ipynb
@@ -23,7 +23,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {
     "tags": []
    },
@@ -1835,249 +1835,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The following options were supplied\n",
-      "time:  10/12/2023 13:23:46\n",
-      "-------\n",
-      "threads : 12\n",
-      "outdir : test_results\n",
-      "labelsep : .\n",
-      "manifest : testsamples.txt\n",
-      "reference : /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa\n",
-      "overwrite : False\n",
-      "filters : \n",
-      "custom_filters : False\n",
-      "get_stats : False\n",
-      "labelindex : 0\n",
-      "trim : False\n",
-      "unmapped : False\n",
-      "quality : 25\n",
-      "aligner : bwa\n",
-      "platform : illumina\n",
-      "species : None\n",
-      "mask : None\n",
-      "gb_file : None\n",
-      "omit_samples : []\n",
-      "buildtree : False\n",
-      "bootstraps : 100\n",
-      "logfile : test_results/run.log\n",
-      "\n",
-      "using manifest file for samples\n",
-      "3 samples were loaded:\n",
-      "----------------------\n",
-      "  sample                                          filename1                                          filename2  read_length\n",
-      "0      B  /home/farrell/gitprojects/snipgenie/notebooks/...  /home/farrell/gitprojects/snipgenie/notebooks/...          150\n",
-      "1      A  /home/farrell/gitprojects/snipgenie/notebooks/...  /home/farrell/gitprojects/snipgenie/notebooks/...          150\n",
-      "2      C  /home/farrell/gitprojects/snipgenie/notebooks/...  /home/farrell/gitprojects/snipgenie/notebooks/...          150\n",
-      "\n",
-      "building index\n",
-      "indexing..\n",
-      "bwa index /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[bwa_index] Pack FASTA... 0.00 sec\n",
-      "[bwa_index] Construct BWT for the packed sequence...\n",
-      "[bwa_index] 0.01 seconds elapse.\n",
-      "[bwa_index] Update BWT... 0.00 sec\n",
-      "[bwa_index] Pack forward-only FASTA... 0.00 sec\n",
-      "[bwa_index] Construct SA from BWT and Occ... 0.00 sec\n",
-      "[main] Version: 0.7.17-r1188\n",
-      "[main] CMD: bwa index /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa\n",
-      "[main] Real time: 0.014 sec; CPU: 0.009 sec\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "aligning files\n",
-      "--------------\n",
-      "Using reference genome: /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa\n",
-      "0/3 samples already aligned\n",
-      "bwa mem -M -t 12  /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S1/cleaned_1.fastq.gz\" \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S1/cleaned_2.fastq.gz\" | samtools view -F 4 -bt - | samtools sort -o test_results/mapped/B.bam\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[M::bwa_idx_load_from_disk] read 0 ALT contigs\n",
-      "[M::process] read 489878 sequences (73481700 bp)...\n",
-      "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 244939, 0, 0)\n",
-      "[M::mem_pestat] skip orientation FF as there are not enough pairs\n",
-      "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n",
-      "[M::mem_pestat] (25, 50, 75) percentile: (300, 300, 300)\n",
-      "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (300, 300)\n",
-      "[M::mem_pestat] mean and std.dev: (300.00, 0.00)\n",
-      "[M::mem_pestat] low and high boundaries for proper pairs: (300, 300)\n",
-      "[M::mem_pestat] skip orientation RF as there are not enough pairs\n",
-      "[M::mem_pestat] skip orientation RR as there are not enough pairs\n",
-      "[M::mem_process_seqs] Processed 489878 reads in 11.092 CPU sec, 0.970 real sec\n",
-      "[main] Version: 0.7.17-r1188\n",
-      "[main] CMD: bwa mem -M -t 12 /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S1/cleaned_1.fastq.gz /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S1/cleaned_2.fastq.gz\n",
-      "[main] Real time: 1.997 sec; CPU: 11.784 sec\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "indexing B\n",
-      "samtools index test_results/mapped/B.bam\n",
-      "bwa mem -M -t 12  /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S2/cleaned_1.fastq.gz\" \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S2/cleaned_2.fastq.gz\" | samtools view -F 4 -bt - | samtools sort -o test_results/mapped/A.bam\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/farrell/gitprojects/snipgenie/snipgenie/app.py:371: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '/home/farrell/gitprojects/snipgenie/notebooks/test_results/mapped/B.bam' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.\n",
-      "  df.loc[i,'bam_file'] = os.path.abspath(out)\n",
-      "[M::bwa_idx_load_from_disk] read 0 ALT contigs\n",
-      "[M::process] read 503586 sequences (75537900 bp)...\n",
-      "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 251793, 0, 0)\n",
-      "[M::mem_pestat] skip orientation FF as there are not enough pairs\n",
-      "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n",
-      "[M::mem_pestat] (25, 50, 75) percentile: (300, 300, 300)\n",
-      "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (300, 300)\n",
-      "[M::mem_pestat] mean and std.dev: (300.00, 0.00)\n",
-      "[M::mem_pestat] low and high boundaries for proper pairs: (300, 300)\n",
-      "[M::mem_pestat] skip orientation RF as there are not enough pairs\n",
-      "[M::mem_pestat] skip orientation RR as there are not enough pairs\n",
-      "[M::mem_process_seqs] Processed 503586 reads in 11.380 CPU sec, 0.996 real sec\n",
-      "[main] Version: 0.7.17-r1188\n",
-      "[main] CMD: bwa mem -M -t 12 /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S2/cleaned_1.fastq.gz /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S2/cleaned_2.fastq.gz\n",
-      "[main] Real time: 2.092 sec; CPU: 12.114 sec\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "indexing A\n",
-      "samtools index test_results/mapped/A.bam\n",
-      "bwa mem -M -t 12  /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S3/cleaned_1.fastq.gz\" \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S3/cleaned_2.fastq.gz\" | samtools view -F 4 -bt - | samtools sort -o test_results/mapped/C.bam\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[M::bwa_idx_load_from_disk] read 0 ALT contigs\n",
-      "[M::process] read 504200 sequences (75630000 bp)...\n",
-      "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 252100, 0, 0)\n",
-      "[M::mem_pestat] skip orientation FF as there are not enough pairs\n",
-      "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n",
-      "[M::mem_pestat] (25, 50, 75) percentile: (300, 300, 300)\n",
-      "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (300, 300)\n",
-      "[M::mem_pestat] mean and std.dev: (300.00, 0.00)\n",
-      "[M::mem_pestat] low and high boundaries for proper pairs: (300, 300)\n",
-      "[M::mem_pestat] skip orientation RF as there are not enough pairs\n",
-      "[M::mem_pestat] skip orientation RR as there are not enough pairs\n",
-      "[M::mem_process_seqs] Processed 504200 reads in 11.595 CPU sec, 1.008 real sec\n",
-      "[main] Version: 0.7.17-r1188\n",
-      "[main] CMD: bwa mem -M -t 12 /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S3/cleaned_1.fastq.gz /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S3/cleaned_2.fastq.gz\n",
-      "[main] Real time: 2.087 sec; CPU: 12.314 sec\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "indexing C\n",
-      "samtools index test_results/mapped/C.bam\n",
-      "\n",
-      "calling variants\n",
-      "----------------\n",
-      "running mpileup for 3 files..\n",
-      "[    1  2492  4984  7476  9968 12460 14952 17443 19935 22427 24919 27411\n",
-      " 29903]\n",
-      "parallel bcftools mpileup -r {1} -a \"AD,ADF,ADR,DP,SP,INFO/AD,INFO/ADF,INFO/ADR\" -O b --max-depth 500 --min-MQ 60 -o {2} -f /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa /home/farrell/gitprojects/snipgenie/notebooks/test_results/mapped/B.bam /home/farrell/gitprojects/snipgenie/notebooks/test_results/mapped/A.bam /home/farrell/gitprojects/snipgenie/notebooks/test_results/mapped/C.bam ::: \"NC_045512.2\":1-2491 \"NC_045512.2\":2492-4983 \"NC_045512.2\":4984-7475 \"NC_045512.2\":7476-9967 \"NC_045512.2\":9968-12459 \"NC_045512.2\":12460-14951 \"NC_045512.2\":14952-17442 \"NC_045512.2\":17443-19934 \"NC_045512.2\":19935-22426 \"NC_045512.2\":22427-24918 \"NC_045512.2\":24919-27410 \"NC_045512.2\":27411-29902 :::+ test_results/tmp/1-2491.bcf test_results/tmp/2492-4983.bcf test_results/tmp/4984-7475.bcf test_results/tmp/7476-9967.bcf test_results/tmp/9968-12459.bcf test_results/tmp/12460-14951.bcf test_results/tmp/14952-17442.bcf test_results/tmp/17443-19934.bcf test_results/tmp/19935-22426.bcf test_results/tmp/22427-24918.bcf test_results/tmp/24919-27410.bcf test_results/tmp/27411-29902.bcf\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "[mpileup] 3 samples in 3 input files\n",
-      "[mpileup] maximum number of reads per input file set to -d 500\n",
-      "Checking the headers and starting positions of 12 files\n",
-      "Concatenating test_results/tmp/1-2491.bcf\t0.009546 seconds\n",
-      "Concatenating test_results/tmp/2492-4983.bcf\t0.004426 seconds\n",
-      "Concatenating test_results/tmp/4984-7475.bcf\t0.004128 seconds\n",
-      "Concatenating test_results/tmp/7476-9967.bcf\t0.003428 seconds\n",
-      "Concatenating test_results/tmp/9968-12459.bcf\t0.003823 seconds\n",
-      "Concatenating test_results/tmp/12460-14951.bcf\t0.003815 seconds\n",
-      "Concatenating test_results/tmp/14952-17442.bcf\t0.004609 seconds\n",
-      "Concatenating test_results/tmp/17443-19934.bcf\t0.004330 seconds\n",
-      "Concatenating test_results/tmp/19935-22426.bcf\t0.003555 seconds\n",
-      "Concatenating test_results/tmp/22427-24918.bcf\t0.003998 seconds\n",
-      "Concatenating test_results/tmp/24919-27410.bcf\t0.004125 seconds\n",
-      "Concatenating test_results/tmp/27411-29902.bcf\t0.003791 seconds\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "bcftools concat test_results/tmp/1-2491.bcf test_results/tmp/2492-4983.bcf test_results/tmp/4984-7475.bcf test_results/tmp/7476-9967.bcf test_results/tmp/9968-12459.bcf test_results/tmp/12460-14951.bcf test_results/tmp/14952-17442.bcf test_results/tmp/17443-19934.bcf test_results/tmp/19935-22426.bcf test_results/tmp/22427-24918.bcf test_results/tmp/24919-27410.bcf test_results/tmp/27411-29902.bcf -O b -o test_results/raw.bcf\n",
-      "calling variants..\n",
-      "bcftools call --ploidy 1 -m -v -o test_results/calls.vcf test_results/raw.bcf\n",
-      "47 sites called as variants\n",
-      "bcftools reheader --samples test_results/samples.txt -o /tmp/calls.vcf test_results/calls.vcf\n",
-      "bcftools filter -i \"\" -o test_results/filtered.vcf.gz -O z test_results/calls.vcf\n",
-      "splitting snps and indels..\n",
-      "bcftools view -v snps -o test_results/snps.vcf.gz -O z test_results/filtered.vcf.gz\n",
-      "bcftools view -v indels -o test_results/indels.vcf.gz -O z test_results/filtered.vcf.gz\n",
-      "took 1.0 seconds\n",
-      "test_results/snps.vcf.gz\n",
-      "\n",
-      "making SNP matrix\n",
-      "-----------------\n",
-      "found 42 sites for core snps\n",
-      "0 sites with at least one missing sample\n",
-      "5 uninformative sites\n",
-      "\n",
-      "Done. Sample summary:\n",
-      "---------------------\n",
-      "3 samples processed\n",
-      "\n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "reload(app)\n",
     "ref = app.sarscov2_genome\n",
@@ -2093,6 +1853,25 @@
     "W.run()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reload(app)\n",
+    "ref = app.sarscov2_genome\n",
+    "args = {'threads':12, 'outdir': '/home/farrell/espinoza/results', #'labelsep':'.',           \n",
+    "        'manifest': '/home/farrell/espinoza/reads_table.DENV2.csv',\n",
+    "        'reference': '/home/farrell/espinoza/DENV2.fa',\n",
+    "        'overwrite':False,\n",
+    "        'filters':'',\n",
+    "        'custom_filters': False}\n",
+    "W = app.WorkFlow(**args)\n",
+    "st = W.setup()\n",
+    "W.run()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/requirements.txt b/requirements.txt
@@ -2,6 +2,6 @@ pandas
 numpy
 biopython
 matplotlib
-pyvcf
+pyvcf3
 pyfaidx
-pyside2
+pyside2
diff --git a/snipgenie/app.py b/snipgenie/app.py
@@ -1066,12 +1066,6 @@ def main():
                         help="input folder(s)", metavar="FILE")
     parser.add_argument("-M", "--manifest", dest="manifest", default=None,
                         help="manifest file with samples, optional - overrides input", metavar="FILE")
-    #parser.add_argument("-l", "--labels", dest="labels", default=[],
-    #                    help="sample labels file, optional", metavar="FILE")
-    parser.add_argument("-e", "--labelsep", dest="labelsep", default='_',
-                        help="symbol to split the sample labels on")
-    parser.add_argument("-x", "--labelindex", dest="labelindex", default=0,
-                        help="position to extract label in split filenames")
     parser.add_argument("-r", "--reference", dest="reference", default=None,
                         help="reference genome filename", metavar="FILE")
     parser.add_argument("-S", "--species", dest="species", default=None,
@@ -1081,6 +1075,10 @@ def main():
                         help="annotation file, optional", metavar="FILE")
     parser.add_argument("-t", "--threads", dest="threads", default=4,
                         help="cpu threads to use")
+    parser.add_argument("-e", "--labelsep", dest="labelsep", default='_',
+                        help="symbol to split the sample labels on if parsing filenames")
+    parser.add_argument("-x", "--labelindex", dest="labelindex", default=0,
+                        help="position to extract label in split filenames")
     parser.add_argument("-w", "--overwrite", dest="overwrite", action="store_true", default=False,
                         help="overwrite intermediate files")
     parser.add_argument("-T", "--trim", dest="trim", action="store_true", default=False,