|
23 | 23 | },
|
24 | 24 | {
|
25 | 25 | "cell_type": "code",
|
26 |
| - "execution_count": 1, |
| 26 | + "execution_count": 2, |
27 | 27 | "metadata": {
|
28 | 28 | "tags": []
|
29 | 29 | },
|
|
1835 | 1835 | },
|
1836 | 1836 | {
|
1837 | 1837 | "cell_type": "code",
|
1838 |
| - "execution_count": 21, |
| 1838 | + "execution_count": null, |
1839 | 1839 | "metadata": {},
|
1840 |
| - "outputs": [ |
1841 |
| - { |
1842 |
| - "name": "stdout", |
1843 |
| - "output_type": "stream", |
1844 |
| - "text": [ |
1845 |
| - "The following options were supplied\n", |
1846 |
| - "time: 10/12/2023 13:23:46\n", |
1847 |
| - "-------\n", |
1848 |
| - "threads : 12\n", |
1849 |
| - "outdir : test_results\n", |
1850 |
| - "labelsep : .\n", |
1851 |
| - "manifest : testsamples.txt\n", |
1852 |
| - "reference : /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa\n", |
1853 |
| - "overwrite : False\n", |
1854 |
| - "filters : \n", |
1855 |
| - "custom_filters : False\n", |
1856 |
| - "get_stats : False\n", |
1857 |
| - "labelindex : 0\n", |
1858 |
| - "trim : False\n", |
1859 |
| - "unmapped : False\n", |
1860 |
| - "quality : 25\n", |
1861 |
| - "aligner : bwa\n", |
1862 |
| - "platform : illumina\n", |
1863 |
| - "species : None\n", |
1864 |
| - "mask : None\n", |
1865 |
| - "gb_file : None\n", |
1866 |
| - "omit_samples : []\n", |
1867 |
| - "buildtree : False\n", |
1868 |
| - "bootstraps : 100\n", |
1869 |
| - "logfile : test_results/run.log\n", |
1870 |
| - "\n", |
1871 |
| - "using manifest file for samples\n", |
1872 |
| - "3 samples were loaded:\n", |
1873 |
| - "----------------------\n", |
1874 |
| - " sample filename1 filename2 read_length\n", |
1875 |
| - "0 B /home/farrell/gitprojects/snipgenie/notebooks/... /home/farrell/gitprojects/snipgenie/notebooks/... 150\n", |
1876 |
| - "1 A /home/farrell/gitprojects/snipgenie/notebooks/... /home/farrell/gitprojects/snipgenie/notebooks/... 150\n", |
1877 |
| - "2 C /home/farrell/gitprojects/snipgenie/notebooks/... /home/farrell/gitprojects/snipgenie/notebooks/... 150\n", |
1878 |
| - "\n", |
1879 |
| - "building index\n", |
1880 |
| - "indexing..\n", |
1881 |
| - "bwa index /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa\n" |
1882 |
| - ] |
1883 |
| - }, |
1884 |
| - { |
1885 |
| - "name": "stderr", |
1886 |
| - "output_type": "stream", |
1887 |
| - "text": [ |
1888 |
| - "[bwa_index] Pack FASTA... 0.00 sec\n", |
1889 |
| - "[bwa_index] Construct BWT for the packed sequence...\n", |
1890 |
| - "[bwa_index] 0.01 seconds elapse.\n", |
1891 |
| - "[bwa_index] Update BWT... 0.00 sec\n", |
1892 |
| - "[bwa_index] Pack forward-only FASTA... 0.00 sec\n", |
1893 |
| - "[bwa_index] Construct SA from BWT and Occ... 0.00 sec\n", |
1894 |
| - "[main] Version: 0.7.17-r1188\n", |
1895 |
| - "[main] CMD: bwa index /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa\n", |
1896 |
| - "[main] Real time: 0.014 sec; CPU: 0.009 sec\n" |
1897 |
| - ] |
1898 |
| - }, |
1899 |
| - { |
1900 |
| - "name": "stdout", |
1901 |
| - "output_type": "stream", |
1902 |
| - "text": [ |
1903 |
| - "aligning files\n", |
1904 |
| - "--------------\n", |
1905 |
| - "Using reference genome: /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa\n", |
1906 |
| - "0/3 samples already aligned\n", |
1907 |
| - "bwa mem -M -t 12 /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S1/cleaned_1.fastq.gz\" \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S1/cleaned_2.fastq.gz\" | samtools view -F 4 -bt - | samtools sort -o test_results/mapped/B.bam\n" |
1908 |
| - ] |
1909 |
| - }, |
1910 |
| - { |
1911 |
| - "name": "stderr", |
1912 |
| - "output_type": "stream", |
1913 |
| - "text": [ |
1914 |
| - "[M::bwa_idx_load_from_disk] read 0 ALT contigs\n", |
1915 |
| - "[M::process] read 489878 sequences (73481700 bp)...\n", |
1916 |
| - "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 244939, 0, 0)\n", |
1917 |
| - "[M::mem_pestat] skip orientation FF as there are not enough pairs\n", |
1918 |
| - "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n", |
1919 |
| - "[M::mem_pestat] (25, 50, 75) percentile: (300, 300, 300)\n", |
1920 |
| - "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (300, 300)\n", |
1921 |
| - "[M::mem_pestat] mean and std.dev: (300.00, 0.00)\n", |
1922 |
| - "[M::mem_pestat] low and high boundaries for proper pairs: (300, 300)\n", |
1923 |
| - "[M::mem_pestat] skip orientation RF as there are not enough pairs\n", |
1924 |
| - "[M::mem_pestat] skip orientation RR as there are not enough pairs\n", |
1925 |
| - "[M::mem_process_seqs] Processed 489878 reads in 11.092 CPU sec, 0.970 real sec\n", |
1926 |
| - "[main] Version: 0.7.17-r1188\n", |
1927 |
| - "[main] CMD: bwa mem -M -t 12 /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S1/cleaned_1.fastq.gz /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S1/cleaned_2.fastq.gz\n", |
1928 |
| - "[main] Real time: 1.997 sec; CPU: 11.784 sec\n" |
1929 |
| - ] |
1930 |
| - }, |
1931 |
| - { |
1932 |
| - "name": "stdout", |
1933 |
| - "output_type": "stream", |
1934 |
| - "text": [ |
1935 |
| - "indexing B\n", |
1936 |
| - "samtools index test_results/mapped/B.bam\n", |
1937 |
| - "bwa mem -M -t 12 /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S2/cleaned_1.fastq.gz\" \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S2/cleaned_2.fastq.gz\" | samtools view -F 4 -bt - | samtools sort -o test_results/mapped/A.bam\n" |
1938 |
| - ] |
1939 |
| - }, |
1940 |
| - { |
1941 |
| - "name": "stderr", |
1942 |
| - "output_type": "stream", |
1943 |
| - "text": [ |
1944 |
| - "/home/farrell/gitprojects/snipgenie/snipgenie/app.py:371: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '/home/farrell/gitprojects/snipgenie/notebooks/test_results/mapped/B.bam' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.\n", |
1945 |
| - " df.loc[i,'bam_file'] = os.path.abspath(out)\n", |
1946 |
| - "[M::bwa_idx_load_from_disk] read 0 ALT contigs\n", |
1947 |
| - "[M::process] read 503586 sequences (75537900 bp)...\n", |
1948 |
| - "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 251793, 0, 0)\n", |
1949 |
| - "[M::mem_pestat] skip orientation FF as there are not enough pairs\n", |
1950 |
| - "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n", |
1951 |
| - "[M::mem_pestat] (25, 50, 75) percentile: (300, 300, 300)\n", |
1952 |
| - "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (300, 300)\n", |
1953 |
| - "[M::mem_pestat] mean and std.dev: (300.00, 0.00)\n", |
1954 |
| - "[M::mem_pestat] low and high boundaries for proper pairs: (300, 300)\n", |
1955 |
| - "[M::mem_pestat] skip orientation RF as there are not enough pairs\n", |
1956 |
| - "[M::mem_pestat] skip orientation RR as there are not enough pairs\n", |
1957 |
| - "[M::mem_process_seqs] Processed 503586 reads in 11.380 CPU sec, 0.996 real sec\n", |
1958 |
| - "[main] Version: 0.7.17-r1188\n", |
1959 |
| - "[main] CMD: bwa mem -M -t 12 /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S2/cleaned_1.fastq.gz /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S2/cleaned_2.fastq.gz\n", |
1960 |
| - "[main] Real time: 2.092 sec; CPU: 12.114 sec\n" |
1961 |
| - ] |
1962 |
| - }, |
1963 |
| - { |
1964 |
| - "name": "stdout", |
1965 |
| - "output_type": "stream", |
1966 |
| - "text": [ |
1967 |
| - "indexing A\n", |
1968 |
| - "samtools index test_results/mapped/A.bam\n", |
1969 |
| - "bwa mem -M -t 12 /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S3/cleaned_1.fastq.gz\" \"/home/farrell/gitprojects/snipgenie/notebooks/test_folder/S3/cleaned_2.fastq.gz\" | samtools view -F 4 -bt - | samtools sort -o test_results/mapped/C.bam\n" |
1970 |
| - ] |
1971 |
| - }, |
1972 |
| - { |
1973 |
| - "name": "stderr", |
1974 |
| - "output_type": "stream", |
1975 |
| - "text": [ |
1976 |
| - "[M::bwa_idx_load_from_disk] read 0 ALT contigs\n", |
1977 |
| - "[M::process] read 504200 sequences (75630000 bp)...\n", |
1978 |
| - "[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 252100, 0, 0)\n", |
1979 |
| - "[M::mem_pestat] skip orientation FF as there are not enough pairs\n", |
1980 |
| - "[M::mem_pestat] analyzing insert size distribution for orientation FR...\n", |
1981 |
| - "[M::mem_pestat] (25, 50, 75) percentile: (300, 300, 300)\n", |
1982 |
| - "[M::mem_pestat] low and high boundaries for computing mean and std.dev: (300, 300)\n", |
1983 |
| - "[M::mem_pestat] mean and std.dev: (300.00, 0.00)\n", |
1984 |
| - "[M::mem_pestat] low and high boundaries for proper pairs: (300, 300)\n", |
1985 |
| - "[M::mem_pestat] skip orientation RF as there are not enough pairs\n", |
1986 |
| - "[M::mem_pestat] skip orientation RR as there are not enough pairs\n", |
1987 |
| - "[M::mem_process_seqs] Processed 504200 reads in 11.595 CPU sec, 1.008 real sec\n", |
1988 |
| - "[main] Version: 0.7.17-r1188\n", |
1989 |
| - "[main] CMD: bwa mem -M -t 12 /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S3/cleaned_1.fastq.gz /home/farrell/gitprojects/snipgenie/notebooks/test_folder/S3/cleaned_2.fastq.gz\n", |
1990 |
| - "[main] Real time: 2.087 sec; CPU: 12.314 sec\n" |
1991 |
| - ] |
1992 |
| - }, |
1993 |
| - { |
1994 |
| - "name": "stdout", |
1995 |
| - "output_type": "stream", |
1996 |
| - "text": [ |
1997 |
| - "indexing C\n", |
1998 |
| - "samtools index test_results/mapped/C.bam\n", |
1999 |
| - "\n", |
2000 |
| - "calling variants\n", |
2001 |
| - "----------------\n", |
2002 |
| - "running mpileup for 3 files..\n", |
2003 |
| - "[ 1 2492 4984 7476 9968 12460 14952 17443 19935 22427 24919 27411\n", |
2004 |
| - " 29903]\n", |
2005 |
| - "parallel bcftools mpileup -r {1} -a \"AD,ADF,ADR,DP,SP,INFO/AD,INFO/ADF,INFO/ADR\" -O b --max-depth 500 --min-MQ 60 -o {2} -f /home/farrell/.config/snipgenie/genome/Sars-Cov-2.fa /home/farrell/gitprojects/snipgenie/notebooks/test_results/mapped/B.bam /home/farrell/gitprojects/snipgenie/notebooks/test_results/mapped/A.bam /home/farrell/gitprojects/snipgenie/notebooks/test_results/mapped/C.bam ::: \"NC_045512.2\":1-2491 \"NC_045512.2\":2492-4983 \"NC_045512.2\":4984-7475 \"NC_045512.2\":7476-9967 \"NC_045512.2\":9968-12459 \"NC_045512.2\":12460-14951 \"NC_045512.2\":14952-17442 \"NC_045512.2\":17443-19934 \"NC_045512.2\":19935-22426 \"NC_045512.2\":22427-24918 \"NC_045512.2\":24919-27410 \"NC_045512.2\":27411-29902 :::+ test_results/tmp/1-2491.bcf test_results/tmp/2492-4983.bcf test_results/tmp/4984-7475.bcf test_results/tmp/7476-9967.bcf test_results/tmp/9968-12459.bcf test_results/tmp/12460-14951.bcf test_results/tmp/14952-17442.bcf test_results/tmp/17443-19934.bcf test_results/tmp/19935-22426.bcf test_results/tmp/22427-24918.bcf test_results/tmp/24919-27410.bcf test_results/tmp/27411-29902.bcf\n" |
2006 |
| - ] |
2007 |
| - }, |
2008 |
| - { |
2009 |
| - "name": "stderr", |
2010 |
| - "output_type": "stream", |
2011 |
| - "text": [ |
2012 |
| - "[mpileup] 3 samples in 3 input files\n", |
2013 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2014 |
| - "[mpileup] 3 samples in 3 input files\n", |
2015 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2016 |
| - "[mpileup] 3 samples in 3 input files\n", |
2017 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2018 |
| - "[mpileup] 3 samples in 3 input files\n", |
2019 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2020 |
| - "[mpileup] 3 samples in 3 input files\n", |
2021 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2022 |
| - "[mpileup] 3 samples in 3 input files\n", |
2023 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2024 |
| - "[mpileup] 3 samples in 3 input files\n", |
2025 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2026 |
| - "[mpileup] 3 samples in 3 input files\n", |
2027 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2028 |
| - "[mpileup] 3 samples in 3 input files\n", |
2029 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2030 |
| - "[mpileup] 3 samples in 3 input files\n", |
2031 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2032 |
| - "[mpileup] 3 samples in 3 input files\n", |
2033 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2034 |
| - "[mpileup] 3 samples in 3 input files\n", |
2035 |
| - "[mpileup] maximum number of reads per input file set to -d 500\n", |
2036 |
| - "Checking the headers and starting positions of 12 files\n", |
2037 |
| - "Concatenating test_results/tmp/1-2491.bcf\t0.009546 seconds\n", |
2038 |
| - "Concatenating test_results/tmp/2492-4983.bcf\t0.004426 seconds\n", |
2039 |
| - "Concatenating test_results/tmp/4984-7475.bcf\t0.004128 seconds\n", |
2040 |
| - "Concatenating test_results/tmp/7476-9967.bcf\t0.003428 seconds\n", |
2041 |
| - "Concatenating test_results/tmp/9968-12459.bcf\t0.003823 seconds\n", |
2042 |
| - "Concatenating test_results/tmp/12460-14951.bcf\t0.003815 seconds\n", |
2043 |
| - "Concatenating test_results/tmp/14952-17442.bcf\t0.004609 seconds\n", |
2044 |
| - "Concatenating test_results/tmp/17443-19934.bcf\t0.004330 seconds\n", |
2045 |
| - "Concatenating test_results/tmp/19935-22426.bcf\t0.003555 seconds\n", |
2046 |
| - "Concatenating test_results/tmp/22427-24918.bcf\t0.003998 seconds\n", |
2047 |
| - "Concatenating test_results/tmp/24919-27410.bcf\t0.004125 seconds\n", |
2048 |
| - "Concatenating test_results/tmp/27411-29902.bcf\t0.003791 seconds\n" |
2049 |
| - ] |
2050 |
| - }, |
2051 |
| - { |
2052 |
| - "name": "stdout", |
2053 |
| - "output_type": "stream", |
2054 |
| - "text": [ |
2055 |
| - "bcftools concat test_results/tmp/1-2491.bcf test_results/tmp/2492-4983.bcf test_results/tmp/4984-7475.bcf test_results/tmp/7476-9967.bcf test_results/tmp/9968-12459.bcf test_results/tmp/12460-14951.bcf test_results/tmp/14952-17442.bcf test_results/tmp/17443-19934.bcf test_results/tmp/19935-22426.bcf test_results/tmp/22427-24918.bcf test_results/tmp/24919-27410.bcf test_results/tmp/27411-29902.bcf -O b -o test_results/raw.bcf\n", |
2056 |
| - "calling variants..\n", |
2057 |
| - "bcftools call --ploidy 1 -m -v -o test_results/calls.vcf test_results/raw.bcf\n", |
2058 |
| - "47 sites called as variants\n", |
2059 |
| - "bcftools reheader --samples test_results/samples.txt -o /tmp/calls.vcf test_results/calls.vcf\n", |
2060 |
| - "bcftools filter -i \"\" -o test_results/filtered.vcf.gz -O z test_results/calls.vcf\n", |
2061 |
| - "splitting snps and indels..\n", |
2062 |
| - "bcftools view -v snps -o test_results/snps.vcf.gz -O z test_results/filtered.vcf.gz\n", |
2063 |
| - "bcftools view -v indels -o test_results/indels.vcf.gz -O z test_results/filtered.vcf.gz\n", |
2064 |
| - "took 1.0 seconds\n", |
2065 |
| - "test_results/snps.vcf.gz\n", |
2066 |
| - "\n", |
2067 |
| - "making SNP matrix\n", |
2068 |
| - "-----------------\n", |
2069 |
| - "found 42 sites for core snps\n", |
2070 |
| - "0 sites with at least one missing sample\n", |
2071 |
| - "5 uninformative sites\n", |
2072 |
| - "\n", |
2073 |
| - "Done. Sample summary:\n", |
2074 |
| - "---------------------\n", |
2075 |
| - "3 samples processed\n", |
2076 |
| - "\n", |
2077 |
| - "\n" |
2078 |
| - ] |
2079 |
| - } |
2080 |
| - ], |
| 1840 | + "outputs": [], |
2081 | 1841 | "source": [
|
2082 | 1842 | "reload(app)\n",
|
2083 | 1843 | "ref = app.sarscov2_genome\n",
|
|
2093 | 1853 | "W.run()"
|
2094 | 1854 | ]
|
2095 | 1855 | },
|
| 1856 | + { |
| 1857 | + "cell_type": "code", |
| 1858 | + "execution_count": null, |
| 1859 | + "metadata": {}, |
| 1860 | + "outputs": [], |
| 1861 | + "source": [ |
| 1862 | + "reload(app)\n", |
| 1863 | + "ref = app.sarscov2_genome\n", |
| 1864 | + "args = {'threads':12, 'outdir': '/home/farrell/espinoza/results', #'labelsep':'.', \n", |
| 1865 | + " 'manifest': '/home/farrell/espinoza/reads_table.DENV2.csv',\n", |
| 1866 | + " 'reference': '/home/farrell/espinoza/DENV2.fa',\n", |
| 1867 | + " 'overwrite':False,\n", |
| 1868 | + " 'filters':'',\n", |
| 1869 | + " 'custom_filters': False}\n", |
| 1870 | + "W = app.WorkFlow(**args)\n", |
| 1871 | + "st = W.setup()\n", |
| 1872 | + "W.run()" |
| 1873 | + ] |
| 1874 | + }, |
2096 | 1875 | {
|
2097 | 1876 | "cell_type": "code",
|
2098 | 1877 | "execution_count": null,
|
|
0 commit comments