Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name: monopogen
channels:
- bioconda
- conda-forge
dependencies:
- python>=3.10
- picard
- beagle=4.1
- samtools
- pysam
- bcftools
- tabix
- vcftools
- pandas
- numpy
- gzip
50 changes: 25 additions & 25 deletions src/Monopogen.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,15 @@
from multiprocessing import Pool


LIB_PATH = os.path.abspath(
os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib"))

if LIB_PATH not in sys.path:
sys.path.insert(0, LIB_PATH)

PIPELINE_BASEDIR = os.path.dirname(os.path.realpath(sys.argv[0]))
CFG_DIR = os.path.join(PIPELINE_BASEDIR, "cfg")

#import pipelines
#from pipelines import get_cluster_cfgfile
#from pipelines import PipelineHandler

# global logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
'[{asctime}] {levelname:8s} {filename} {message}', style='{'))
'[{asctime}] {levelname:8s} {filename} {message}', style='{'))
logger.addHandler(handler)


Expand Down Expand Up @@ -94,13 +84,13 @@ def germline(args):


imputation_vcf = args.imputation_panel + "CCDG_14151_B01_GRM_WGS_2020-08-05_" + record[0] + ".filtered.shapeit2-duohmm-phased.vcf.gz"
cmd1 = samtools + " mpileup -b" + bam_filter + " -f " + args.reference + " -r " + jobid + " -q 20 -Q 20 --incl-flags 0 --excl-flags 0 -t DP -d 10000000 -v "
cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + ' filter -e \'REF !~ "^[ATGC]$"\' | ' + bcftools + " norm -m-both -f " + args.reference
cmd1 = cmd1 + " | grep -v \"<X>\" | " + bgzip + " -c > " + args.out + "/germline/" + jobid + ".gl.vcf.gz"
cmd1 = f"{bcftools} mpileup -b {bam_filter} -f {args.reference} -r {jobid} -q 20 -Q 20 --annotate FORMAT/DP -d 10000000 | {bcftools} call -mv -Oz -o {args.out}/germline/{jobid}.gl.vcf.gz"
#cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + ' filter -e \'REF !~ "^[ATGC]$"\' | ' + bcftools + " norm -m-both -f " + args.reference
#cmd1 = cmd1 + " | grep -v \"<X>\" | " + bgzip + " -c > " + args.out + "/germline/" + jobid + ".gl.vcf.gz"
#cmd2 = bcftools + " view " + out + "/germline/" + jobid + ".gl.vcf.gz" + " -i 'FORMAT/DP>1' | " + bcftools + " call -cv | " + bgzip + " -c > " + args.out + "/SCvarCall/" + jobid + ".gt.vcf.gz"
cmd3 = java + " -Xmx20g -jar " + beagle + " gl=" + out + "/germline/" + jobid + ".gl.vcf.gz" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid + ".gp " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0"
cmd3 = beagle + " gl=" + out + "/germline/" + jobid + ".gl.vcf.gz" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid + ".gp " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0"

cmd5 = java + " -Xmx20g -jar " + beagle + " gt=" + out + "/germline/" + jobid + ".germline.vcf" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid+ ".phased " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0"
cmd5 = beagle + " gt=" + out + "/germline/" + jobid + ".germline.vcf" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid+ ".phased " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0"
cmd5 = cmd5 + "\n" + "rm " + out + "/germline/" + jobid + ".germline.vcf"
f_out = open(out + "/Script/runGermline_" + jobid + ".sh","w")
if args.step == "varScan" or args.step == "all":
Expand All @@ -124,12 +114,18 @@ def germline(args):

if not args.norun == "TRUE":
with Pool(processes=args.nthreads) as pool:
print(joblst)
logger.info("Starting germline jobs...")
result = pool.map(runCMD, joblst)
#error_check(all = region_lst, output = result, step = "germline module")



# Check for errors in the results
failed_jobs = [job for job, status in zip(joblst, result) if status != 0]
if failed_jobs:
logger.error("The following jobs failed:")
for job in failed_jobs:
logger.error(job, status)
logger.error("Pipeline failed due to job errors. See logs for details.")
exit(1)
else:
logger.info("All germline jobs completed successfully.")

def somatic(args):

Expand Down Expand Up @@ -229,6 +225,10 @@ def preProcess(args):
bamlist.close()


def runCMD(cmd):
result = subprocess.run(cmd, shell=True)
return result.returncode

def main():
parser = argparse.ArgumentParser(
description="""Monopogen: SNV calling from single cell sequencing
Expand Down Expand Up @@ -323,11 +323,11 @@ def main():

global out, samtools, bcftools, bgzip, java, beagle
out = os.path.abspath(args.out)
samtools = os.path.abspath(args.app_path) + "/samtools"
bcftools = os.path.abspath(args.app_path) + "/bcftools"
bgzip = os.path.abspath(args.app_path) + "/bgzip"
samtools = "samtools"
bcftools = "bcftools"
bgzip = "bgzip"
java = "java"
beagle = os.path.abspath(args.app_path) + "/beagle.27Jul16.86a.jar"
beagle = "beagle"


args.func(args)
Expand Down
8 changes: 4 additions & 4 deletions src/bamProcess.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
import pandas as pd
from pysam import VariantFile

LIB_PATH = os.path.abspath(
os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib"))
# LIB_PATH = os.path.abspath(
# os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib"))

if LIB_PATH not in sys.path:
sys.path.insert(0, LIB_PATH)
# if LIB_PATH not in sys.path:
# sys.path.insert(0, LIB_PATH)

PIPELINE_BASEDIR = os.path.dirname(os.path.realpath(sys.argv[0]))
CFG_DIR = os.path.join(PIPELINE_BASEDIR, "cfg")
Expand Down
8 changes: 4 additions & 4 deletions src/germline.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,11 @@ def validate_user_setting_germline(args):


def check_dependencies(args):
programs_to_check = ("vcftools", "bgzip", "bcftools", "beagle.27Jul16.86a.jar","samtools","picard.jar", "java")
programs_to_check = ("vcftools", "bgzip", "bcftools", "samtools", "java", "beagle", "picard")

for prog in programs_to_check:
out = os.popen("command -v {}".format(args.app_path + "/" + prog)).read()
assert out != "", "Program {} cannot be found!".format(prog)
for prog in programs_to_check:
out = os.popen(f"command -v {prog}").read().strip()
assert out != "", f"Program {prog} cannot be found! Ensure it is installed and available in the PATH."

# python_pkgs_to_check = ("drmaa",)

Expand Down
4 changes: 2 additions & 2 deletions test/bam.lst
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ID1,/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/example/A.bam
ID2,/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/example/B.bam
ID1,example/A.bam
ID2,example/B.bam
10 changes: 5 additions & 5 deletions test/runGermline.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
path="/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen"
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps
path="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps


python ${path}/src/Monopogen.py preProcess -b bam.lst -o out -a ${path}/apps -t 8


python ${path}/src/Monopogen.py germline \
-a ${path}/apps -t 8 -r region.lst \
-p ../example/ \
-g ../example/chr20_2Mb.hg38.fa -m 3 -s all -o out
-a ${path}/apps -t 8 -r ${path}/test/region.lst \
-p ${path}/example/ \
-g ${path}/example/chr20_2Mb.hg38.fa -m 3 -s all -o out



11 changes: 3 additions & 8 deletions test/runPreprocess.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
path="/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen"
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps
path="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"
# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps


python ${path}/src/Monopogen.py preProcess -b bam.lst -o out -a ${path}/apps -t 8




python ${path}/src/Monopogen.py preProcess -b ${path}/test/bam.lst -o out -a ${path}/apps -t 8
35 changes: 15 additions & 20 deletions test/runSomatic.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,37 +12,32 @@



path="/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen_v1.5"
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps
path="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)"

python ${path}/src/Monopogen.py preProcess -b bam.somatic.lst -o somatic -a ${path}/apps -t 8

python ${path}/src/Monopogen.py germline \
-a ${path}/apps -t 8 -r region.lst \
-p ../example/ \
-g ../example/chr20_2Mb.hg38.fa -m 3 -s all -o somatic
# python ${path}/src/Monopogen.py preProcess -b bam.somatic.lst -o somatic -a ${path}/apps -t 8

# python ${path}/src/Monopogen.py germline \
# -a ${path}/apps -t 8 -r ${path}/test/region.lst \
# -p ${path}/example/ \
# -g ${path}/example/chr20_2Mb.hg38.fa -m 3 -s all -o out


python ${path}/src/Monopogen.py somatic \
-a ${path}/apps -r region.lst -t 22 \
-i somatic -l test.csv -s featureInfo \
-g ../example/chr20_2Mb.hg38.fa


-a ${path}/apps -r ${path}/test/region.lst -t 22 \
-i out -l ${path}/test.csv -s featureInfo \
-g ${path}/example/chr20_2Mb.hg38.fa


python ${path}/src/Monopogen.py somatic \
-a ${path}/apps -r region.lst -t 22 \
-i somatic -l test.csv -s cellScan \
-g ../example/chr20_2Mb.hg38.fa
-a ${path}/apps -r ${path}/test/region.lst -t 22 \
-i out -l ${path}/test.csv -s cellScan \
-g ${path}/example/chr20_2Mb.hg38.fa


####### Not work due to fewer marker size

module load R
python ${path}/src/Monopogen.py somatic \
-a ${path}/apps -r region.lst -t 22 \
-i somatic -l test.csv -s LDrefinement \
-g ../example/chr20_2Mb.hg38.fa

-a ${path}/apps -r ${path}/test/region.lst -t 22 \
-i out -l ${path}/test.csv -s LDrefinement \
-g ${path}/example/chr20_2Mb.hg38.fa