-
Notifications
You must be signed in to change notification settings - Fork 5
Take in multiple inputs with different assemblies; pull chr info from bam instead of fasta #44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
ad043e3
30018c0
34dfa1e
7e5b6bf
7582e64
d881fcb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -65,5 +65,6 @@ snakefmt = "*" | |
| ruff = "*" | ||
| awscli = "2.22.*" | ||
| taplo = "*" | ||
| pysam = "*" | ||
|
|
||
| [pypi-dependencies] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,3 +14,4 @@ dependencies: | |
| - csvtk | ||
| - mosdepth==0.3.7 | ||
| - bioconda::bigtools==0.5.5 | ||
| #- pysam==0.22 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,4 +8,4 @@ dependencies: | |
| #- tqdm | ||
| #- pip | ||
| #- pip: | ||
| #- pysam==0.22.1 | ||
| - pysam==0.22.1 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,29 +1,73 @@ | ||
| import re | ||
| import logging | ||
| import sys | ||
| import pysam | ||
|
|
||
| FIRST_REPORT = True | ||
|
|
||
|
|
||
| def get_ref(): | ||
| def get_ref_orig(): | ||
| if "ref" not in config: | ||
| raise ValueError("FIRE: ref parameter is missing in config.yaml") | ||
| ref = config["ref"] | ||
| if not os.path.exists(ref): | ||
| raise ValueError(f"FIRE: reference file {ref} does not exist") | ||
| return os.path.abspath(ref) | ||
|
|
||
| def get_ref_old(wc): | ||
|
||
| if "ref" in config: | ||
| ref = config["ref"] | ||
| if "ref" not in config: | ||
| if "ref" in MANIFEST.columns: | ||
| ref = get_input_ref() | ||
| else: | ||
| raise ValueError("FIRE: ref parameter is missing in config.yaml and no ref column in manifest") | ||
| #ref = config["ref"] | ||
| if not os.path.exists(ref): | ||
| raise ValueError(f"FIRE: reference file {ref} does not exist") | ||
| return os.path.abspath(ref) | ||
|
|
||
| def get_fai(): | ||
| fai = f"{get_ref()}.fai" | ||
| def get_ref(wc): | ||
| ref = MANIFEST.loc[wc.sm, "ref"] | ||
| if not os.path.exists(ref): | ||
| raise ValueError(f"FIRE: reference file {ref} does not exist") | ||
| return os.path.abspath(ref) | ||
|
|
||
| def get_ref_name_old(wc): | ||
| if "ref_name" in config: | ||
| ref_name=config["ref_name"] | ||
| else: | ||
| if "ref_name" in MANIFEST.columns: | ||
| ref_name= get_input_ref_name() | ||
| else: | ||
| raise ValueError("FIRE: ref_name parameter is missing in config.yaml and no ref_name column in manifest") | ||
| #ref=get_ref() | ||
| #temp_split=ref.strip().split('/')[-1] | ||
| #fa_split=temp_split.split('.fa') | ||
| #ref_name = '.fa'.join(fa_split[:-1]) | ||
| return(ref_name) | ||
|
|
||
| def get_ref_name(wc): | ||
| return MANIFEST.loc[wc.sm, "ref_name"] | ||
|
|
||
| def get_fai_orig(wc): | ||
| fai = f"{get_ref(wc)}.fai" | ||
| if not os.path.exists(fai): | ||
| raise ValueError(f"FIRE: reference index file {fai} does not exist") | ||
| return fai | ||
|
|
||
| def get_fai(wc): | ||
| ref = MANIFEST.loc[wc.sm, "ref"] | ||
|
||
| #fai= ref + ".fai" | ||
| fai = str(ref) + ".fai" | ||
| if not os.path.exists(fai): | ||
| raise ValueError(f"FIRE: reference index file {fai} does not exist") | ||
| return fai | ||
|
|
||
| def get_excludes(): | ||
| def get_excludes(wc): | ||
| excludes = config.get("excludes", []) | ||
| if REF_NAME == "hg38" or REF_NAME == "GRCh38": | ||
| ref_name = get_ref_name(wc) | ||
| if ref_name == "hg38" or ref_name == "GRCh38": | ||
| files = [ | ||
| "../annotations/hg38.gap.bed.gz", | ||
| "../annotations/hg38.blacklist.ENCFF356LFX.bed.gz", | ||
|
|
@@ -33,22 +77,23 @@ def get_excludes(): | |
| return excludes | ||
|
|
||
|
|
||
| def get_fai_df(): | ||
| fai = get_fai() | ||
| def get_fai_df(wc): | ||
| fai = get_fai(wc) | ||
| return pd.read_csv(fai, sep="\t", names=["chr", "length", "x", "y", "z"]) | ||
|
|
||
|
|
||
| def get_chroms(): | ||
| def get_chroms_orig(wc): | ||
| global FIRST_REPORT | ||
| min_contig_length = config.get("min_contig_length", 0) | ||
| skipped_contigs = FAI_DF["chr"][FAI_DF["length"] < min_contig_length] | ||
| fai_df=get_fai_df(wc) | ||
| skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length] | ||
| if len(skipped_contigs) > 0 and FIRST_REPORT: | ||
| print( | ||
| f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}", | ||
| file=sys.stderr, | ||
| ) | ||
|
|
||
| chroms = FAI_DF["chr"][FAI_DF["length"] >= min_contig_length] | ||
| chroms = fai_df["chr"][fai_df["length"] >= min_contig_length] | ||
| chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom]) | ||
| chroms = [chrom for chrom in chroms if "_random" not in chrom] | ||
| chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)] | ||
|
|
@@ -60,10 +105,115 @@ def get_chroms(): | |
| if len(chroms) == 0: | ||
| raise ValueError( | ||
| f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. " | ||
| f"Your fai file contains the following chromosomes: {FAI_DF['chr']}" | ||
| f"Your fai file contains the following chromosomes: {fai_df['chr']}" | ||
| ) | ||
| return chroms | ||
|
|
||
| def get_chroms(wc): | ||
| global FIRST_REPORT | ||
| min_contig_length = config.get("min_contig_length", 0) | ||
| fai_df=get_fai_df(wc) | ||
| skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length] | ||
| if len(skipped_contigs) > 0 and FIRST_REPORT: | ||
| print( | ||
| f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}", | ||
| file=sys.stderr, | ||
| ) | ||
|
|
||
| bam_chr_list=[] | ||
| input_bam_path=get_input_bam(wc) | ||
| input_bam = pysam.AlignmentFile(input_bam_path, "rc", threads=MAX_THREADS) | ||
|
||
| bam_header_dict = input_bam.header.to_dict() | ||
|
|
||
| for line in bam_header_dict['SQ']: | ||
|
||
| chr_name=line['SN'] | ||
| bam_chr_list.append(chr_name) | ||
|
|
||
| input_bam.close() | ||
|
|
||
| chroms = fai_df["chr"][fai_df["length"] >= min_contig_length] | ||
| chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom]) | ||
| chroms = [chrom for chrom in chroms if "_random" not in chrom] | ||
| chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)] | ||
| chroms = [chrom for chrom in chroms if chrom in bam_chr_list] | ||
|
|
||
| if FIRST_REPORT: | ||
| FIRST_REPORT = False | ||
| print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr) | ||
|
|
||
| if len(chroms) == 0: | ||
| raise ValueError( | ||
| f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. " | ||
| f"Your fai file contains the following chromosomes: {fai_df['chr']}" | ||
| ) | ||
| return chroms | ||
|
|
||
| def get_chroms_first_element(wc): | ||
| global FIRST_REPORT | ||
| min_contig_length = config.get("min_contig_length", 0) | ||
| fai_df=get_fai_df(wc) | ||
| skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length] | ||
| if len(skipped_contigs) > 0 and FIRST_REPORT: | ||
| print( | ||
| f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}", | ||
| file=sys.stderr, | ||
| ) | ||
|
|
||
| bam_chr_list=[] | ||
| input_bam_path=get_input_bam(wc) | ||
| input_bam = pysam.AlignmentFile(input_bam_path, "rc", threads=MAX_THREADS) | ||
| bam_header_dict = input_bam.header.to_dict() | ||
|
|
||
| for line in bam_header_dict['SQ']: | ||
| chr_name=line['SN'] | ||
| bam_chr_list.append(chr_name) | ||
|
|
||
| input_bam.close() | ||
|
|
||
| chroms = fai_df["chr"][fai_df["length"] >= min_contig_length] | ||
| chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom]) | ||
| chroms = [chrom for chrom in chroms if "_random" not in chrom] | ||
| chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)] | ||
| chroms = [chrom for chrom in chroms if chrom in bam_chr_list] | ||
|
|
||
| if FIRST_REPORT: | ||
| FIRST_REPORT = False | ||
| print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr) | ||
|
|
||
| if len(chroms) == 0: | ||
| raise ValueError( | ||
| f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. " | ||
| f"Your fai file contains the following chromosomes: {fai_df['chr']}" | ||
| ) | ||
| return chroms[0] | ||
|
|
||
| def get_chroms_first_element_orig(wc): | ||
| global FIRST_REPORT | ||
| min_contig_length = config.get("min_contig_length", 0) | ||
| fai_df=get_fai_df(wc) | ||
| skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length] | ||
| if len(skipped_contigs) > 0 and FIRST_REPORT: | ||
| print( | ||
| f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}", | ||
| file=sys.stderr, | ||
| ) | ||
|
|
||
| chroms = fai_df["chr"][fai_df["length"] >= min_contig_length] | ||
| chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom]) | ||
| chroms = [chrom for chrom in chroms if "_random" not in chrom] | ||
| chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)] | ||
|
|
||
| if FIRST_REPORT: | ||
| FIRST_REPORT = False | ||
| print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr) | ||
|
|
||
| if len(chroms) == 0: | ||
| raise ValueError( | ||
| f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. " | ||
| f"Your fai file contains the following chromosomes: {fai_df['chr']}" | ||
| ) | ||
| return chroms[0] | ||
|
|
||
|
|
||
| def get_manifest(): | ||
| manifest = config.get("manifest") | ||
|
|
@@ -80,6 +230,11 @@ def get_manifest(): | |
| def get_input_bam(wc): | ||
| return MANIFEST.loc[wc.sm, "bam"] | ||
|
|
||
| def get_input_ref(wc): | ||
| return MANIFEST.loc[wc.sm, "ref"] | ||
|
|
||
| def get_input_ref_name(wc): | ||
| return MANIFEST.loc[wc.sm, "ref_name"] | ||
|
|
||
| def get_mem_mb(wildcards, attempt): | ||
| if attempt < 3: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
delete completely instead of commenting