Skip to content

Commit 9a0b290

Browse files
committed
Drop poly-g trimming here
OK for our data so far, and I submitted a PR to fastp: OpenGene/fastp#508
1 parent ee02103 commit 9a0b290

File tree

1 file changed

+2
-25
lines changed

1 file changed

+2
-25
lines changed

workflow/Snakefile

+2-25
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ DATA_PATH = os.path.join(os.path.dirname(workflow.snakefile), "../data")
3030
data_config_files = glob.glob(f"{DATA_PATH}/*.csv")
3131

3232
# Uncomment for test runs
33-
# data_config_files = glob.glob(f"{DATA_PATH}/test.csv")
33+
data_config_files = glob.glob(f"{DATA_PATH}/test.csv")
3434

3535
# Only operate on EM-seq files (ignore BS-seq)
3636
EMSEQ_ONLY = True
@@ -366,25 +366,6 @@ rule seqtk_subsample:
366366
"""
367367

368368

369-
# Trim poly-g if this is a two-color system
370-
def trim_poly_g(input):
371-
with gzip.open(input.r1_file, "rt") as r1_in, gzip.open(
372-
input.r2_file, "rt"
373-
) as r2_in:
374-
for line_r1, line_r2 in zip(r1_in, r2_in):
375-
line_r1_instrument = line_r1.split(":")[0]
376-
line_r2_instrument = line_r2.split(":")[0]
377-
assert (
378-
line_r1_instrument == line_r2_instrument
379-
), "R1 and R2 come from different instruments! Not possible."
380-
break
381-
return (
382-
"--trim_poly_g"
383-
if line_r1_instrument.startswith(tuple(["@A0", "@NS", "@NB", "@VH"]))
384-
else ""
385-
)
386-
387-
388369
### fastp
389370
# We adapter trimming, poly-g filtering, and initial quality filtering
390371
# NOTE: We also do read trimming here, 10-15 BP depending on EM- vs BS-seq
@@ -416,10 +397,6 @@ rule fastp:
416397
resources:
417398
mem_mb = 6000
418399
params:
419-
# Inspired to do poly-g trimming by the NEB EM-seq pipeline:
420-
# https://github.com/nebiolabs/EM-seq/blob/master/em-seq.nf
421-
# TODO: make this selective for NextSeq/NovaSeq-only runs?
422-
trim_poly_g_flag=lambda wildcards, input: trim_poly_g(input),
423400
# Note: --overrepresentation_analysis was overwhelming with our huge sequencing data, disabled for now.
424401
## Trim specific to BS vs EM-seq input type
425402
# Note that we could probably be less conservative, and trim less from EMSeq
@@ -439,7 +416,7 @@ rule fastp:
439416
fastp --in1 {input.r1_file} --in2 {input.r2_file} \
440417
--trim_front1 {params.trim_r1_5prime} --trim_tail1 {params.trim_r1_3prime} \
441418
--trim_front2 {params.trim_r2_5prime} --trim_tail2 {params.trim_r2_3prime} \
442-
--length_required {params.minimum_length} {params.trim_poly_g_flag} \
419+
--length_required {params.minimum_length} \
443420
--adapter_sequence "{params.adapter_r1}" --adapter_sequence_r2 "{params.adapter_r2}" \
444421
--json "{output.fastp_json}" --html "{output.fastp_html}" \
445422
--thread {threads} --verbose --failed_out {output.fastp_failed} --stdout \

0 commit comments

Comments
 (0)