Skip to content

Commit a6af11e

Browse files
committed
separate filtering functionality into a separate mod. Simplify part of the covcalc
1 parent aba9bbe commit a6af11e

File tree

8 files changed

+278
-186
lines changed

8 files changed

+278
-186
lines changed

docs/content/changelog.rst

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ Changes in deepTools4.0
33

44
# Changes
55

6+
## bamCoverage
7+
8+
- --no_collapse flag to not merge bins with equal coverage values together.
9+
610
## computeMatrix
711

812
- --sortRegions 'no' option no longer exists
@@ -20,6 +24,34 @@ Changes in deepTools4.0
2024
- options label, smartLabels, genomeChunkLength are removed.
2125
- ignoreDuplicates is removed, and (if wanted) should be set by the SamFlagExclude setting.
2226

27+
28+
29+
30+
31+
32+
33+
34+
35+
36+
37+
38+
# Good to know
39+
40+
41+
42+
43+
44+
45+
46+
47+
48+
49+
50+
51+
52+
53+
54+
2355
# Testing
2456

2557
## computeMatrix

src/alignmentsieve.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ use rust_htslib::bam::{self, Header, IndexedReader, Read, Reader, Writer};
66
use tempfile::{Builder, TempPath};
77
use std::fs::File;
88
use std::io::Write;
9-
use crate::covcalc::{parse_regions, Alignmentfilters, Region};
9+
use crate::covcalc::{parse_regions, Region};
10+
use crate::filtering::Alignmentfilters;
1011
use crate::filehandler::{is_bed_or_gtf, read_bedfile};
1112

1213
#[pyfunction]

src/bamcompare.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ use std::fs::File;
88
use itertools::Itertools;
99
use bigtools::{Value};
1010
use crate::filehandler::{bam_ispaired, write_covfile, is_bed_or_gtf, read_bedfile};
11-
use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, TempZip, region_divider, Region};
11+
use crate::covcalc::{bam_pileup, parse_regions, TempZip, region_divider, Region};
12+
use crate::filtering::Alignmentfilters;
1213
use crate::normalization::scale_factor_bamcompare;
1314
use crate::calc::{median, calc_ratio};
1415
use tempfile::{TempPath};

src/bamcoverage.rs

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ use std::io::prelude::*;
66
use std::io::BufReader;
77
use std::fs::File;
88
use bigtools::Value;
9-
use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters, region_divider, Region};
9+
use crate::covcalc::{bam_pileup, parse_regions, region_divider, Region};
10+
use crate::filtering::Alignmentfilters;
1011
use crate::filehandler::{bam_ispaired, write_covfile, is_bed_or_gtf, read_bedfile};
1112
use crate::normalization::scale_factor;
1213
use crate::calc::median;
@@ -67,20 +68,32 @@ pub fn r_bamcoverage(
6768
if norm != "None" && scalefactor != 1.0 {
6869
println!("Warning: You have set a normalization option ({}), but also a scale factor. Only the scale factor will be used", norm);
6970
}
71+
if mnase && offset != (1, -1) {
72+
println!("Warning: Both MNase and offset are set. The offset will be ignored !");
73+
}
74+
if offset != (1, -1) {
75+
if offset.0 == 0 {
76+
panic!("Offsets cannot be 0 as they are 1-based positions inside each alignment.");
77+
}
78+
if offset.1 > 0 && offset.1 < offset.0 {
79+
panic!("Right side offset cannot be smaller than the left side offset.");
80+
}
81+
}
7082
if verbose {
7183
println!("Chromosomes to ignore for normalization: {:?}", ignorechr);
7284
}
7385

86+
7487
// if mnase, set the min / max fragment lengths if these are not set.
7588
if mnase {
7689
if minfraglen == 0 {
7790
minfraglen = 130;
78-
} else {
91+
} else if minfraglen != 130{
7992
println!("Note that MNase mode is set, but minfraglen is set at {}. Recommended is 130.", minfraglen);
8093
}
8194
if maxfraglen == 0 {
8295
maxfraglen = 200;
83-
} else {
96+
} else if maxfraglen != 200 {
8497
println!("Note that MNase mode is set, but maxfraglen is set at {}. Recommended is 200.", maxfraglen);
8598
}
8699
if binsize != 1 {

src/covcalc.rs

Lines changed: 40 additions & 180 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use rust_htslib::bam::{Read, IndexedReader, Record};
1+
use rust_htslib::bam::{Read, IndexedReader};
22
use rust_htslib::bam::ext::BamRecordExtensions;
33
use core::panic;
44
use std::collections::HashMap;
@@ -9,6 +9,7 @@ use std::fmt;
99
use ndarray::Array1;
1010
use std::collections::HashSet;
1111
use std::fs::OpenOptions;
12+
use crate::filtering::Alignmentfilters;
1213

1314
pub fn parse_regions(region: &str, bam_ifile: Vec<&str>) -> (Vec<Region>, HashMap<String, u32>) {
1415
// Takes a vector of regions, and a bam reference
@@ -199,15 +200,11 @@ pub fn bam_pileup<'a>(
199200
counts = vec![0.0; 1];
200201
for record in bam.records() {
201202
let mut record = record.expect("Error parsing record.");
202-
if filters.manipulate {
203-
filters.manipulate_record(&mut record);
204-
}
205203
if filters.filter {
206204
if filters.filter_record(&record) {
207205
continue;
208206
}
209207
}
210-
211208
if !ignorechr.contains(&region.0) {
212209
if record.is_unmapped() {
213210
unmapped_reads += 1;
@@ -242,9 +239,6 @@ pub fn bam_pileup<'a>(
242239
.expect(&format!("Error fetching region: {}:{},{}", regstruct.chrom, exon.0, exon.1));
243240
for record in bam.records() {
244241
let mut record = record.expect("Error parsing record.");
245-
if filters.manipulate {
246-
filters.manipulate_record(&mut record);
247-
}
248242
if filters.filter {
249243
if filters.filter_record(&record) {
250244
continue;
@@ -280,44 +274,58 @@ pub fn bam_pileup<'a>(
280274
let mut binix: u32 = 0;
281275
for record in bam.records() {
282276
let mut record = record.expect("Error parsing record.");
283-
if filters.manipulate {
284-
filters.manipulate_record(&mut record);
285-
}
277+
286278
if filters.filter {
287279
if filters.filter_record(&record) {
288280
continue;
289281
}
290282
}
291-
if !ignorechr.contains(&region.0) {
283+
if filters.manipulate {
284+
let manipulated_blocks = filters.manipulate_record(&mut record);
285+
if manipulated_blocks.is_none() {
286+
continue;
287+
}
288+
let block = manipulated_blocks.unwrap();
289+
let ixstart = ((block[0] - region.1) / binsize) as usize;
290+
let ixend = ((block[1] - region.1) / binsize) as usize;
291+
292+
let indices: HashSet<usize> = (ixstart..ixend).collect();
293+
indices.into_iter()
294+
.for_each(|ix| {
295+
if ix < counts.len() {
296+
counts[ix] += 1.0;
297+
}
298+
});
299+
} else {
300+
let indices: HashSet<usize> = record
301+
.aligned_blocks()
302+
.filter(|x| (x[1] as u32) >= region.1 && (x[1] as u32) <= region.2)
303+
.filter(|x| (x[0] as u32) >= region.1 && (x[0] as u32) <= region.2 )
304+
.flat_map(|x| x[0] as u32..x[1] as u32)
305+
.map(|x| ((x - region.1) / binsize) as usize)
306+
.collect();
307+
indices.into_iter()
308+
.for_each(|ix| {
309+
if ix < counts.len() {
310+
counts[ix] += 1.0;
311+
}
312+
});
313+
}
314+
315+
if !ignorechr.contains(&region.0) && gather_lengths {
292316
if record.is_unmapped() {
293317
unmapped_reads += 1;
294318
} else {
295319
mapped_reads += 1;
296320
if *ispe {
297321
if record.is_paired() && record.is_proper_pair() && (record.flags() & FREAD != 0) {
298-
if gather_lengths {
299-
fraglens.push(record.insert_size().abs() as u32);
300-
}
322+
fraglens.push(record.insert_size().abs() as u32);
301323
}
302324
}
303-
if gather_lengths {
304-
readlens.push(record.seq_len() as u32);
305-
}
325+
readlens.push(record.seq_len() as u32);
306326
}
307327
}
308-
let indices: HashSet<usize> = record
309-
.aligned_blocks()
310-
.filter(|x| (x[1] as u32) >= region.1 && (x[1] as u32) <= region.2)
311-
.filter(|x| (x[0] as u32) >= region.1 && (x[0] as u32) <= region.2 )
312-
.flat_map(|x| x[0] as u32..x[1] as u32)
313-
.map(|x| ((x - region.1) / binsize) as usize)
314-
.collect();
315-
indices.into_iter()
316-
.for_each(|ix| {
317-
if ix < counts.len() {
318-
counts[ix] += 1.0;
319-
}
320-
});
328+
321329
}
322330
}
323331
let file = OpenOptions::new()
@@ -330,6 +338,7 @@ pub fn bam_pileup<'a>(
330338
// There are two scenarios:
331339
// bamCoverage mode -> we can collapse bins with same coverage (collapse = true)
332340
// bamCompare & others -> We cannot collapse the bins, yet. (collapse = false)
341+
// Note that collapse can also be passed as a CLI, for those that want that.
333342
if counts.len() == 1 {
334343
writeln!(writer, "{}\t{}\t{}\t{}", region.0, startstr, endstr, counts[0]).unwrap();
335344
} else {
@@ -383,155 +392,6 @@ fn pos_in_blacklist(pos: i64, chrom: &str, blacklist: &Vec<Region>) -> bool {
383392
return false;
384393
}
385394

386-
pub struct Alignmentfilters {
387-
pub blacklist: Option<Vec<Region>>,
388-
pub minmappingquality: u8,
389-
pub samflaginclude: u16,
390-
pub samflagexclude: u16,
391-
pub minfraglen: u32,
392-
pub maxfraglen: u32,
393-
pub mnase: bool,
394-
pub offset: (i32, i32),
395-
pub filterrnastrand: String,
396-
pub extendreads: u32,
397-
pub centerreads: bool,
398-
pub filter: bool,
399-
pub manipulate: bool,
400-
}
401-
impl Alignmentfilters {
402-
pub fn new(
403-
blacklist: Option<Vec<Region>>,
404-
minmappingquality: Option<u8>,
405-
samflaginclude: Option<u16>,
406-
samflagexclude: Option<u16>,
407-
minfraglen: Option<u32>,
408-
maxfraglen: Option<u32>,
409-
mnase: Option<bool>,
410-
offset: Option<(i32, i32)>,
411-
filterrnastrand: Option<String>,
412-
extendreads: Option<u32>,
413-
centerreads: Option<bool>
414-
) -> Self {
415-
// Go through the arguments, and if they are not set or have default values, we set a filter boolean to false.
416-
// Only when filtering needs to happen the filterrecord will be invoked, for performance.
417-
let mut filter: bool = false;
418-
let mut manipulate: bool = false;
419-
let _mmq = minmappingquality.unwrap_or(0);
420-
let _sfi = samflaginclude.unwrap_or(0);
421-
let _sfe = samflagexclude.unwrap_or(0);
422-
let _mifl = minfraglen.unwrap_or(0);
423-
let _mafl = maxfraglen.unwrap_or(0);
424-
let _mnase = mnase.unwrap_or(false);
425-
let _offset = offset.unwrap_or((1, -1));
426-
let _frs = filterrnastrand.unwrap_or(String::from("None"));
427-
let _extend = extendreads.unwrap_or(0);
428-
let _center = centerreads.unwrap_or(false);
429-
430-
// Set the manipulate bool for a quick escape in case manipulation is not needed.
431-
if _offset != (1, -1) || _mnase || _extend > 0 || _center {
432-
manipulate = true;
433-
}
434-
435-
// Set the filter bool for a quick escape in case filtering is not needed.
436-
if _mmq > 0 || _sfi > 0 || _sfe > 0 || _mifl > 0 || _mafl > 0 || _frs != "None" || blacklist.is_some() {
437-
filter = true;
438-
}
439-
440-
Self {
441-
blacklist: blacklist,
442-
minmappingquality: _mmq,
443-
samflaginclude: _sfi,
444-
samflagexclude: _sfe,
445-
minfraglen: _mifl,
446-
maxfraglen: _mafl,
447-
mnase: _mnase,
448-
offset: _offset,
449-
filterrnastrand: _frs,
450-
extendreads: _extend,
451-
centerreads: _center,
452-
filter: filter,
453-
manipulate: manipulate,
454-
}
455-
}
456-
pub fn filter_record(&self, rec: &Record) -> bool {
457-
// Decides filtering of a record. The bool return is used to 'continue', i.e. skip the record.
458-
if rec.is_unmapped() {
459-
return true;
460-
} else if !self.filter {
461-
return false;
462-
} else {
463-
// True filtering.
464-
// quality > samflags > min/max fraglen
465-
// quality
466-
let mut skip: bool = false;
467-
if rec.mapq() < self.minmappingquality {
468-
skip = true;
469-
}
470-
// samflags
471-
if self.samflaginclude > 0 {
472-
if (rec.flags() & self.samflaginclude) == 0 {
473-
skip = true;
474-
}
475-
}
476-
if self.samflagexclude > 0 {
477-
if (rec.flags() & self.samflagexclude) != 0 {
478-
skip = true;
479-
}
480-
}
481-
// min/max fraglen
482-
if self.minfraglen != 0 || self.maxfraglen != 0 {
483-
if rec.is_paired() {
484-
if rec.insert_size().abs() < self.minfraglen as i64 || rec.insert_size().abs() > self.maxfraglen as i64 {
485-
skip = true;
486-
}
487-
} else {
488-
let fragsize: u32 = rec
489-
.aligned_blocks()
490-
.map(|x| x[1] as u32 - x[0] as u32)
491-
.sum();
492-
if fragsize < self.minfraglen || fragsize > self.maxfraglen {
493-
skip = true;
494-
}
495-
}
496-
}
497-
// filterrnastrand
498-
if self.filterrnastrand.as_str() != "None" {
499-
match (self.filterrnastrand.as_str(), rec.is_paired()) {
500-
("forward", true) => {
501-
if !((rec.flags() & 144 == 128) || (rec.flags() & 96 == 64)) {
502-
skip = true;
503-
}
504-
},
505-
("forward", false) => {
506-
if !(rec.flags() & 16 == 16) {
507-
skip = true;
508-
}
509-
},
510-
("reverse", true) => {
511-
if !((rec.flags() & 144 == 144) || (rec.flags() & 96 == 96)) {
512-
skip = true;
513-
}
514-
},
515-
("reverse", false) => {
516-
if !(rec.flags() & 16 == 0) {
517-
skip = true;
518-
}
519-
},
520-
_ => {
521-
panic!("filterrnastrand should be either forward or reverse. {:?} is not supported.", self.filterrnastrand)
522-
},
523-
}
524-
}
525-
return skip;
526-
}
527-
}
528-
529-
pub fn manipulate_record(&self, rec: &mut Record) {
530-
println!("Will manipulate the record, but not now lol");
531-
}
532-
}
533-
534-
535395
#[derive(Clone, Debug)]
536396
pub struct Region {
537397
pub chrom: String,

0 commit comments

Comments
 (0)