diff --git a/flatgfa/src/cli/cmds.rs b/flatgfa/src/cli/cmds.rs index 3bf22cd4..9b55fd9c 100644 --- a/flatgfa/src/cli/cmds.rs +++ b/flatgfa/src/cli/cmds.rs @@ -196,10 +196,14 @@ pub fn extract( /// compute node depth, the number of times paths cross a node #[derive(FromArgs, PartialEq, Debug)] #[argh(subcommand, name = "depth")] -pub struct Depth {} +pub struct Depth { + /// enable usage of StepsBySegIndex + #[argh(switch, short = 'n')] + index: bool, +} -pub fn depth(gfa: &flatgfa::FlatGFA) { - let (depths, uniq_paths) = ops::depth::depth(gfa); +pub fn depth(gfa: &flatgfa::FlatGFA, args: Depth) { + let (depths, uniq_paths) = ops::depth::depth(gfa, args.index); println!("#node.id\tdepth\tdepth.uniq"); for (id, seg) in gfa.segs.items() { diff --git a/flatgfa/src/cli/main.rs b/flatgfa/src/cli/main.rs index 03ee8357..f24226ba 100644 --- a/flatgfa/src/cli/main.rs +++ b/flatgfa/src/cli/main.rs @@ -127,8 +127,8 @@ fn main() -> Result<(), &'static str> { let store = cmds::extract(&gfa, sub_args)?; dump(&store.as_ref(), &args.output); } - Some(Command::Depth(_)) => { - cmds::depth(&gfa); + Some(Command::Depth(sub_args)) => { + cmds::depth(&gfa, sub_args); } Some(Command::Chop(sub_args)) => { let store = cmds::chop(&gfa, sub_args)?; diff --git a/flatgfa/src/index.rs b/flatgfa/src/index.rs new file mode 100644 index 00000000..90c42591 --- /dev/null +++ b/flatgfa/src/index.rs @@ -0,0 +1,124 @@ +use std::ops::Range; + +use crate::flatgfa::{FlatGFA, Path, Segment}; +use crate::pool::{Id, Span}; + +/// A reference to a specific Step within a Path +pub struct StepRef { + /// The ID of the Path + pub path: Id, + + /// The offset of the Step within that Path's Span of Steps + pub offset: u32, +} + +/// An in memory index that provides all Steps over a given Segment +pub struct StepsBySegIndex { + /// All Steps that are accessible by the index. + /// StepRefs that are cross over the same segment are contiguous + pub steps: Vec, + + /// Spans into the steps vector. All Steps in the Span cross over one Segment. + /// A Span's index maps to a Segment's index in a corresponding FlatGFA + pub segment_steps: Vec>, +} + +// TODO: make this return an Id instead of a usize +/// Extract the segment index from a StepRef in a given FlatGFA +fn segment_of_step(fgfa: &FlatGFA, step: &StepRef) -> Id { + let path = &fgfa.paths[step.path]; + let step_span = path.steps; + let step_slice = &fgfa.steps[step_span]; + step_slice[step.offset as usize].segment() +} + +impl StepsBySegIndex { + pub fn new(fgfa: &FlatGFA) -> Self { + // will be our `steps` vector that contains all steprefs + let mut all_steps = Vec::new(); + + // build the vector of all the steprefs, by iterating over each path, so we can construct using a path id and offset + // package the Id with the StepRef's into tuples so sorting by it is easier + for (path_id, path) in fgfa.paths.items() { + for (offset, _) in fgfa.get_path_steps(path).enumerate() { + // the constructed StepRef based on the path and step offset + let step = StepRef { + path: path_id, + offset: offset as u32, + }; + + // the segment that this step passes over in the form of a Id + let seg = segment_of_step(fgfa, &step); + + // push the tuple of the Id and StepRef into the steps vector + all_steps.push((seg, step)); + } + } + + // sort the steprefs by the index of the segment in the segment pool + // by extracting the actual numeric index from the Id + all_steps.sort_by_key(|a| a.0.index()); + + // once sorted, steps that cross the same segment will be contiguous in all_steps + // this allows us to generate spans over steps that cross over the same segment, + // storing them in segment_steps + + // the working segment's index + let mut seg_ind: &Id = &all_steps[0].0; + + // the working start of the span of StepRefs + let mut span_start: usize = 0; + + // the vector of spans of step refs that will act as our index + let mut segment_steps: Vec> = Vec::new(); + + // fill the segment_steps with empty spans + // TODO: we definitely don't need to do another iteration to fill this with empty spans + // It's likely more efficient to push empty spans as needed + for _ in 0..fgfa.segs.len() { + segment_steps.push(Span::new_empty()); + } + + // enumerate over each step ref in all_steps to build our spans + for (i, (curr_seg_ind, _)) in all_steps.iter().enumerate() { + // if we have moved onto a new segment add to the segment_steps vec + if curr_seg_ind.index() > seg_ind.index() { + // create and set the new span of step refs for this segment + // it will be left inclusive right exclusive + let new_span: Span = Span::new(Id::new(span_start), Id::new(i)); + + // assign the span to the index in segment_steps that maps to the index of the segment in the FlatGFA segment pool + segment_steps[seg_ind.index()] = new_span; + + // update the working seg_ind and range_start variables to reflect the new segment/range + seg_ind = curr_seg_ind; + span_start = i; + } + } + + // because we've only set a new span whenever we move onto a new segment + // we will need to set the final span after iterating + let new_span: Span<_> = Span::new(Id::new(span_start), Id::new(all_steps.len())); + segment_steps[seg_ind.index()] = new_span; + + // unpackage all_steps into a vector with just spans + let steps: Vec = all_steps.into_iter().map(|x| x.1).collect(); + + Self { + steps, + segment_steps, + } + } + + /// Returns a slice of StepRefs that cross over the given segment + pub fn get_steps_slice(&self, segment: Id) -> &[StepRef] { + let range: Range = Range::from(&self.segment_steps[segment.index()]); + + &(self.steps[range]) + } + + /// Returns the number of steps that cross over this segment + pub fn get_num_steps(&self, segment: Id) -> usize { + self.segment_steps[segment.index()].len() + } +} diff --git a/flatgfa/src/lib.rs b/flatgfa/src/lib.rs index 7b5eeafe..c132ddd9 100644 --- a/flatgfa/src/lib.rs +++ b/flatgfa/src/lib.rs @@ -3,6 +3,7 @@ pub mod file; pub mod flatbed; pub mod flatgfa; pub mod gfaline; +pub mod index; pub mod memfile; pub mod namemap; pub mod ops; diff --git a/flatgfa/src/ops/depth.rs b/flatgfa/src/ops/depth.rs index b46070d5..1467e16e 100644 --- a/flatgfa/src/ops/depth.rs +++ b/flatgfa/src/ops/depth.rs @@ -1,4 +1,5 @@ use crate::flatgfa; +use crate::index; use bit_vec::BitVec; /// Compute the *depth* of each segment in the variation graph. @@ -8,7 +9,7 @@ use bit_vec::BitVec; /// which only counts each path that tarverses a given segment once. /// /// Both outputs are depth values indexed by segment ID. -pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec, Vec) { +pub fn depth(gfa: &flatgfa::FlatGFA, use_index: bool) -> (Vec, Vec) { // Our output vectors: the ordinary and unique depths of each segment. let mut depths = vec![0; gfa.segs.len()]; let mut uniq_depths = vec![0; gfa.segs.len()]; @@ -18,15 +19,53 @@ pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec, Vec) { // subsequent traversals (for the purpose of counting unique depth). let mut seen = BitVec::from_elem(gfa.segs.len(), false); - for path in gfa.paths.all().iter() { - seen.clear(); // All segments are unseen. - for step in &gfa.steps[path.steps] { - let seg_id = step.segment().index(); - depths[seg_id] += 1; - if !seen[seg_id] { - // The first traversal of this path over this segment. - uniq_depths[seg_id] += 1; - seen.set(seg_id, true); + if use_index { + // Build the index + let step_seg_index = index::StepsBySegIndex::new(gfa); + + // This bit vector keeps track of whether the current *path* has already been seen + // by the current working *segment*. A reverse approach to the non-indexed option + let mut path_seen: BitVec = BitVec::from_elem(gfa.paths.len(), false); + + // iterate over each segment and populate the output vectors + for (seg_id, _) in gfa.segs.items() { + // clear the path_seen vector, since all paths should be unseen + path_seen.clear(); + + // get the actual offset of the id of thesegment + let ind = seg_id.index(); + + // get the span of StepRefs for this segment + let span = step_seg_index.get_steps_slice(seg_id); + + // use that offset to directly modify the depths vector + depths[ind] = span.len(); + + // iterate over the span to populate the uniq_depths vector + for stepref in span.iter() { + // extract the path index + let path_id = stepref.path.index(); + + // if the path has not been seen, increment the unique depth + if !path_seen[path_id] { + uniq_depths[ind] += 1; + + // set the path to seen in the path_seen bitvec + path_seen.set(path_id, true); + } + } + } + } else { + for path in gfa.paths.all().iter() { + seen.clear(); // All segments are unseen. + for step in &gfa.steps[path.steps] { + let seg_id = step.segment().index(); + depths[seg_id] += 1; + if !seen[seg_id] { + // The first traversal of this path over this segment. + uniq_depths[seg_id] += 1; + seen.set(seg_id, true); + } } } }