-
Notifications
You must be signed in to change notification settings - Fork 2
Minimal Steps by Segment Index PR #239
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
4cec0f0
0eb4415
523f690
26fb424
5ed77de
887d72a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,124 @@ | ||
| use std::ops::Range; | ||
|
|
||
| use crate::flatgfa::{FlatGFA, Path, Segment}; | ||
| use crate::pool::{Id, Span}; | ||
|
|
||
| /// A reference to a specific Step within a Path | ||
| pub struct StepRef { | ||
| /// The ID of the Path | ||
| pub path: Id<Path>, | ||
|
|
||
| /// The offset of the Step within that Path's Span of Steps | ||
| pub offset: u32, | ||
| } | ||
|
|
||
| /// An in memory index that provides all Steps over a given Segment | ||
| pub struct StepsBySegIndex { | ||
| /// All Steps that are accessible by the index. | ||
| /// StepRefs that are cross over the same segment are contiguous | ||
| pub steps: Vec<StepRef>, | ||
|
|
||
| /// Spans into the steps vector. All Steps in the Span cross over one Segment. | ||
| /// A Span's index maps to a Segment's index in a corresponding FlatGFA | ||
| pub segment_steps: Vec<Span<StepRef>>, | ||
| } | ||
|
|
||
| // TODO: make this return an Id<Segment> instead of a usize | ||
| /// Extract the segment index from a StepRef in a given FlatGFA | ||
| fn segment_of_step(fgfa: &FlatGFA, step: &StepRef) -> Id<Segment> { | ||
| let path = &fgfa.paths[step.path]; | ||
| let step_span = path.steps; | ||
| let step_slice = &fgfa.steps[step_span]; | ||
| step_slice[step.offset as usize].segment() | ||
| } | ||
|
|
||
| impl StepsBySegIndex { | ||
| pub fn new(fgfa: &FlatGFA) -> Self { | ||
| // will be our `steps` vector that contains all steprefs | ||
| let mut all_steps = Vec::new(); | ||
|
|
||
| // build the vector of all the steprefs, by iterating over each path, so we can construct using a path id and offset | ||
| // package the Id<Segment> with the StepRef's into tuples so sorting by it is easier | ||
| for (path_id, path) in fgfa.paths.items() { | ||
| for (offset, _) in fgfa.get_path_steps(path).enumerate() { | ||
| // the constructed StepRef based on the path and step offset | ||
| let step = StepRef { | ||
| path: path_id, | ||
| offset: offset as u32, | ||
| }; | ||
|
|
||
| // the segment that this step passes over in the form of a Id<Segment> | ||
| let seg = segment_of_step(fgfa, &step); | ||
|
|
||
| // push the tuple of the Id<Segment> and StepRef into the steps vector | ||
| all_steps.push((seg, step)); | ||
| } | ||
| } | ||
|
|
||
| // sort the steprefs by the index of the segment in the segment pool | ||
| // by extracting the actual numeric index from the Id<Segment> | ||
| all_steps.sort_by_key(|a| a.0.index()); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You mentioned you weren't sure whether this addressed my comment about sorting stuff. It does! This is exactly what I was thinking. |
||
|
|
||
| // once sorted, steps that cross the same segment will be contiguous in all_steps | ||
| // this allows us to generate spans over steps that cross over the same segment, | ||
| // storing them in segment_steps | ||
|
|
||
| // the working segment's index | ||
| let mut seg_ind: &Id<Segment> = &all_steps[0].0; | ||
|
|
||
| // the working start of the span of StepRefs | ||
| let mut span_start: usize = 0; | ||
|
|
||
| // the vector of spans of step refs that will act as our index | ||
| let mut segment_steps: Vec<Span<StepRef>> = Vec::new(); | ||
|
|
||
| // fill the segment_steps with empty spans | ||
| // TODO: we definitely don't need to do another iteration to fill this with empty spans | ||
| // It's likely more efficient to push empty spans as needed | ||
|
Comment on lines
+76
to
+77
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would be worth measuring the cost. You could do a little benchmarking with Hyperfine to see what happens if you make these uninitialized. |
||
| for _ in 0..fgfa.segs.len() { | ||
| segment_steps.push(Span::new_empty()); | ||
| } | ||
|
Comment on lines
+78
to
+80
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you do want to initialize a big array, there is a short syntax for that too:
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was my initial thought! But I ran into errors with Spans not being cloneable (or something similar, I'll update this comment with the exact error I got).
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On further inspection, I think I would just need to make sure that
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, we could make it cloneable! |
||
|
|
||
| // enumerate over each step ref in all_steps to build our spans | ||
| for (i, (curr_seg_ind, _)) in all_steps.iter().enumerate() { | ||
| // if we have moved onto a new segment add to the segment_steps vec | ||
| if curr_seg_ind.index() > seg_ind.index() { | ||
| // create and set the new span of step refs for this segment | ||
| // it will be left inclusive right exclusive | ||
| let new_span: Span<StepRef> = Span::new(Id::new(span_start), Id::new(i)); | ||
|
|
||
| // assign the span to the index in segment_steps that maps to the index of the segment in the FlatGFA segment pool | ||
| segment_steps[seg_ind.index()] = new_span; | ||
|
Comment on lines
+88
to
+91
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because you are starting with an initialized array of spans, you could just mutate it in place instead of replacing the old one. It is also worth measuring whether this makes a difference too… |
||
|
|
||
| // update the working seg_ind and range_start variables to reflect the new segment/range | ||
| seg_ind = curr_seg_ind; | ||
| span_start = i; | ||
| } | ||
| } | ||
|
|
||
| // because we've only set a new span whenever we move onto a new segment | ||
| // we will need to set the final span after iterating | ||
| let new_span: Span<_> = Span::new(Id::new(span_start), Id::new(all_steps.len())); | ||
| segment_steps[seg_ind.index()] = new_span; | ||
|
|
||
| // unpackage all_steps into a vector with just spans | ||
| let steps: Vec<StepRef> = all_steps.into_iter().map(|x| x.1).collect(); | ||
|
|
||
| Self { | ||
| steps, | ||
| segment_steps, | ||
| } | ||
| } | ||
|
|
||
| /// Returns a slice of StepRefs that cross over the given segment | ||
| pub fn get_steps_slice(&self, segment: Id<Segment>) -> &[StepRef] { | ||
| let range: Range<usize> = Range::from(&self.segment_steps[segment.index()]); | ||
|
|
||
| &(self.steps[range]) | ||
| } | ||
|
|
||
| /// Returns the number of steps that cross over this segment | ||
| pub fn get_num_steps(&self, segment: Id<Segment>) -> usize { | ||
| self.segment_steps[segment.index()].len() | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| use crate::flatgfa; | ||
| use crate::index; | ||
| use bit_vec::BitVec; | ||
|
|
||
| /// Compute the *depth* of each segment in the variation graph. | ||
|
|
@@ -8,7 +9,7 @@ use bit_vec::BitVec; | |
| /// which only counts each path that tarverses a given segment once. | ||
| /// | ||
| /// Both outputs are depth values indexed by segment ID. | ||
| pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec<usize>, Vec<usize>) { | ||
| pub fn depth(gfa: &flatgfa::FlatGFA, use_index: bool) -> (Vec<usize>, Vec<usize>) { | ||
| // Our output vectors: the ordinary and unique depths of each segment. | ||
| let mut depths = vec![0; gfa.segs.len()]; | ||
| let mut uniq_depths = vec![0; gfa.segs.len()]; | ||
|
|
@@ -18,15 +19,53 @@ pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec<usize>, Vec<usize>) { | |
| // subsequent traversals (for the purpose of counting unique depth). | ||
| let mut seen = BitVec::from_elem(gfa.segs.len(), false); | ||
|
|
||
| for path in gfa.paths.all().iter() { | ||
| seen.clear(); // All segments are unseen. | ||
| for step in &gfa.steps[path.steps] { | ||
| let seg_id = step.segment().index(); | ||
| depths[seg_id] += 1; | ||
| if !seen[seg_id] { | ||
| // The first traversal of this path over this segment. | ||
| uniq_depths[seg_id] += 1; | ||
| seen.set(seg_id, true); | ||
| if use_index { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because these two routes have such completely different implementations, I say let's just put them in separate functions. It will make each one easier to read. |
||
| // Build the index | ||
| let step_seg_index = index::StepsBySegIndex::new(gfa); | ||
|
|
||
| // This bit vector keeps track of whether the current *path* has already been seen | ||
| // by the current working *segment*. A reverse approach to the non-indexed option | ||
| let mut path_seen: BitVec = BitVec::from_elem(gfa.paths.len(), false); | ||
|
|
||
| // iterate over each segment and populate the output vectors | ||
| for (seg_id, _) in gfa.segs.items() { | ||
| // clear the path_seen vector, since all paths should be unseen | ||
| path_seen.clear(); | ||
|
|
||
| // get the actual offset of the id of thesegment | ||
| let ind = seg_id.index(); | ||
|
|
||
| // get the span of StepRefs for this segment | ||
| let span = step_seg_index.get_steps_slice(seg_id); | ||
|
|
||
| // use that offset to directly modify the depths vector | ||
| depths[ind] = span.len(); | ||
|
|
||
| // iterate over the span to populate the uniq_depths vector | ||
| for stepref in span.iter() { | ||
| // extract the path index | ||
| let path_id = stepref.path.index(); | ||
|
|
||
| // if the path has not been seen, increment the unique depth | ||
| if !path_seen[path_id] { | ||
| uniq_depths[ind] += 1; | ||
|
|
||
| // set the path to seen in the path_seen bitvec | ||
| path_seen.set(path_id, true); | ||
| } | ||
| } | ||
| } | ||
| } else { | ||
| for path in gfa.paths.all().iter() { | ||
| seen.clear(); // All segments are unseen. | ||
| for step in &gfa.steps[path.steps] { | ||
| let seg_id = step.segment().index(); | ||
| depths[seg_id] += 1; | ||
| if !seen[seg_id] { | ||
| // The first traversal of this path over this segment. | ||
| uniq_depths[seg_id] += 1; | ||
| seen.set(seg_id, true); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This would be a bit clearer and more efficient using a
Vec::collect()to avoid the mutability and thepushes in a loop. Here's how that would look: