Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions flatgfa/src/cli/cmds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,10 +196,14 @@ pub fn extract(
/// compute node depth, the number of times paths cross a node
#[derive(FromArgs, PartialEq, Debug)]
#[argh(subcommand, name = "depth")]
pub struct Depth {}
pub struct Depth {
/// enable usage of StepsBySegIndex
#[argh(switch, short = 'n')]
index: bool,
}

pub fn depth(gfa: &flatgfa::FlatGFA) {
let (depths, uniq_paths) = ops::depth::depth(gfa);
pub fn depth(gfa: &flatgfa::FlatGFA, args: Depth) {
let (depths, uniq_paths) = ops::depth::depth(gfa, args.index);

println!("#node.id\tdepth\tdepth.uniq");
for (id, seg) in gfa.segs.items() {
Expand Down
4 changes: 2 additions & 2 deletions flatgfa/src/cli/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ fn main() -> Result<(), &'static str> {
let store = cmds::extract(&gfa, sub_args)?;
dump(&store.as_ref(), &args.output);
}
Some(Command::Depth(_)) => {
cmds::depth(&gfa);
Some(Command::Depth(sub_args)) => {
cmds::depth(&gfa, sub_args);
}
Some(Command::Chop(sub_args)) => {
let store = cmds::chop(&gfa, sub_args)?;
Expand Down
124 changes: 124 additions & 0 deletions flatgfa/src/index.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
use std::ops::Range;

use crate::flatgfa::{FlatGFA, Path, Segment};
use crate::pool::{Id, Span};

/// A reference to a specific Step within a Path
pub struct StepRef {
/// The ID of the Path
pub path: Id<Path>,

/// The offset of the Step within that Path's Span of Steps
pub offset: u32,
}

/// An in memory index that provides all Steps over a given Segment
pub struct StepsBySegIndex {
/// All Steps that are accessible by the index.
/// StepRefs that are cross over the same segment are contiguous
pub steps: Vec<StepRef>,

/// Spans into the steps vector. All Steps in the Span cross over one Segment.
/// A Span's index maps to a Segment's index in a corresponding FlatGFA
pub segment_steps: Vec<Span<StepRef>>,
}

// TODO: make this return an Id<Segment> instead of a usize
/// Extract the segment index from a StepRef in a given FlatGFA
fn segment_of_step(fgfa: &FlatGFA, step: &StepRef) -> Id<Segment> {
let path = &fgfa.paths[step.path];
let step_span = path.steps;
let step_slice = &fgfa.steps[step_span];
step_slice[step.offset as usize].segment()
}

impl StepsBySegIndex {
pub fn new(fgfa: &FlatGFA) -> Self {
// will be our `steps` vector that contains all steprefs
let mut all_steps = Vec::new();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would be a bit clearer and more efficient using a Vec::collect() to avoid the mutability and the pushes in a loop. Here's how that would look:

let all_steps: Vec<_> = fgfa.paths.items().map(|(path_id, path)| {
  // your loop body here
  (seg, step)
}).collect();


// build the vector of all the steprefs, by iterating over each path, so we can construct using a path id and offset
// package the Id<Segment> with the StepRef's into tuples so sorting by it is easier
for (path_id, path) in fgfa.paths.items() {
for (offset, _) in fgfa.get_path_steps(path).enumerate() {
// the constructed StepRef based on the path and step offset
let step = StepRef {
path: path_id,
offset: offset as u32,
};

// the segment that this step passes over in the form of a Id<Segment>
let seg = segment_of_step(fgfa, &step);

// push the tuple of the Id<Segment> and StepRef into the steps vector
all_steps.push((seg, step));
}
}

// sort the steprefs by the index of the segment in the segment pool
// by extracting the actual numeric index from the Id<Segment>
all_steps.sort_by_key(|a| a.0.index());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mentioned you weren't sure whether this addressed my comment about sorting stuff. It does! This is exactly what I was thinking.


// once sorted, steps that cross the same segment will be contiguous in all_steps
// this allows us to generate spans over steps that cross over the same segment,
// storing them in segment_steps

// the working segment's index
let mut seg_ind: &Id<Segment> = &all_steps[0].0;

// the working start of the span of StepRefs
let mut span_start: usize = 0;

// the vector of spans of step refs that will act as our index
let mut segment_steps: Vec<Span<StepRef>> = Vec::new();

// fill the segment_steps with empty spans
// TODO: we definitely don't need to do another iteration to fill this with empty spans
// It's likely more efficient to push empty spans as needed
Comment on lines +76 to +77
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be worth measuring the cost. You could do a little benchmarking with Hyperfine to see what happens if you make these uninitialized.

for _ in 0..fgfa.segs.len() {
segment_steps.push(Span::new_empty());
}
Comment on lines +78 to +80
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you do want to initialize a big array, there is a short syntax for that too: vec![n; initial]. So something like vec![fgfa.segs.len(); Span::new_empty()].

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was my initial thought! But I ran into errors with Spans not being cloneable (or something similar, I'll update this comment with the exact error I got).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On further inspection, I think I would just need to make sure that StepRef needs to be cloneable. I think that's the reason I was running into issues earlier.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we could make it cloneable!


// enumerate over each step ref in all_steps to build our spans
for (i, (curr_seg_ind, _)) in all_steps.iter().enumerate() {
// if we have moved onto a new segment add to the segment_steps vec
if curr_seg_ind.index() > seg_ind.index() {
// create and set the new span of step refs for this segment
// it will be left inclusive right exclusive
let new_span: Span<StepRef> = Span::new(Id::new(span_start), Id::new(i));

// assign the span to the index in segment_steps that maps to the index of the segment in the FlatGFA segment pool
segment_steps[seg_ind.index()] = new_span;
Comment on lines +88 to +91
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because you are starting with an initialized array of spans, you could just mutate it in place instead of replacing the old one. It is also worth measuring whether this makes a difference too…


// update the working seg_ind and range_start variables to reflect the new segment/range
seg_ind = curr_seg_ind;
span_start = i;
}
}

// because we've only set a new span whenever we move onto a new segment
// we will need to set the final span after iterating
let new_span: Span<_> = Span::new(Id::new(span_start), Id::new(all_steps.len()));
segment_steps[seg_ind.index()] = new_span;

// unpackage all_steps into a vector with just spans
let steps: Vec<StepRef> = all_steps.into_iter().map(|x| x.1).collect();

Self {
steps,
segment_steps,
}
}

/// Returns a slice of StepRefs that cross over the given segment
pub fn get_steps_slice(&self, segment: Id<Segment>) -> &[StepRef] {
let range: Range<usize> = Range::from(&self.segment_steps[segment.index()]);

&(self.steps[range])
}

/// Returns the number of steps that cross over this segment
pub fn get_num_steps(&self, segment: Id<Segment>) -> usize {
self.segment_steps[segment.index()].len()
}
}
1 change: 1 addition & 0 deletions flatgfa/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ pub mod file;
pub mod flatbed;
pub mod flatgfa;
pub mod gfaline;
pub mod index;
pub mod memfile;
pub mod namemap;
pub mod ops;
Expand Down
59 changes: 49 additions & 10 deletions flatgfa/src/ops/depth.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::flatgfa;
use crate::index;
use bit_vec::BitVec;

/// Compute the *depth* of each segment in the variation graph.
Expand All @@ -8,7 +9,7 @@ use bit_vec::BitVec;
/// which only counts each path that tarverses a given segment once.
///
/// Both outputs are depth values indexed by segment ID.
pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec<usize>, Vec<usize>) {
pub fn depth(gfa: &flatgfa::FlatGFA, use_index: bool) -> (Vec<usize>, Vec<usize>) {
// Our output vectors: the ordinary and unique depths of each segment.
let mut depths = vec![0; gfa.segs.len()];
let mut uniq_depths = vec![0; gfa.segs.len()];
Expand All @@ -18,15 +19,53 @@ pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec<usize>, Vec<usize>) {
// subsequent traversals (for the purpose of counting unique depth).
let mut seen = BitVec::from_elem(gfa.segs.len(), false);

for path in gfa.paths.all().iter() {
seen.clear(); // All segments are unseen.
for step in &gfa.steps[path.steps] {
let seg_id = step.segment().index();
depths[seg_id] += 1;
if !seen[seg_id] {
// The first traversal of this path over this segment.
uniq_depths[seg_id] += 1;
seen.set(seg_id, true);
if use_index {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because these two routes have such completely different implementations, I say let's just put them in separate functions. It will make each one easier to read.

// Build the index
let step_seg_index = index::StepsBySegIndex::new(gfa);

// This bit vector keeps track of whether the current *path* has already been seen
// by the current working *segment*. A reverse approach to the non-indexed option
let mut path_seen: BitVec = BitVec::from_elem(gfa.paths.len(), false);

// iterate over each segment and populate the output vectors
for (seg_id, _) in gfa.segs.items() {
// clear the path_seen vector, since all paths should be unseen
path_seen.clear();

// get the actual offset of the id of thesegment
let ind = seg_id.index();

// get the span of StepRefs for this segment
let span = step_seg_index.get_steps_slice(seg_id);

// use that offset to directly modify the depths vector
depths[ind] = span.len();

// iterate over the span to populate the uniq_depths vector
for stepref in span.iter() {
// extract the path index
let path_id = stepref.path.index();

// if the path has not been seen, increment the unique depth
if !path_seen[path_id] {
uniq_depths[ind] += 1;

// set the path to seen in the path_seen bitvec
path_seen.set(path_id, true);
}
}
}
} else {
for path in gfa.paths.all().iter() {
seen.clear(); // All segments are unseen.
for step in &gfa.steps[path.steps] {
let seg_id = step.segment().index();
depths[seg_id] += 1;
if !seen[seg_id] {
// The first traversal of this path over this segment.
uniq_depths[seg_id] += 1;
seen.set(seg_id, true);
}
}
}
}
Expand Down
Loading