-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathblock_group.rs
More file actions
2809 lines (2653 loc) · 102 KB
/
block_group.rs
File metadata and controls
2809 lines (2653 loc) · 102 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
use std::collections::{HashMap, HashSet};
use std::rc::Rc;
use crate::graph::{
all_intermediate_edges, all_reachable_nodes, all_simple_paths, flatten_to_interval_tree,
GraphEdge, GraphNode,
};
use crate::models::accession::{Accession, AccessionEdge, AccessionEdgeData, AccessionPath};
use crate::models::block_group_edge::{AugmentedEdgeData, BlockGroupEdge, BlockGroupEdgeData};
use crate::models::edge::{Edge, EdgeData, GroupBlock};
use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID};
use crate::models::path::{Path, PathBlock, PathData};
use crate::models::path_edge::PathEdge;
use crate::models::strand::Strand;
use crate::models::traits::*;
use intervaltree::IntervalTree;
use petgraph::graphmap::DiGraphMap;
use petgraph::Direction;
use rusqlite::{params, params_from_iter, types::Value as SQLValue, Connection, Row};
use serde::{Deserialize, Serialize};
use thiserror::Error;
#[derive(Debug, Deserialize, Serialize)]
pub struct BlockGroup {
pub id: i64,
pub collection_name: String,
pub sample_name: Option<String>,
pub name: String,
}
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct BlockGroupData<'a> {
pub collection_name: &'a str,
pub sample_name: Option<&'a str>,
pub name: String,
}
#[derive(Clone, Debug)]
pub struct PathChange {
pub block_group_id: i64,
pub path: Path,
pub path_accession: Option<String>,
pub start: i64,
pub end: i64,
pub block: PathBlock,
pub chromosome_index: i64,
pub phased: i64,
}
pub struct PathCache<'a> {
pub cache: HashMap<PathData, Path>,
pub intervaltree_cache: HashMap<Path, IntervalTree<i64, NodeIntervalBlock>>,
pub conn: &'a Connection,
}
impl PathCache<'_> {
pub fn new(conn: &Connection) -> PathCache {
PathCache {
cache: HashMap::<PathData, Path>::new(),
intervaltree_cache: HashMap::<Path, IntervalTree<i64, NodeIntervalBlock>>::new(),
conn,
}
}
pub fn lookup(path_cache: &mut PathCache, block_group_id: i64, name: String) -> Path {
let path_key = PathData {
name: name.clone(),
block_group_id,
};
let path_lookup = path_cache.cache.get(&path_key);
if let Some(path) = path_lookup {
path.clone()
} else {
let new_path = Path::query(
path_cache.conn,
"select * from paths where block_group_id = ?1 AND name = ?2",
rusqlite::params!(SQLValue::from(block_group_id), SQLValue::from(name)),
)[0]
.clone();
path_cache.cache.insert(path_key, new_path.clone());
let tree = new_path.intervaltree(path_cache.conn);
path_cache.intervaltree_cache.insert(new_path.clone(), tree);
new_path
}
}
pub fn get_intervaltree<'a>(
path_cache: &'a PathCache<'a>,
path: &'a Path,
) -> Option<&'a IntervalTree<i64, NodeIntervalBlock>> {
path_cache.intervaltree_cache.get(path)
}
}
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)]
pub struct NodeIntervalBlock {
pub block_id: i64,
pub node_id: i64,
pub start: i64,
pub end: i64,
pub sequence_start: i64,
pub sequence_end: i64,
pub strand: Strand,
}
#[derive(Debug, Error, PartialEq)]
pub enum ChangeError {
#[error("Operation Error: {0}")]
OutOfBounds(String),
}
impl BlockGroup {
pub fn create(
conn: &Connection,
collection_name: &str,
sample_name: Option<&str>,
name: &str,
) -> BlockGroup {
let query = "INSERT INTO block_groups (collection_name, sample_name, name) VALUES (?1, ?2, ?3) RETURNING *";
let mut stmt = conn.prepare(query).unwrap();
match stmt.query_row((collection_name, sample_name, name), |row| {
Ok(BlockGroup {
id: row.get(0)?,
collection_name: row.get(1)?,
sample_name: row.get(2)?,
name: row.get(3)?,
})
}) {
Ok(res) => res,
Err(rusqlite::Error::SqliteFailure(err, _details)) => {
if err.code == rusqlite::ErrorCode::ConstraintViolation {
let bg_id = match sample_name {
Some(v) => {conn
.query_row(
"select id from block_groups where collection_name = ?1 and sample_name = ?2 and name = ?3",
(collection_name, v, name),
|row| row.get(0),
)
.unwrap()}
None => {
conn
.query_row(
"select id from block_groups where collection_name = ?1 and sample_name is null and name = ?2",
(collection_name, name),
|row| row.get(0),
)
.unwrap()
}
};
BlockGroup {
id: bg_id,
collection_name: collection_name.to_string(),
sample_name: sample_name.map(|s| s.to_string()),
name: name.to_string(),
}
} else {
panic!("something bad happened querying the database")
}
}
Err(_) => {
panic!("something bad happened querying the database")
}
}
}
pub fn get_by_id(conn: &Connection, id: i64) -> BlockGroup {
let query = "SELECT * FROM block_groups WHERE id = ?1";
let mut stmt = conn.prepare(query).unwrap();
match stmt.query_row(params_from_iter(vec![SQLValue::from(id)]), |row| {
Ok(BlockGroup {
id: row.get(0)?,
collection_name: row.get(1)?,
sample_name: row.get(2)?,
name: row.get(3)?,
})
}) {
Ok(res) => res,
Err(rusqlite::Error::QueryReturnedNoRows) => panic!("No block group with id {}", id),
Err(_) => {
panic!("something bad happened querying the database")
}
}
}
pub fn get_by_ids(conn: &Connection, ids: &[i64]) -> Vec<BlockGroup> {
let query = "SELECT * FROM block_groups WHERE id IN rarray(?1)";
Self::query(
conn,
query,
params!(Rc::new(
ids.iter().map(|i| SQLValue::from(*i)).collect::<Vec<_>>()
)),
)
}
pub fn clone(conn: &Connection, source_block_group_id: i64, target_block_group_id: i64) {
let existing_paths = Path::query(
conn,
"SELECT * from paths where block_group_id = ?1 ORDER BY id ASC;",
rusqlite::params!(SQLValue::from(source_block_group_id)),
);
let augmented_edges = BlockGroupEdge::edges_for_block_group(conn, source_block_group_id);
let edge_ids = augmented_edges
.iter()
.map(|edge| edge.edge.id)
.collect::<Vec<i64>>();
let new_block_group_edges = edge_ids
.iter()
.enumerate()
.map(|(i, edge_id)| BlockGroupEdgeData {
block_group_id: target_block_group_id,
edge_id: *edge_id,
chromosome_index: augmented_edges[i].chromosome_index,
phased: augmented_edges[i].phased,
})
.collect::<Vec<_>>();
BlockGroupEdge::bulk_create(conn, &new_block_group_edges);
let mut path_map = HashMap::new();
for path in existing_paths.iter() {
let edge_ids = PathEdge::edges_for_path(conn, path.id)
.into_iter()
.map(|edge| edge.id)
.collect::<Vec<i64>>();
let new_path = Path::create(conn, &path.name, target_block_group_id, &edge_ids);
path_map.insert(path.id, new_path.id);
}
for accession in Accession::query(
conn,
"select * from accessions where path_id IN rarray(?1);",
rusqlite::params!(Rc::new(
existing_paths
.iter()
.map(|path| SQLValue::from(path.id))
.collect::<Vec<SQLValue>>()
)),
) {
let edges = AccessionPath::query(
conn,
"Select * from accession_paths where accession_id = ?1 order by index_in_path ASC;",
rusqlite::params!(SQLValue::from(accession.id)),
);
let new_path_id = path_map[&accession.path_id];
let obj = Accession::create(
conn,
&accession.name,
new_path_id,
accession.parent_accession_id,
)
.expect("Unable to create accession in clone.");
AccessionPath::create(
conn,
obj.id,
&edges.iter().map(|ap| ap.edge_id).collect::<Vec<i64>>(),
);
}
}
pub fn get_or_create_sample_block_group(
conn: &Connection,
collection_name: &str,
sample_name: &str,
group_name: &str,
parent_sample: Option<&str>,
) -> Result<i64, &'static str> {
let mut bg_id : i64 = match conn.query_row(
"select id from block_groups where collection_name = ?1 AND sample_name = ?2 AND name = ?3",
(collection_name, sample_name, group_name),
|row| row.get(0),
) {
Ok(res) => res,
Err(rusqlite::Error::QueryReturnedNoRows) => 0,
Err(_e) => {
panic!("Error querying the database: {_e}");
}
};
if bg_id != 0 {
return Ok(bg_id);
} else {
// use the base reference group if it exists
if let Some(parent_sample_name) = parent_sample {
bg_id = match conn.query_row(
"select id from block_groups where collection_name = ?1 AND sample_name = ?2 AND name = ?3",
(collection_name, parent_sample_name, group_name),
|row| row.get(0),
) {
Ok(res) => res,
Err(rusqlite::Error::QueryReturnedNoRows) => 0,
Err(_e) => {
panic!("something bad happened querying the database")
}
}
} else {
bg_id = match conn.query_row(
"select id from block_groups where collection_name = ?1 AND sample_name IS null AND name = ?2",
(collection_name, group_name),
|row| row.get(0),
) {
Ok(res) => res,
Err(rusqlite::Error::QueryReturnedNoRows) => 0,
Err(_e) => {
panic!("something bad happened querying the database")
}
}
}
}
if bg_id == 0 {
let error_string = format!(
"No base path exists for sample {} and graph name {}",
sample_name, group_name
);
return Err(Box::leak(error_string.into_boxed_str()));
}
let new_bg_id = BlockGroup::create(conn, collection_name, Some(sample_name), group_name);
// clone parent blocks/edges/path
BlockGroup::clone(conn, bg_id, new_bg_id.id);
Ok(new_bg_id.id)
}
pub fn get_id(
conn: &Connection,
collection_name: &str,
sample_name: Option<&str>,
group_name: &str,
) -> i64 {
let result = if sample_name.is_some() {
conn.query_row(
"select id from block_groups where collection_name = ?1 AND sample_name = ?2 AND name = ?3",
(collection_name, sample_name, group_name),
|row| row.get(0),
)
} else {
conn.query_row(
"select id from block_groups where collection_name = ?1 AND sample_name IS NULL AND name = ?2",
(collection_name, group_name),
|row| row.get(0),
)
};
match result {
Ok(res) => res,
Err(rusqlite::Error::QueryReturnedNoRows) => 0,
Err(_e) => {
panic!("Error querying the database: {_e}");
}
}
}
pub fn get_graph(conn: &Connection, block_group_id: i64) -> DiGraphMap<GraphNode, GraphEdge> {
let mut edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id);
let blocks = Edge::blocks_from_edges(conn, &edges);
let boundary_edges = Edge::boundary_edges_from_sequences(&blocks);
edges.extend(boundary_edges.clone());
let (graph, _) = Edge::build_graph(&edges, &blocks);
graph
}
pub fn prune_graph(graph: &mut DiGraphMap<GraphNode, GraphEdge>) {
// Prunes a graph by removing edges on the same chromosome_index. This means if 2 edges are
// both "chromosome index 0", we keep the newer one (newer known by the higher edge id).
// TODO: This check is not actually right but allows us to test some functionality wrt
// inherited block groups now. We need to know whether an edge was added to a blockgroup
// via inheritance or created by it. Because edges can be reused, if an edge created
// earlier in some other sample is added to a sample, it may be the correct one but have
// a lower edge id than some edge in the current sample.
let mut root_nodes = HashSet::new();
let mut edges_to_remove: Vec<(GraphNode, GraphNode)> = vec![];
for node in graph.nodes() {
if node.node_id == PATH_START_NODE_ID {
root_nodes.insert(node);
}
let mut edges_by_ci: HashMap<i64, (i64, GraphNode, GraphNode)> = HashMap::new();
for (source_node, target_node, edge_weight) in graph.edges(node) {
edges_by_ci
.entry(edge_weight.chromosome_index)
.and_modify(|(edge_id, source, target)| {
if edge_weight.edge_id > *edge_id {
edges_to_remove.push((*source, *target));
*edge_id = edge_weight.edge_id;
*source = source_node;
*target = target_node;
} else {
edges_to_remove.push((source_node, target_node));
}
})
.or_insert((edge_weight.edge_id, source_node, target_node));
}
}
for (source, target) in edges_to_remove.iter() {
graph.remove_edge(*source, *target);
}
let reachable_nodes = all_reachable_nodes(&*graph, &Vec::from_iter(root_nodes));
let mut to_remove = vec![];
for node in graph.nodes() {
if !reachable_nodes.contains(&node) {
to_remove.push(node);
}
}
for node in to_remove {
graph.remove_node(node);
}
}
pub fn get_all_sequences(
conn: &Connection,
block_group_id: i64,
prune: bool,
) -> HashSet<String> {
let mut edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id);
let blocks = Edge::blocks_from_edges(conn, &edges);
let boundary_edges = Edge::boundary_edges_from_sequences(&blocks);
edges.extend(boundary_edges.clone());
let (mut graph, _) = Edge::build_graph(&edges, &blocks);
if prune {
Self::prune_graph(&mut graph);
}
let mut start_nodes = vec![];
let mut end_nodes = vec![];
for node in graph.nodes() {
if Node::is_start_node(node.node_id)
|| graph
.neighbors_directed(node, Direction::Incoming)
.next()
.is_none()
{
start_nodes.push(node);
} else if Node::is_end_node(node.node_id)
|| graph
.neighbors_directed(node, Direction::Outgoing)
.next()
.is_none()
{
end_nodes.push(node);
}
}
let blocks_by_id = blocks
.clone()
.into_iter()
.map(|block| (block.id, block))
.collect::<HashMap<i64, GroupBlock>>();
let mut sequences = HashSet::<String>::new();
for start_node in start_nodes {
for end_node in &end_nodes {
// TODO: maybe make all_simple_paths return a single path id where start == end
if start_node == *end_node {
let block = blocks_by_id.get(&start_node.block_id).unwrap();
if block.node_id != PATH_START_NODE_ID && block.node_id != PATH_END_NODE_ID {
sequences.insert(block.sequence());
}
} else {
for path in all_simple_paths(&graph, start_node, *end_node) {
let mut current_sequence = "".to_string();
for node in path {
let block = blocks_by_id.get(&node.block_id).unwrap();
let block_sequence = block.sequence();
current_sequence.push_str(&block_sequence);
}
sequences.insert(current_sequence);
}
}
}
}
sequences
}
pub fn add_accession(
conn: &Connection,
path: &Path,
name: &str,
start: i64,
end: i64,
cache: &mut PathCache,
) -> Accession {
let tree = PathCache::get_intervaltree(cache, path).unwrap();
let start_blocks: Vec<&NodeIntervalBlock> =
tree.query_point(start).map(|x| &x.value).collect();
assert_eq!(start_blocks.len(), 1);
let start_block = start_blocks[0];
let end_blocks: Vec<&NodeIntervalBlock> = tree.query_point(end).map(|x| &x.value).collect();
assert_eq!(end_blocks.len(), 1);
let end_block = end_blocks[0];
// we make a start/end edge for the accession start/end, then fill in the middle
// with any existing edges
let start_edge = AccessionEdgeData {
source_node_id: PATH_START_NODE_ID,
source_coordinate: -1,
source_strand: Strand::Forward,
target_node_id: start_block.node_id,
target_coordinate: start - start_block.start + start_block.sequence_start,
target_strand: Strand::Forward,
chromosome_index: 0,
};
let end_edge = AccessionEdgeData {
source_node_id: end_block.node_id,
source_coordinate: end - end_block.start + end_block.sequence_start,
source_strand: Strand::Forward,
target_node_id: PATH_END_NODE_ID,
target_coordinate: -1,
target_strand: Strand::Forward,
chromosome_index: 0,
};
let accession =
Accession::create(conn, name, path.id, None).expect("Unable to create accession.");
let mut path_edges = vec![start_edge];
if start_block == end_block {
path_edges.push(end_edge);
} else {
let mut in_range = false;
let path_blocks: Vec<&NodeIntervalBlock> = tree
.iter_sorted()
.map(|x| &x.value)
.filter(|block| {
if block.block_id == start_block.block_id {
in_range = true;
} else if block.block_id == end_block.block_id {
in_range = false;
return true;
}
in_range
})
.collect::<Vec<_>>();
// if start and end block are not the same, we will always have at least 2 elements in path_blocks
for (block, next_block) in path_blocks.iter().zip(path_blocks[1..].iter()) {
path_edges.push(AccessionEdgeData {
source_node_id: block.node_id,
source_coordinate: block.sequence_end,
source_strand: block.strand,
target_node_id: next_block.node_id,
target_coordinate: next_block.sequence_start,
target_strand: next_block.strand,
chromosome_index: 0,
})
}
path_edges.push(end_edge);
}
AccessionPath::create(
conn,
accession.id,
&AccessionEdge::bulk_create(conn, &path_edges),
);
accession
}
pub fn insert_changes(
conn: &Connection,
changes: &Vec<PathChange>,
cache: &mut PathCache,
modify_blockgroup: bool,
) -> Result<(), ChangeError> {
let mut new_augmented_edges_by_block_group = HashMap::<i64, Vec<AugmentedEdgeData>>::new();
let mut new_accession_edges = HashMap::new();
let mut tree_map = HashMap::new();
for change in changes {
let tree = if modify_blockgroup {
tree_map.entry(change.block_group_id).or_insert_with(|| {
BlockGroup::intervaltree_for(conn, change.block_group_id, true)
})
} else {
PathCache::get_intervaltree(cache, &change.path).unwrap()
};
let new_augmented_edges = BlockGroup::set_up_new_edges(change, tree)?;
new_augmented_edges_by_block_group
.entry(change.block_group_id)
.and_modify(|new_edge_data| new_edge_data.extend(new_augmented_edges.clone()))
.or_insert_with(|| new_augmented_edges.clone());
if let Some(accession) = &change.path_accession {
new_accession_edges
.entry((&change.path, accession))
.and_modify(|new_edge_data: &mut Vec<AugmentedEdgeData>| {
new_edge_data.extend(new_augmented_edges.clone())
})
.or_insert_with(|| new_augmented_edges.clone());
}
}
let mut edge_data_map = HashMap::new();
for (block_group_id, new_augmented_edges) in new_augmented_edges_by_block_group {
let new_edges = new_augmented_edges
.iter()
.map(|augmented_edge| augmented_edge.edge_data.clone())
.collect::<Vec<_>>();
let edge_ids = Edge::bulk_create(conn, &new_edges);
for (i, edge_data) in new_edges.iter().enumerate() {
edge_data_map.insert(edge_data.clone(), edge_ids[i]);
}
let new_block_group_edges = edge_ids
.iter()
.enumerate()
.map(|(i, edge_id)| BlockGroupEdgeData {
block_group_id,
edge_id: *edge_id,
chromosome_index: new_augmented_edges[i].chromosome_index,
phased: new_augmented_edges[i].phased,
})
.collect::<Vec<_>>();
BlockGroupEdge::bulk_create(conn, &new_block_group_edges);
}
for ((path, accession_name), path_edges) in new_accession_edges {
match Accession::get(
conn,
"select * from accessions where name = ?1 AND path_id = ?2",
params![
SQLValue::from(accession_name.clone()),
SQLValue::from(path.id),
],
) {
Ok(_) => {
println!("accession already exists, consider a better matching algorithm to determine if this is an error.");
}
Err(_) => {
let acc_edges = AccessionEdge::bulk_create(
conn,
&path_edges.iter().map(AccessionEdgeData::from).collect(),
);
let acc = Accession::create(conn, accession_name, path.id, None)
.expect("Accession could not be created.");
AccessionPath::create(conn, acc.id, &acc_edges);
}
}
}
Ok(())
}
#[allow(clippy::ptr_arg)]
#[allow(clippy::needless_late_init)]
pub fn insert_change(
conn: &Connection,
change: &PathChange,
tree: &IntervalTree<i64, NodeIntervalBlock>,
) -> Result<(), ChangeError> {
let new_augmented_edges = BlockGroup::set_up_new_edges(change, tree)?;
let new_edges = new_augmented_edges
.iter()
.map(|augmented_edge| augmented_edge.edge_data.clone())
.collect::<Vec<_>>();
let edge_ids = Edge::bulk_create(conn, &new_edges);
let new_block_group_edges = edge_ids
.iter()
.enumerate()
.map(|(i, edge_id)| BlockGroupEdgeData {
block_group_id: change.block_group_id,
edge_id: *edge_id,
chromosome_index: new_augmented_edges[i].chromosome_index,
phased: new_augmented_edges[i].phased,
})
.collect::<Vec<_>>();
BlockGroupEdge::bulk_create(conn, &new_block_group_edges);
Ok(())
}
fn set_up_new_edges(
change: &PathChange,
tree: &IntervalTree<i64, NodeIntervalBlock>,
) -> Result<Vec<AugmentedEdgeData>, ChangeError> {
let start_blocks: Vec<&NodeIntervalBlock> =
tree.query_point(change.start).map(|x| &x.value).collect();
assert_eq!(start_blocks.len(), 1);
// NOTE: This may not be used but needs to be initialized here instead of inside the if
// statement that uses it, so that the borrow checker is happy
let previous_start_blocks: Vec<&NodeIntervalBlock> = tree
.query_point(change.start - 1)
.map(|x| &x.value)
.collect();
assert_eq!(previous_start_blocks.len(), 1);
let start_block = if start_blocks[0].start == change.start {
// First part of this block will be replaced/deleted, need to get previous block to add
// edge including it
previous_start_blocks[0]
} else {
start_blocks[0]
};
// Ensure the change is within the path bounds. The logic here is a bit backwards, where
// we check if the start is before the start block's end and the end is before the end
// block's start. This is because the terminal blocks start and end at the bounds of the
// interval tree. So while it's ok to have a start/end block be the start/end block (for
// changes at the extremes, it's not ok for the change to start beyond the current
// boundaries.
if Node::is_start_node(start_block.node_id) && change.start < start_block.end {
return Err(ChangeError::OutOfBounds(format!("Invalid change specified. Coordinate {pos} is before start of path range ({path_pos}).", pos=change.start, path_pos=start_block.end)));
}
let end_blocks: Vec<&NodeIntervalBlock> =
tree.query_point(change.end).map(|x| &x.value).collect();
assert_eq!(end_blocks.len(), 1);
let end_block = end_blocks[0];
if Node::is_end_node(end_block.node_id) && change.end > end_block.start {
return Err(ChangeError::OutOfBounds(format!("Invalid change specified. Coordinate {pos} is before start of path range ({path_pos}).", pos=change.end, path_pos=end_block.start)));
}
let mut new_edges = vec![];
if change.block.sequence_start == change.block.sequence_end {
// Deletion
let new_edge = EdgeData {
source_node_id: start_block.node_id,
source_coordinate: change.start - start_block.start + start_block.sequence_start,
source_strand: Strand::Forward,
target_node_id: end_block.node_id,
target_coordinate: change.end - end_block.start + end_block.sequence_start,
target_strand: Strand::Forward,
};
let new_augmented_edge = AugmentedEdgeData {
edge_data: new_edge,
chromosome_index: change.chromosome_index,
phased: change.phased,
};
new_edges.push(new_augmented_edge);
// NOTE: If the deletion is happening at the very beginning of a path, we need to add
// an edge from the dedicated start node to the end of the deletion, to indicate it's
// another start point in the block group DAG.
if change.start == 0 {
let new_beginning_edge = EdgeData {
source_node_id: PATH_START_NODE_ID,
source_coordinate: 0,
source_strand: Strand::Forward,
target_node_id: end_block.node_id,
target_coordinate: change.end - end_block.start + end_block.sequence_start,
target_strand: Strand::Forward,
};
let new_augmented_edge = AugmentedEdgeData {
edge_data: new_beginning_edge,
chromosome_index: change.chromosome_index,
phased: change.phased,
};
new_edges.push(new_augmented_edge);
}
// NOTE: If the deletion is happening at the very end of a path, we might add an edge
// from the beginning of the deletion to the dedicated end node, but in practice it
// doesn't affect sequence readouts, so it may not be worth it.
} else {
// Insertion/replacement
let new_start_edge = EdgeData {
source_node_id: start_block.node_id,
source_coordinate: change.start - start_block.start + start_block.sequence_start,
source_strand: Strand::Forward,
target_node_id: change.block.node_id,
target_coordinate: change.block.sequence_start,
target_strand: Strand::Forward,
};
let new_augmented_start_edge = AugmentedEdgeData {
edge_data: new_start_edge,
chromosome_index: change.chromosome_index,
phased: change.phased,
};
let new_end_edge = EdgeData {
source_node_id: change.block.node_id,
source_coordinate: change.block.sequence_end,
source_strand: Strand::Forward,
target_node_id: end_block.node_id,
target_coordinate: change.end - end_block.start + end_block.sequence_start,
target_strand: Strand::Forward,
};
let new_augmented_end_edge = AugmentedEdgeData {
edge_data: new_end_edge,
chromosome_index: change.chromosome_index,
phased: change.phased,
};
new_edges.push(new_augmented_start_edge);
new_edges.push(new_augmented_end_edge);
}
Ok(new_edges)
}
pub fn intervaltree_for(
conn: &Connection,
block_group_id: i64,
remove_ambiguous_positions: bool,
) -> IntervalTree<i64, NodeIntervalBlock> {
// make a tree where every node has a span in the graph.
let mut graph = BlockGroup::get_graph(conn, block_group_id);
BlockGroup::prune_graph(&mut graph);
flatten_to_interval_tree(&graph, remove_ambiguous_positions)
}
pub fn get_current_path(conn: &Connection, block_group_id: i64) -> Path {
let paths = Path::query(
conn,
"SELECT * FROM paths WHERE block_group_id = ?1 ORDER BY id DESC",
rusqlite::params!(SQLValue::from(block_group_id)),
);
paths[0].clone()
}
pub fn derive_subgraph(
conn: &Connection,
source_block_group_id: i64,
start_block: &NodeIntervalBlock,
end_block: &NodeIntervalBlock,
start_node_coordinate: i64,
end_node_coordinate: i64,
target_block_group_id: i64,
) -> i64 {
let current_graph = BlockGroup::get_graph(conn, source_block_group_id);
let start_node = current_graph
.nodes()
.find(|node| {
node.node_id == start_block.node_id
&& node.sequence_start <= start_node_coordinate
&& node.sequence_end >= start_node_coordinate
})
.unwrap();
let end_node = current_graph
.nodes()
.find(|node| {
node.node_id == end_block.node_id
&& node.sequence_start <= end_node_coordinate
&& node.sequence_end >= end_node_coordinate
})
.unwrap();
let subgraph_edges = all_intermediate_edges(¤t_graph, start_node, end_node);
// Filter out internal edges (boundary edges) that don't exist in the database
let subgraph_edge_ids = subgraph_edges
.iter()
.map(|(_to, _from, edge_info)| edge_info.edge_id)
.collect::<Vec<_>>();
let source_edges = Edge::bulk_load(conn, &subgraph_edge_ids);
let source_block_group_edges = BlockGroupEdge::specific_edges_for_block_group(
conn,
source_block_group_id,
&subgraph_edge_ids,
);
let source_edge_ids = source_edges
.iter()
.map(|edge| edge.id)
.collect::<HashSet<_>>();
let source_block_group_edges = source_block_group_edges
.iter()
.filter(|block_group_edge| source_edge_ids.contains(&block_group_edge.edge.id))
.collect::<Vec<_>>();
let source_block_group_edges_by_edge_id = source_block_group_edges
.iter()
.map(|block_group_edge| (block_group_edge.edge.id, block_group_edge))
.collect::<HashMap<_, _>>();
let subgraph_edge_inputs = source_block_group_edges
.iter()
.map(|edge| {
let block_group_edge = source_block_group_edges_by_edge_id
.get(&edge.edge.id)
.unwrap();
BlockGroupEdgeData {
block_group_id: target_block_group_id,
edge_id: edge.edge.id,
chromosome_index: block_group_edge.chromosome_index,
phased: block_group_edge.phased,
}
})
.collect::<Vec<_>>();
let new_start_edge = Edge::create(
conn,
PATH_START_NODE_ID,
-1,
Strand::Forward,
start_block.node_id,
start_node_coordinate,
start_block.strand,
);
let new_start_edge_data = BlockGroupEdgeData {
block_group_id: target_block_group_id,
edge_id: new_start_edge.id,
chromosome_index: 0,
phased: 0,
};
let new_end_edge = Edge::create(
conn,
end_block.node_id,
end_node_coordinate,
end_block.strand,
PATH_END_NODE_ID,
-1,
Strand::Forward,
);
let new_end_edge_data = BlockGroupEdgeData {
block_group_id: target_block_group_id,
edge_id: new_end_edge.id,
chromosome_index: 0,
phased: 0,
};
let mut all_edges = subgraph_edge_inputs.clone();
all_edges.push(new_start_edge_data);
all_edges.push(new_end_edge_data);
BlockGroupEdge::bulk_create(conn, &all_edges);
target_block_group_id
}
}
impl Query for BlockGroup {
type Model = BlockGroup;
fn process_row(row: &Row) -> Self::Model {
BlockGroup {
id: row.get(0).unwrap(),
collection_name: row.get(1).unwrap(),
sample_name: row.get(2).unwrap(),
name: row.get(3).unwrap(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::models::{collection::Collection, node::Node, sample::Sample, sequence::Sequence};
use crate::test_helpers::{get_connection, interval_tree_verify, setup_block_group};
use core::ops::Range;
#[test]
fn test_blockgroup_create() {
let conn = &get_connection(None);
Collection::create(conn, "test");
let bg1 = BlockGroup::create(conn, "test", None, "hg19");
assert_eq!(bg1.collection_name, "test");
assert_eq!(bg1.name, "hg19");
Sample::get_or_create(conn, "sample");
let bg2 = BlockGroup::create(conn, "test", Some("sample"), "hg19");
assert_eq!(bg2.collection_name, "test");
assert_eq!(bg2.name, "hg19");
assert_eq!(bg2.sample_name, Some("sample".to_string()));
assert_ne!(bg1.id, bg2.id);
}
#[test]
fn test_blockgroup_clone() {
let conn = &get_connection(None);
Collection::create(conn, "test");
let bg1 = BlockGroup::create(conn, "test", None, "hg19");
assert_eq!(bg1.collection_name, "test");
assert_eq!(bg1.name, "hg19");
Sample::get_or_create(conn, "sample");
let bg2 =
BlockGroup::get_or_create_sample_block_group(conn, "test", "sample", "hg19", None)
.unwrap();
assert_eq!(
BlockGroupEdge::edges_for_block_group(conn, bg1.id),
BlockGroupEdge::edges_for_block_group(conn, bg2)
);
}
#[test]
fn test_blockgroup_clone_passes_accessions() {
let conn = &get_connection(None);
let (bg_1, path) = setup_block_group(conn);
let mut path_cache = PathCache::new(conn);
PathCache::lookup(&mut path_cache, bg_1, path.name.clone());
let acc_1 = BlockGroup::add_accession(conn, &path, "test", 3, 7, &mut path_cache);
assert_eq!(
Accession::query(
conn,
"select * from accessions where name = ?1",
rusqlite::params!(SQLValue::from("test".to_string())),
),
vec![Accession {
id: acc_1.id,
name: "test".to_string(),
path_id: path.id,
parent_accession_id: None,
}]
);
Sample::get_or_create(conn, "sample2");
let _bg2 =
BlockGroup::get_or_create_sample_block_group(conn, "test", "sample2", "chr1", None)
.unwrap();
assert_eq!(
Accession::query(
conn,
"select * from accessions where name = ?1",
rusqlite::params!(SQLValue::from("test".to_string())),
),
vec![
Accession {
id: acc_1.id,
name: "test".to_string(),
path_id: path.id,
parent_accession_id: None,
},
Accession {
id: acc_1.id + 1,