Skip to content

Commit 1593647

Browse files
committed
Breaking of long scaffolds, 2 Gbp by default
1 parent dd97d79 commit 1593647

File tree

5 files changed

+168
-24
lines changed

5 files changed

+168
-24
lines changed

src/tola/assembly/build_assembly.py

Lines changed: 124 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ def __init__(
3131
default_gap=None,
3232
bp_per_texel=None,
3333
autosome_prefix=None,
34+
max_contig_length=2_000_000_000,
3435
):
3536
super().__init__(name, header, scaffolds, bp_per_texel)
3637
self.default_gap = default_gap
@@ -40,6 +41,7 @@ def __init__(
4041
self.assembly_stats: AssemblyStats = AssemblyStats()
4142
if autosome_prefix:
4243
self.autosome_prefix = autosome_prefix
44+
self.max_contig_length = max_contig_length
4345

4446
@property
4547
def autosome_prefix(self):
@@ -288,17 +290,21 @@ def __build_name_and_sort_assemblies(
288290
asm_key = hap
289291
else:
290292
asm_key = None
291-
new_asm = assemblies.setdefault(
292-
asm_key, Assembly(self.name, curated=curated)
293-
)
294-
new_asm.add_scaffold(scffld)
295-
if scffld.rank == 1:
296-
# Add autosome to the ChrNamer
297-
chr_namer.add_scaffold(asm_key, scffld)
298-
elif scffld.rank == 2:
299-
chr_namer.add_chr_prefix(scffld, hap)
300-
elif hap is not None:
301-
chr_namer.add_haplotype_prefix(scffld, hap)
293+
294+
if not (new_asm := assemblies.get(asm_key)):
295+
new_asm = Assembly(self.name, curated=curated)
296+
assemblies[asm_key] = new_asm
297+
298+
for cut_scffld in self.cut_scaffold_if_too_long(scffld):
299+
new_asm.add_scaffold(cut_scffld)
300+
301+
if cut_scffld.rank == 1:
302+
# Add autosome to the ChrNamer
303+
chr_namer.add_scaffold(asm_key, cut_scffld)
304+
elif cut_scffld.rank == 2:
305+
chr_namer.add_chr_prefix(cut_scffld, hap)
306+
elif hap is not None:
307+
chr_namer.add_haplotype_prefix(cut_scffld, hap)
302308

303309
# ChrNamer names autosome chromosomes by size
304310
chr_namer.name_chromosomes()
@@ -311,6 +317,113 @@ def __build_name_and_sort_assemblies(
311317

312318
return scaffolds, assemblies
313319

320+
def cut_scaffold_if_too_long(self, scffld: Scaffold) -> list[Scaffold]:
321+
whole = scffld.length
322+
pieces = math.ceil(whole / self.max_contig_length)
323+
324+
if pieces == 1:
325+
return [scffld]
326+
327+
# If, for example, we need to cut the scaffold into 3 pieces, this
328+
# loop will take the first 1/3 off the scaffold, then 1/2 of what's
329+
# remaining.
330+
cut_parts = []
331+
to_cut = scffld
332+
for div in range(pieces, 1, -1):
333+
cut_at = whole // div
334+
gap_i = self.index_of_nearest_gap_to_ideal_cut_site(to_cut, cut_at)
335+
rows = to_cut.rows
336+
# First part is everything up to, but not including, the gap
337+
cut_parts.append(Scaffold(to_cut.name, rows[:gap_i]))
338+
# Second part is everything after the gap
339+
to_cut = Scaffold(to_cut.name, rows[gap_i + 1 :])
340+
cut_parts.append(to_cut)
341+
342+
# Add suffix "_1", "_2" etc... to cut scaffolds
343+
for i, part in enumerate(cut_parts):
344+
part.name = f"{part.name}_{i + 1}"
345+
346+
# Format report of cuts made
347+
whole_str = f"{whole:,d}"
348+
wl = len(whole_str)
349+
nl = len(scffld.name) + 4
350+
log.info(
351+
f"Cut {scffld.name:<{nl}} {whole:{wl},d} bp (including gaps) into:\n"
352+
+ "".join(
353+
[f" {x.name:<{nl}} {x.length:{wl},d} bp\n" for x in cut_parts]
354+
)
355+
)
356+
357+
return cut_parts
358+
359+
def index_of_nearest_gap_to_ideal_cut_site(self, to_cut: Scaffold, cut_at: int):
360+
# Make a temporary `IndexedAssembly` to use it's code to search for an
361+
# row which overlaps out cut coordiante.
362+
idx_asm = IndexedAssembly(
363+
f"Temporary Assembly for cutting '{to_cut.name}' at {cut_at:_d}",
364+
scaffolds=[to_cut],
365+
)
366+
367+
# Find the row which overlaps the ideal cut site
368+
ovr_i_j = idx_asm.overlapping_indices_by_scaffold_start_end(
369+
to_cut, cut_at, cut_at
370+
)
371+
if not ovr_i_j:
372+
msg = (
373+
f"Failed to find an element at {cut_at:_d}"
374+
f" within '{to_cut.name}' of length {to_cut.length:_d}"
375+
)
376+
raise ValueError(msg)
377+
378+
ovr_i = ovr_i_j[0]
379+
rows = to_cut.rows
380+
ele = rows[ovr_i]
381+
if not isinstance(ele, Gap):
382+
# This isn't a gap, so we need to find the nearest
383+
gap_i_before = None
384+
for i in range(ovr_i, 0, -1):
385+
if isinstance(rows[i], Gap):
386+
gap_i_before = i
387+
break
388+
389+
gap_i_after = None
390+
for i in range(ovr_i, len(rows)):
391+
if isinstance(rows[i], Gap):
392+
gap_i_after = i
393+
break
394+
395+
if gap_i_before is None and gap_i_after is None:
396+
msg = (
397+
f"Failed to find gap before or after {cut_at:_d} in '{to_cut.name}'"
398+
)
399+
raise ValueError(msg)
400+
401+
if gap_i_before is None:
402+
ovr_i = gap_i_after
403+
elif gap_i_after is None:
404+
ovr_i = gap_i_before
405+
else:
406+
length_before = 0
407+
length_after = 0
408+
for i, this_row in enumerate(rows):
409+
if i < gap_i_before:
410+
length_before += this_row.length
411+
412+
if i < gap_i_after:
413+
length_after += this_row.length
414+
else:
415+
break
416+
417+
# Choose the gap before or after, whichever is nearest to the
418+
# ideal cut point.
419+
ovr_i = (
420+
gap_i_before
421+
if abs(cut_at - length_before) < abs(cut_at - length_after)
422+
else gap_i_after
423+
)
424+
425+
return ovr_i
426+
314427
def scaffolds_fused_by_name(self) -> list[Scaffold]:
315428
gap = self.default_gap
316429
hap_name_scaffold: dict[tuple[str, str], Scaffold] = {}

src/tola/assembly/indexed_assembly.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,10 @@ def find_overlaps(self, bait: Fragment) -> OverlapResult | None:
6464
raise ValueError(msg)
6565

6666
idx = self.overlap_index_by_name(scffld.name)
67-
i_ovr, j_ovr = self.get_overlap_fragment_indices(idx, bait.start, bait.end)
68-
if i_ovr is None or j_ovr is None:
67+
rng = self.__get_overlap_fragment_indices(idx, bait.start, bait.end)
68+
if rng is None:
6969
return None
70+
i_ovr, j_ovr = rng
7071

7172
# Walk start and end pointers back to ignore Gaps on the ends
7273
while isinstance(scffld.rows[i_ovr], Gap):
@@ -87,9 +88,9 @@ def find_overlaps(self, bait: Fragment) -> OverlapResult | None:
8788
rows=overlaps,
8889
)
8990

90-
def get_overlap_fragment_indices(
91+
def __get_overlap_fragment_indices(
9192
self, idx: list[int], start: int, end: int
92-
) -> tuple[int | None, int | None]:
93+
) -> tuple[int, int] | None:
9394
"""
9495
Perfoms a binary search through the index `idx` (a list of
9596
`Fragment | Gap` end positions), returning the start and end indices
@@ -116,7 +117,7 @@ def get_overlap_fragment_indices(
116117
break
117118

118119
if ovr is None:
119-
return None, None
120+
return None
120121

121122
# The span of overlapping entries may extend to the left or right
122123
# of "ovr"
@@ -132,3 +133,12 @@ def get_overlap_fragment_indices(
132133
j_ovr = j
133134

134135
return i_ovr, j_ovr
136+
137+
def overlapping_indices_by_scaffold_start_end(
138+
self, scaffold: Scaffold, start: int, end: int
139+
) -> tuple[int, int] | None:
140+
"""
141+
Get the start and end index
142+
"""
143+
idx = self.overlap_index_by_name(scaffold.name)
144+
return self.__get_overlap_fragment_indices(idx, start, end)

src/tola/assembly/naming_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,7 @@ def build_groups(self):
396396
# },
397397
# }
398398
# )
399+
399400
group.add_scaffold_to_haplotype(haplotype, scffld)
400401
last_haplotype = haplotype
401402
last_orig = orig

src/tola/assembly/scaffold.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ class Scaffold:
1010
def __init__(
1111
self,
1212
name,
13-
rows=None,
13+
rows: list[Fragment | Gap] | None = None,
1414
tag=None,
1515
haplotype=None,
1616
rank=0,
@@ -21,7 +21,7 @@ def __init__(
2121
if rows:
2222
self.rows = [*rows]
2323
else:
24-
self.rows = []
24+
self.rows: list[Fragment | Gap] = []
2525
self.tag: str | None = tag
2626
self.haplotype: str | None = haplotype
2727
self.rank: int = rank
@@ -57,7 +57,7 @@ def __str__(self):
5757
txt.write(f" {row.length:14_d} {row}\n")
5858
return txt.getvalue()
5959

60-
def add_row(self, row):
60+
def add_row(self, row: Fragment | Gap):
6161
self.rows.append(row)
6262

6363
@property

src/tola/assembly/scripts/pretext_to_asm.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import re
44
import sys
55
from pathlib import Path
6+
from typing import no_type_check
67

78
import click
89
import yaml
@@ -187,6 +188,18 @@ def ul(txt):
187188
to 'map-order' if the '--keep-map-order' option is given.
188189
""",
189190
)
191+
@click.option(
192+
"--max-contig-length",
193+
"max_contig_length",
194+
type=int,
195+
help=f"""
196+
Maximum length for a single contig. If a contig exceeds this size, it
197+
will be broken at gaps into a number of pieces such that each is less
198+
than this number. The pieces will have the suffixes '_1', '_2'
199+
{it("etc...")} added to their names. [default: 2 Gbp]
200+
""",
201+
default=2_000_000_000,
202+
)
190203
def cli(
191204
assembly_file,
192205
pretext_file,
@@ -197,6 +210,7 @@ def cli(
197210
write_log,
198211
keep_map_order,
199212
default_asm_name,
213+
max_contig_length,
200214
):
201215
logfile = setup_logging(log_level, output_file, write_log, clobber)
202216

@@ -218,6 +232,7 @@ def cli(
218232
"stdout",
219233
default_gap=Gap(200, "scaffold"),
220234
autosome_prefix=autosome_prefix,
235+
max_contig_length=max_contig_length,
221236
)
222237
build_asm.remap_to_input_assembly(prtxt_asm, input_asm)
223238

@@ -306,9 +321,9 @@ def name_assemblies(
306321
# has been curated. One of the painted chromosomes in the curated
307322
# haplotype has been tagged with 'Primary'
308323
if asm_dict.get("Primary"):
309-
# <ToLID>.1.primary.curated.fa <- Sequence from "Hap1" tagged scaffolds)
324+
# <ToLID>.1.primary.curated.fa <- Sequence from "Hap1" tagged scaffolds
310325
# <ToLID>.1.primary.chromosome.list.csv
311-
# <ToLID>.1.all_haplotigs.curated.fa <- Sequence from "Hap2" tagged scaffolds)
326+
# <ToLID>.1.all_haplotigs.curated.fa <- Sequence from "Hap2" tagged scaffolds
312327
other_asm = []
313328
for asm_key, asm in asm_dict.items():
314329
if asm_key == "Primary":
@@ -332,8 +347,8 @@ def name_assemblies(
332347
elif asm_dict.get(None):
333348
# <ToLID>.1.primary.curated.fa
334349
# <ToLID>.1.primary.chromosome.list.csv
335-
# <ToLID>.1.additional_haplotigs.curated.fa <- Sequence from "Haplotig"
336-
# tagged scaffolds
350+
# <ToLID>.1.additional_haplotigs.curated.fa <- Sequence from "Haplotig"
351+
# tagged scaffolds
337352
other_asm = []
338353
for asm_key, asm in asm_dict.items():
339354
if asm_key is None:
@@ -483,7 +498,12 @@ def write_chr_csv_files(
483498
csv_fh.write(chr_names)
484499

485500

486-
def write_info_yaml(output_file, stats: AssemblyStats, out_assemblies: AssemblyDict, clobber):
501+
def write_info_yaml(
502+
output_file,
503+
stats: AssemblyStats,
504+
out_assemblies: AssemblyDict,
505+
clobber,
506+
):
487507
asm_stats = stats.per_assembly_stats
488508
info = {"assemblies": asm_stats}
489509
if len(asm_stats) > 1:

0 commit comments

Comments
 (0)