@@ -31,6 +31,7 @@ def __init__(
3131 default_gap = None ,
3232 bp_per_texel = None ,
3333 autosome_prefix = None ,
34+ max_contig_length = 2_000_000_000 ,
3435 ):
3536 super ().__init__ (name , header , scaffolds , bp_per_texel )
3637 self .default_gap = default_gap
@@ -40,6 +41,7 @@ def __init__(
4041 self .assembly_stats : AssemblyStats = AssemblyStats ()
4142 if autosome_prefix :
4243 self .autosome_prefix = autosome_prefix
44+ self .max_contig_length = max_contig_length
4345
4446 @property
4547 def autosome_prefix (self ):
@@ -288,17 +290,21 @@ def __build_name_and_sort_assemblies(
288290 asm_key = hap
289291 else :
290292 asm_key = None
291- new_asm = assemblies .setdefault (
292- asm_key , Assembly (self .name , curated = curated )
293- )
294- new_asm .add_scaffold (scffld )
295- if scffld .rank == 1 :
296- # Add autosome to the ChrNamer
297- chr_namer .add_scaffold (asm_key , scffld )
298- elif scffld .rank == 2 :
299- chr_namer .add_chr_prefix (scffld , hap )
300- elif hap is not None :
301- chr_namer .add_haplotype_prefix (scffld , hap )
293+
294+ if not (new_asm := assemblies .get (asm_key )):
295+ new_asm = Assembly (self .name , curated = curated )
296+ assemblies [asm_key ] = new_asm
297+
298+ for cut_scffld in self .cut_scaffold_if_too_long (scffld ):
299+ new_asm .add_scaffold (cut_scffld )
300+
301+ if cut_scffld .rank == 1 :
302+ # Add autosome to the ChrNamer
303+ chr_namer .add_scaffold (asm_key , cut_scffld )
304+ elif cut_scffld .rank == 2 :
305+ chr_namer .add_chr_prefix (cut_scffld , hap )
306+ elif hap is not None :
307+ chr_namer .add_haplotype_prefix (cut_scffld , hap )
302308
303309 # ChrNamer names autosome chromosomes by size
304310 chr_namer .name_chromosomes ()
@@ -311,6 +317,113 @@ def __build_name_and_sort_assemblies(
311317
312318 return scaffolds , assemblies
313319
def cut_scaffold_if_too_long(self, scffld: Scaffold) -> list[Scaffold]:
    """Cut `scffld` at gaps into pieces sized to fit `self.max_contig_length`.

    The number of pieces is `ceil(scffld.length / self.max_contig_length)`.
    Each cut is made at the gap nearest to the ideal cut coordinate, and
    the gap itself is discarded, so the pieces are only approximately
    equal in length.

    Args:
        scffld: The scaffold to check and, if necessary, cut.

    Returns:
        `[scffld]` unchanged if no cut is needed, otherwise a list of new
        `Scaffold` objects named `<name>_1`, `<name>_2`, ... in order.

    Raises:
        ValueError: Propagated from
            `index_of_nearest_gap_to_ideal_cut_site()` if no suitable gap
            can be found.
    """
    whole = scffld.length
    pieces = math.ceil(whole / self.max_contig_length)

    # `<=` rather than `==` so that a zero-length scaffold (0 pieces) is
    # returned untouched instead of being renamed and reported as cut.
    if pieces <= 1:
        return [scffld]

    # If, for example, we need to cut the scaffold into 3 pieces, this
    # loop will take the first 1/3 off the scaffold, then 1/2 of what's
    # remaining.
    cut_parts = []
    to_cut = scffld
    for div in range(pieces, 1, -1):
        # The cut coordinate must be relative to what is left to cut, not
        # to the original scaffold, or later pieces come out uneven.
        cut_at = to_cut.length // div
        gap_i = self.index_of_nearest_gap_to_ideal_cut_site(to_cut, cut_at)
        rows = to_cut.rows
        # First part is everything up to, but not including, the gap
        cut_parts.append(Scaffold(to_cut.name, rows[:gap_i]))
        # Second part is everything after the gap
        to_cut = Scaffold(to_cut.name, rows[gap_i + 1 :])
    # Whatever remains after the last cut is the final piece
    cut_parts.append(to_cut)

    # Add suffix "_1", "_2" etc... to cut scaffolds
    for i, part in enumerate(cut_parts, start=1):
        part.name = f"{part.name}_{i}"

    # Format report of cuts made, padding names and lengths into columns
    whole_str = f"{whole:,d}"
    wl = len(whole_str)
    nl = len(scffld.name) + 4
    log.info(
        f"Cut {scffld.name:<{nl}} {whole:{wl},d} bp (including gaps) into:\n"
        + "".join(
            [f" {x.name:<{nl}} {x.length:{wl},d} bp\n" for x in cut_parts]
        )
    )

    return cut_parts
358+
def index_of_nearest_gap_to_ideal_cut_site(self, to_cut: Scaffold, cut_at: int):
    """Return the index in `to_cut.rows` of the gap nearest to `cut_at`.

    If the row overlapping `cut_at` is itself a `Gap`, its index is
    returned. Otherwise the rows are searched in both directions for a
    `Gap`, and the one whose start offset is closer to `cut_at` is chosen
    (the later gap wins a tie).

    Args:
        to_cut: The scaffold in which to look for a cut site.
        cut_at: The ideal cut coordinate within `to_cut`.

    Returns:
        The index of the chosen `Gap` row in `to_cut.rows`.

    Raises:
        ValueError: If no row overlaps `cut_at`, or if no gap is found on
            either side of the overlapping row.
    """
    # Make a temporary `IndexedAssembly` so that its search code can be
    # used to find the row which overlaps our cut coordinate.
    tmp_asm = IndexedAssembly(
        f"Temporary Assembly for cutting '{to_cut.name}' at {cut_at:_d}",
        scaffolds=[to_cut],
    )

    # Find the row which overlaps the ideal cut site
    found = tmp_asm.overlapping_indices_by_scaffold_start_end(
        to_cut, cut_at, cut_at
    )
    if not found:
        msg = (
            f"Failed to find an element at {cut_at:_d}"
            f" within '{to_cut.name}' of length {to_cut.length:_d}"
        )
        raise ValueError(msg)

    hit_i = found[0]
    rows = to_cut.rows
    if isinstance(rows[hit_i], Gap):
        # The ideal cut site falls inside a gap, so cut right there
        return hit_i

    # Not a gap, so look for the nearest one on each side. The backwards
    # scan stops before index 0, so the first row is never a candidate.
    prev_gap = next(
        (i for i in range(hit_i, 0, -1) if isinstance(rows[i], Gap)),
        None,
    )
    next_gap = next(
        (i for i in range(hit_i, len(rows)) if isinstance(rows[i], Gap)),
        None,
    )

    if prev_gap is None and next_gap is None:
        msg = (
            f"Failed to find gap before or after {cut_at:_d} in '{to_cut.name}'"
        )
        raise ValueError(msg)

    if prev_gap is None:
        return next_gap
    if next_gap is None:
        return prev_gap

    # Total length of the rows preceding each candidate gap
    offset_prev = sum(row.length for row in rows[:prev_gap])
    offset_next = sum(row.length for row in rows[:next_gap])

    # Choose the gap before or after, whichever is nearest to the ideal
    # cut point.
    if abs(cut_at - offset_prev) < abs(cut_at - offset_next):
        return prev_gap
    return next_gap
426+
314427 def scaffolds_fused_by_name (self ) -> list [Scaffold ]:
315428 gap = self .default_gap
316429 hap_name_scaffold : dict [tuple [str , str ], Scaffold ] = {}
0 commit comments